Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
#!/usr/bin/env python
import numpy as np
from kmodes.kmodes import KModes
# Reproduce results on the small soybean data set.
# x: every column except the last one, parsed as integers.
# y: column 35 parsed as strings (presumably the class label -- confirm
#    against the CSV layout); it is not used further in this snippet.
x = np.genfromtxt('soybean.csv', dtype=int, delimiter=',')[:, :-1]
y = np.genfromtxt('soybean.csv', dtype=str, delimiter=',', usecols=(35, ))

# --- k-modes with Huang initialisation ---
kmodes_huang = KModes(n_clusters=4, init='Huang', verbose=1)
kmodes_huang.fit(x)

# Print cluster centroids of the trained model.
print('k-modes (Huang) centroids:')
print(kmodes_huang.cluster_centroids_)
# Print training statistics
print('Final training cost: {}'.format(kmodes_huang.cost_))
print('Training iterations: {}'.format(kmodes_huang.n_iter_))

# --- k-modes with Cao initialisation ---
kmodes_cao = KModes(n_clusters=4, init='Cao', verbose=1)
kmodes_cao.fit(x)

# Print cluster centroids of the trained model.
print('k-modes (Cao) centroids:')
print(kmodes_cao.cluster_centroids_)
# Print training statistics, mirroring what is reported for the Huang run.
print('Final training cost: {}'.format(kmodes_cao.cost_))
print('Training iterations: {}'.format(kmodes_cao.n_iter_))
def _kmodes(k, n_init, n_jobs, seed):
    """Fit a Huang-initialised KModes model and discard the result.

    The model is fitted on the first ``N_kmodes`` rows of the module-level
    ``data`` array. Nothing is returned.

    Parameters
    ----------
    k : forwarded as ``n_clusters``
    n_init : forwarded as ``n_init``
    n_jobs : forwarded as ``n_jobs``
    seed : forwarded as ``random_state``
    """
    model = KModes(
        n_clusters=k,
        init='Huang',
        n_init=n_init,
        n_jobs=n_jobs,
        random_state=seed,
    )
    subset = data[:N_kmodes, :]
    model.fit(subset)
# Reproduce results on the small soybean data set.
# x: every column except the last one, parsed as integers.
# y: column 35 parsed as strings; the last character of each entry is read
#    below as a 1-based class number.
x = np.genfromtxt('soybean.csv', dtype=int, delimiter=',')[:, :-1]
y = np.genfromtxt('soybean.csv', dtype=str, delimiter=',', usecols=(35, ))

# --- k-modes with Huang initialisation ---
kmodes_huang = KModes(n_clusters=4, init='Huang', verbose=1)
kmodes_huang.fit(x)

# Print cluster centroids of the trained model.
print('k-modes (Huang) centroids:')
print(kmodes_huang.cluster_centroids_)
# Print training statistics
print('Final training cost: {}'.format(kmodes_huang.cost_))
print('Training iterations: {}'.format(kmodes_huang.n_iter_))

# --- k-modes with Cao initialisation ---
kmodes_cao = KModes(n_clusters=4, init='Cao', verbose=1)
kmodes_cao.fit(x)

# Print cluster centroids of the trained model.
print('k-modes (Cao) centroids:')
print(kmodes_cao.cluster_centroids_)
# Print training statistics
print('Final training cost: {}'.format(kmodes_cao.cost_))
print('Training iterations: {}'.format(kmodes_cao.n_iter_))

print('Results tables:')
for result in (kmodes_huang, kmodes_cao):
    # Contingency table: rows are the class number taken from the label's
    # last character, columns are the cluster index assigned by the model.
    classtable = np.zeros((4, 4), dtype=int)
    for label, cluster in zip(y, result.labels_):
        classtable[int(label[-1]) - 1, cluster] += 1

    print("\n")
    # Emit the table itself; without this the counts above are never shown.
    print("    | Cl. 1 | Cl. 2 | Cl. 3 | Cl. 4 |")
    print("----|-------|-------|-------|-------|")
    for ii in range(4):
        # tolist() yields plain Python ints so the width spec formats cleanly.
        prargs = [ii + 1] + classtable[ii, :].tolist()
        print(" D{0} |    {1:>2} |    {2:>2} |    {3:>2} |    {4:>2} |".format(*prargs))
def _kprototypes(k, n_init, n_jobs, seed):
    """Fit a Huang-initialised KPrototypes model and discard the result.

    The model is fitted on the first ``N_kproto`` rows of the module-level
    ``data`` array, with column indices ``M - MN`` up to (but excluding)
    ``M`` passed as the categorical columns. Nothing is returned.

    Parameters
    ----------
    k : forwarded as ``n_clusters``
    n_init : forwarded as ``n_init``
    n_jobs : forwarded as ``n_jobs``
    seed : forwarded as ``random_state``
    """
    model = KPrototypes(
        n_clusters=k,
        init='Huang',
        n_init=n_init,
        n_jobs=n_jobs,
        random_state=seed,
    )
    subset = data[:N_kproto, :]
    categorical_columns = list(range(M - MN, M))
    model.fit(subset, categorical=categorical_columns)
def kproto(self):
    """Cluster ``self.data`` with k-prototypes and attach the labels.

    The number of clusters comes from ``self.silouhette_analysis`` called
    with ``prototype=True``. A KPrototypes model is fitted on the data,
    then used to predict a label per row; the labels are stored under the
    ``'labels'`` key of the data object.

    Returns
    -------
    The labelled data object, which is also stored on
    ``self.data_clustered``.
    """
    # TODO- solve clustering issue with PCA + K-means
    # NOTE(review): ``frame`` is the same object as ``self.data`` (no copy),
    # so the 'labels' assignment below also modifies ``self.data`` -- confirm
    # this is intended.
    frame = self.data
    n_clusters = self.silouhette_analysis(frame, prototype=True)

    model = KPrototypes(n_clusters=n_clusters)
    model.fit(frame, categorical=self.categorical_features)

    frame['labels'] = model.predict(
        frame, categorical=self.categorical_features)

    self.data_clustered = frame
    return frame
def fit(self, X, y=None, **kwargs):
    """Compute k-modes clustering.

    Parameters
    ----------
    X : array-like, shape=[n_samples, n_features]
    y : not used by this method
    **kwargs : accepted but not used by this method

    Returns
    -------
    self, with ``labels_``, ``cost_``, ``n_iter_``, ``epoch_costs_`` and the
    private encoded-centroid attributes set from the ``k_modes`` result.
    """
    X = pandas_to_numpy(X)
    rng = check_random_state(self.random_state)

    # k_modes takes its arguments positionally; keep this order intact.
    outcome = k_modes(
        X,
        self.n_clusters,
        self.max_iter,
        self.cat_dissim,
        self.init,
        self.n_init,
        self.verbose,
        rng,
        self.n_jobs,
    )

    (
        self._enc_cluster_centroids,
        self._enc_map,
        self.labels_,
        self.cost_,
        self.n_iter_,
        self.epoch_costs_,
    ) = outcome
    return self
def fit(self, X, y=None, categorical=None):
"""Compute k-prototypes clustering.
Parameters
----------
X : array-like, shape=[n_samples, n_features]
categorical : Index of columns that contain categorical data
"""
if categorical is not None:
assert isinstance(categorical, (int, list, tuple)), "The 'categorical' \
argument needs to be an integer with the index of the categorical \
column in your data, or a list or tuple of several of them, \
but it is a {}.".format(type(categorical))
X = pandas_to_numpy(X)
random_state = check_random_state(self.random_state)
# If self.gamma is None, gamma will be automatically determined from
# the data. The function below returns its value.
self._enc_cluster_centroids, self._enc_map, self.labels_, self.cost_, \
self.n_iter_, self.epoch_costs_, self.gamma = k_prototypes(
X,
categorical,
self.n_clusters,
self.max_iter,
self.num_dissim,
self.cat_dissim,
self.gamma,
self.init,
self.n_init,
self.verbose,
if init_tries == MAX_INIT_TRIES:
# Could not get rid of empty clusters. Randomly
# initialize instead.
init = 'random'
elif init_tries == RAISE_INIT_TRIES:
raise ValueError(
"Clustering algorithm could not initialize. "
"Consider assigning the initial clusters manually."
)
# Perform an initial centroid update.
for ik in range(n_clusters):
for iattr in range(nnumattrs):
centroids[0][ik, iattr] = cl_attr_sum[ik, iattr] / cl_memb_sum[ik]
for iattr in range(ncatattrs):
centroids[1][ik, iattr] = get_max_value_key(cl_attr_freq[ik][iattr])
# _____ ITERATION _____
if verbose:
print("Starting iterations...")
itr = 0
labels = None
converged = False
_, cost = _labels_cost(Xnum, Xcat, centroids,
num_dissim, cat_dissim, gamma, membship)
epoch_costs = [cost]
while itr <= max_iter and not converged:
itr += 1
centroids, moves = _k_prototypes_iter(Xnum, Xcat, centroids,
cl_attr_sum, cl_memb_sum, cl_attr_freq,
current_attribute_value_freq = to_attr_counts[curattr]
current_centroid_value = centroids[to_clust][iattr]
current_centroid_freq = to_attr_counts[current_centroid_value]
if current_centroid_freq < current_attribute_value_freq:
# We have incremented this value to the new mode. Update the centroid.
centroids[to_clust][iattr] = curattr
# Decrement the attribute count for the old "from" cluster
from_attr_counts[curattr] -= 1
old_centroid_value = centroids[from_clust][iattr]
if old_centroid_value == curattr:
# We have just removed a count from the old centroid value. We need to
# recalculate the centroid as it may no longer be the maximum
centroids[from_clust][iattr] = get_max_value_key(from_attr_counts)
return cl_attr_freq, membship, centroids
for _ in range(n_clusters)]
for ipoint, curpoint in enumerate(X):
# Initial assignment to clusters
clust = np.argmin(dissim(centroids, curpoint, X=X, membship=membship))
membship[clust, ipoint] = 1
# Count attribute values per cluster.
for iattr, curattr in enumerate(curpoint):
cl_attr_freq[clust][iattr][curattr] += 1
# Perform an initial centroid update.
for ik in range(n_clusters):
for iattr in range(n_attrs):
if sum(membship[ik]) == 0:
# Empty centroid, choose randomly
centroids[ik, iattr] = random_state.choice(X[:, iattr])
else:
centroids[ik, iattr] = get_max_value_key(cl_attr_freq[ik][iattr])
# _____ ITERATION _____
if verbose:
print("Starting iterations...")
itr = 0
labels = None
converged = False
_, cost = _labels_cost(X, centroids, dissim, membship)
epoch_costs = [cost]
while itr <= max_iter and not converged:
itr += 1
centroids, moves = _k_modes_iter(
X,
centroids,