Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
if sparse.issparse(X):
raise TypeError("k-modes does not support sparse data.")
X = check_array(X, dtype=None)
# Convert the categorical values in X to integers for speed.
# Based on the unique values in X, we can make a mapping to achieve this.
X, enc_map = encode_features(X)
n_points, n_attrs = X.shape
assert n_clusters <= n_points, "Cannot have more clusters ({}) " \
"than data points ({}).".format(n_clusters, n_points)
# If there are at least as many clusters as unique rows, reduce n_clusters
# to the number of unique rows, use those rows as the initial values,
# and skip iteration.
unique = get_unique_rows(X)
n_unique = unique.shape[0]
if n_unique <= n_clusters:
max_iter = 0
n_init = 1
n_clusters = n_unique
init = unique
results = []
seeds = random_state.randint(np.iinfo(np.int32).max, size=n_init)
if n_jobs == 1:
for init_no in range(n_init):
results.append(k_modes_single(X, n_clusters, n_points, n_attrs, max_iter,
dissim, init, init_no, verbose, seeds[init_no]))
else:
results = Parallel(n_jobs=n_jobs, verbose=0)(
delayed(k_modes_single)(X, n_clusters, n_points, n_attrs, max_iter,
ncatattrs = len(categorical)
nnumattrs = X.shape[1] - ncatattrs
n_points = X.shape[0]
assert n_clusters <= n_points, "Cannot have more clusters ({}) " \
"than data points ({}).".format(n_clusters, n_points)
Xnum, Xcat = _split_num_cat(X, categorical)
Xnum, Xcat = check_array(Xnum), check_array(Xcat, dtype=None)
# Convert the categorical values in Xcat to integers for speed.
# Based on the unique values in Xcat, we can make a mapping to achieve this.
Xcat, enc_map = encode_features(Xcat)
# If there are at least as many clusters as unique rows, reduce n_clusters
# to the number of unique rows, use those rows as the initial values,
# and skip iteration.
unique = get_unique_rows(X)
n_unique = unique.shape[0]
if n_unique <= n_clusters:
max_iter = 0
n_init = 1
n_clusters = n_unique
init = list(_split_num_cat(unique, categorical))
init[1], _ = encode_features(init[1], enc_map)
# Estimate a good value for gamma, which determines the weighting of
# categorical values in clusters (see Huang [1997]).
if gamma is None:
gamma = 0.5 * Xnum.std()
results = []
seeds = random_state.randint(np.iinfo(np.int32).max, size=n_init)
if n_jobs == 1: