Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def _equal_arrays(x1, x2):
if sp.issparse(x1):
return np.allclose(x1.toarray(), x2.toarray())
else:
return np.allclose(x1, x2)
def fit(self, X, y, sample_weight=None, check_input=True,
X_idx_sorted=None):
random_state = check_random_state(self.random_state)
if check_input:
X = check_array(X, dtype=DTYPE, accept_sparse="csc")
y = check_array(y, ensure_2d=False, dtype=None)
if issparse(X):
X.sort_indices()
if X.indices.dtype != np.intc or X.indptr.dtype != np.intc:
raise ValueError("No support for np.int64 index based "
"sparse matrices")
# Determine output settings
n_samples, self.n_features_ = X.shape
is_classification = is_classifier(self)
y = np.atleast_1d(y)
expanded_class_weight = None
if y.ndim == 1:
# reshape is necessary to preserve the data contiguity against vs
# [:, np.newaxis] that does not.
----------
X : array-like, shape (n_samples, n_features)
Training data, where n_samples is the number of samples and
n_features is the number of features.
check_input : bool
Run check_array on X.
y : Ignored
Returns
-------
self : object
Returns the instance itself.
"""
if check_input:
if sparse.issparse(X):
raise TypeError(
"IncrementalPCA.partial_fit does not support "
"sparse input. Either convert data to dense "
"or use IncrementalPCA.fit to do so in batches.")
X = check_array(X, copy=self.copy, dtype=[np.float64, np.float32])
n_samples, n_features = X.shape
if not hasattr(self, 'components_'):
self.components_ = None
if self.n_components is None:
if self.components_ is None:
self.n_components_ = min(n_samples, n_features)
else:
self.n_components_ = self.components_.shape[0]
elif not 1 <= self.n_components <= n_features:
raise ValueError("n_components=%r invalid for n_features=%d, need "
This method can be used in two ways:
* On an unfitted model in which case the model is initialized and trained on `X`.
* On an already fitted model in which case the model is **updated** by `X`.
Parameters
----------
X : {iterable of iterable of (int, int), scipy.sparse matrix}
A collection of documents in BOW format used for training the model.
Returns
-------
:class:`~gensim.sklearn_api.ldamodel.LdaTransformer`
The trained model.
"""
if sparse.issparse(X):
X = matutils.Sparse2Corpus(sparse=X, documents_columns=False)
if self.gensim_model is None:
self.gensim_model = models.LdaModel(
num_topics=self.num_topics, id2word=self.id2word,
chunksize=self.chunksize, passes=self.passes, update_every=self.update_every,
alpha=self.alpha, eta=self.eta, decay=self.decay, offset=self.offset,
eval_every=self.eval_every, iterations=self.iterations, gamma_threshold=self.gamma_threshold,
minimum_probability=self.minimum_probability, random_state=self.random_state,
dtype=self.dtype
)
self.gensim_model.update(corpus=X)
return self
def toNLTK(_mat, prefix=2):
    """Walk the columns of ``_mat`` and build a feature dict for each one.

    For every column ``i`` the non-zero entries are collected into a
    ``{row_index: value}`` dict, and the article whose ``id_int`` equals
    ``docids[i]`` is looked up in ``arts``.

    NOTE(review): ``docids`` and ``arts`` are not defined inside this function
    and are presumably module-level globals -- confirm. The visible listing
    stops after the article check, so ``result``, ``features`` and ``prefix``
    are not consumed in the part shown here and nothing is returned.

    Parameters
    ----------
    _mat : sparse matrix or array
        Matrix whose columns are processed one at a time; sparse input is
        converted to CSC first.
    prefix : int, optional
        Not used by the visible part of the function.

    Raises
    ------
    Exception
        If the number of articles matching a column's document id is not
        exactly one.
    """
    import mscs

    # CSC makes the per-column slicing below cheap for sparse input.
    is_sparse = scipy.sparse.issparse(_mat)
    mat = _mat.tocsc() if is_sparse else _mat

    result = []
    # ``range`` instead of ``xrange`` so the loop also runs on Python 3.
    for i in range(mat.shape[1]):
        column = mat[:, i].toarray() if is_sparse else mat[:, i]
        nnind = column.nonzero()[0]
        nnvals = column.take(nnind)
        features = dict(zip(nnind, nnvals))

        doc_id = docids[i]
        okarts = [a for a in arts if a.id_int == doc_id]
        if len(okarts) != 1:
            # The format string has two placeholders; supply both values so
            # the intended message is raised rather than a TypeError.
            raise Exception('%i articles with id=%s'
                            % (len(okarts), repr(doc_id)))
def _inverse_binarize_multiclass(y, classes):
"""Inverse label binarization transformation for multiclass.
Multiclass uses the maximal score instead of a threshold.

NOTE(review): only the beginning of the sparse branch is visible in this
listing; the dense branch and the returned value are not shown here.
"""
classes = np.asarray(classes)
if sp.issparse(y):
# Find the argmax for each row in y where y is a CSR matrix
y = y.tocsr()
n_samples, n_outputs = y.shape
outputs = np.arange(n_outputs)
# sparse_min_max is defined elsewhere; element [1] is presumably the
# per-row maximum along axis 1 -- TODO confirm against its definition.
row_max = sparse_min_max(y, 1)[1]
# Number of stored entries per row, from consecutive indptr differences.
row_nnz = np.diff(y.indptr)
# Repeat each row's maximum once per stored entry so it aligns with y.data.
y_data_repeated_max = np.repeat(row_max, row_nnz)
# picks out all indices obtaining the maximum per row
y_i_all_argmax = np.flatnonzero(y_data_repeated_max == y.data)
# For corner case where last row has a max of 0
if row_max[-1] == 0:
# Append an index one past the end of y.data.
y_i_all_argmax = np.append(y_i_all_argmax, [len(y.data)])
def _computeMatrixReward(self, reward, transition):
if _sp.issparse(reward):
# An approach like this might be more memory efficeint
# reward.data = reward.data * transition[reward.nonzero()]
# return reward.sum(1).A.reshape(self.S)
# but doesn't work as it is.
return reward.multiply(transition).sum(1).A.reshape(self.S)
elif _sp.issparse(transition):
return transition.multiply(reward).sum(1).A.reshape(self.S)
else:
return _np.multiply(transition, reward).sum(1).reshape(self.S)
def _convert_vec(vec1, vec2, num_features=None):
"""Bring ``vec1`` and ``vec2`` into a dense form.

SciPy sparse inputs are densified with ``toarray()``. When both vectors pass
``isbow`` they are expanded with ``sparse2full`` -- to ``num_features``
entries if that is given, otherwise to ``max(len(vec1), len(vec2))`` -- and
returned as a ``(dense1, dense2)`` pair.

NOTE(review): the non-BOW branch is cut off in this listing right after
``vec1`` is unwrapped; how ``vec2`` is treated and what is returned there is
not visible here. ``isbow`` and ``sparse2full`` are defined elsewhere.
"""
if scipy.sparse.issparse(vec1):
vec1 = vec1.toarray()
if scipy.sparse.issparse(vec2):
vec2 = vec2.toarray() # converted both the vectors to dense in case they were in sparse matrix
if isbow(vec1) and isbow(vec2): # if they are in bag of words format we make it dense
if num_features is not None: # if not None, make as large as the documents drawing from
dense1 = sparse2full(vec1, num_features)
dense2 = sparse2full(vec2, num_features)
return dense1, dense2
else:
# No explicit size given: use the longer of the two inputs as the length.
max_len = max(len(vec1), len(vec2))
dense1 = sparse2full(vec1, max_len)
dense2 = sparse2full(vec2, max_len)
return dense1, dense2
else:
# this conversion is made because if it is not in bow format, it might be a list within a list after conversion
# the scipy implementation of Kullback fails in such a case so we pick up only the nested list.
if len(vec1) == 1:
vec1 = vec1[0]
def _create_lookups(self, X):
"""
Create document and term lookups for all tokens.
"""
docs, terms = np.nonzero(X)
if issparse(X):
x = np.array(X[docs, terms])[0]
else:
x = X[docs, terms]
doc_lookup = np.ascontiguousarray(np.repeat(docs, x), dtype=np.intc)
term_lookup = np.ascontiguousarray(np.repeat(terms, x), dtype=np.intc)
return doc_lookup, term_lookup