    X_tr = pca.transform(X)
    print("INFO: X_tr.shape=", X_tr.shape)
    if len(X_tr[0]) < 3:
        print("ERROR: Dataset has dimension less than 3")
    elif n_components < 3:
        print("WARNING: set n_components to 3 for 3D graph")
        n_components = 3
    X_reduced = X_tr[:, 0:n_components]
    k = n_components
    print("INFO: sklearn_PCA_transform: n_components =", n_components, ", threshold=", threshold)
    print("RESULT: PCA ratio_vec=", ratio_vec)
elif k > 0:  # by n_components ===============
    pca = PCA(n_components=k)
    pca.fit(X)
    X_reduced = pca.transform(X)
    print("INFO: PCA n_components =", k)

if X_reduced is not None:
    print("INFO: sklearn_PCA_transform: X_reduced.shape=", X_reduced.shape)
return (X_reduced, k, pca)
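# Hedged usage sketch (assumption): `ratio_vec` and `threshold` are not defined in the
# excerpt above; one plausible reading is that `ratio_vec` holds cumulative explained
# variance ratios and `threshold` picks how many components to keep. A minimal
# standalone illustration of that idea, with made-up data:
import numpy as np
from sklearn.decomposition import PCA

X_demo = np.random.rand(200, 10)                          # hypothetical data matrix
ratio_vec_demo = np.cumsum(PCA().fit(X_demo).explained_variance_ratio_)
k_demo = int(np.searchsorted(ratio_vec_demo, 0.95)) + 1   # smallest k reaching 95% variance
X_reduced_demo = PCA(n_components=k_demo).fit_transform(X_demo)
print("INFO: kept", k_demo, "components, X_reduced shape:", X_reduced_demo.shape)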
    scale_covariates:
    """
    # Initialise mean of the Q distribution
    if qmean is not None:
        if isinstance(qmean, str):
            if qmean == "random":  # Random initialisation of latent variables
                qmean = stats.norm.rvs(loc=0, scale=1, size=(self.N, self.K))
            elif qmean == "orthogonal":  # Latent variables are initialised randomly but ensuring orthogonality
                pca = sklearn.decomposition.PCA(n_components=self.K, copy=True, whiten=True)
                pca.fit(stats.norm.rvs(loc=0, scale=1, size=(self.N, 9999)).T)
                qmean = pca.components_.T
            elif qmean == "pca":  # Latent variables are initialised from PCA on the concatenated matrix
                pca = sklearn.decomposition.PCA(n_components=self.K, copy=True, whiten=True)
                pca.fit(s.concatenate(self.data, axis=1).T)
                qmean = pca.components_.T
        elif isinstance(qmean, s.ndarray):
            assert qmean.shape == (self.N, self.K)
        elif isinstance(qmean, (int, float)):
            qmean = s.ones((self.N, self.K)) * qmean
        else:
            print("Wrong initialisation for Z")
            exit()

    # Add covariates
    if covariates is not None:
        assert scale_covariates is not None, "If you use covariates also define data_opts['scale_covariates']"
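# Hedged standalone sketch (not the original class): the "orthogonal" branch above draws
# a random Gaussian matrix and keeps its top-K principal axes, so the initial latent
# factors are mutually orthogonal. A minimal reproduction with made-up sizes:
import numpy as np
from scipy import stats
from sklearn.decomposition import PCA

N_demo, K_demo = 100, 5                                   # hypothetical sample/factor counts
pca_init = PCA(n_components=K_demo, copy=True, whiten=True)
pca_init.fit(stats.norm.rvs(loc=0, scale=1, size=(N_demo, 9999)).T)
qmean_demo = pca_init.components_.T                       # shape (N_demo, K_demo)
print(np.round(qmean_demo.T @ qmean_demo, 3))             # close to the identity matrix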
# Imports assumed by this example (not shown in the excerpt)
import numpy as np
import seaborn as sns
from sklearn import decomposition
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=100,
                           n_features=3,
                           n_informative=2,
                           n_redundant=1,
                           n_classes=2,
                           weights=[.4, .6])
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.1,
                                                    random_state=1)
corr = np.corrcoef(X_train, rowvar=False)
sns.heatmap(corr)

plot_data_3d(X_train, y_train)

lpca = decomposition.PCA(2)
pca_data = lpca.fit_transform(X_train)
print(lpca.explained_variance_ratio_)
plot_data_2d(pca_data, y_train, ['PC1', 'PC2'])
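# plot_data_2d and plot_data_3d are not defined in the excerpt; a minimal sketch of what
# such helpers could look like (matplotlib-based, names and signatures assumed from the
# call sites above):
import matplotlib.pyplot as plt

def plot_data_2d(data, labels, axis_names):
    fig, ax = plt.subplots()
    ax.scatter(data[:, 0], data[:, 1], c=labels, cmap="coolwarm")
    ax.set_xlabel(axis_names[0])
    ax.set_ylabel(axis_names[1])
    plt.show()

def plot_data_3d(data, labels):
    fig = plt.figure()
    ax = fig.add_subplot(111, projection="3d")
    ax.scatter(data[:, 0], data[:, 1], data[:, 2], c=labels, cmap="coolwarm")
    plt.show()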
#######################################################
# model parameters and placeholders
#######################################################
# read parameters
job = basenji.dna_io.read_job_params(options.params_file)
job['num_targets'] = targets.shape[1]

# construct model
print('Constructing model')
sys.stdout.flush()

if job.get('model', 'autoencoder') == 'pca':
    # construct
    model = PCA(n_components=job['latent_dim'])

    # train
    model.fit(targets[tv_line:])

    # validate
    latent_valid = model.transform(targets[:tv_line])
    recon_valid = model.inverse_transform(latent_valid)
    valid_var = targets[:tv_line].var()
    recon_var = (targets[:tv_line] - recon_valid).var()
    r2 = 1.0 - np.divide(recon_var, valid_var)
    print('Valid R2: %7.5f' % r2.mean())

    # save
    joblib.dump(model, model_out_file)

    if options.reconstruct_out_pre:
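# Hedged follow-up (assumption, not part of the original script): the PCA model saved with
# joblib above can be reloaded later and reused for the same transform/inverse_transform
# round trip, for example:
import joblib

model_reloaded = joblib.load(model_out_file)              # model_out_file as used above
latent = model_reloaded.transform(targets[:tv_line])
recon = model_reloaded.inverse_transform(latent)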
def pca(x, comps_num=1):
    import sklearn.decomposition as deco
    # Drop constant (zero-variance) columns so standardisation does not divide by zero
    remove_cols = np.where(np.all(x == np.mean(x, 0), 0))[0]
    x = np.delete(x, remove_cols, 1)
    # Standardise the remaining columns (z-score)
    x = (x - np.mean(x, 0)) / np.std(x, 0)
    pca = deco.PCA(comps_num)
    # Note: the matrix is transposed, so PCA is computed over the columns of the input
    x = x.T
    x_r = pca.fit(x).transform(x)
    return x_r
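# Hedged usage example for the helper above (random data, made-up shapes). Because the
# helper transposes its input before fitting, the rows of the result correspond to the
# non-constant columns of x rather than to its rows:
import numpy as np

x_demo = np.random.rand(50, 8)
x_demo[:, 3] = 1.0                     # constant column, dropped by the helper
x_proj = pca(x_demo, comps_num=2)
print(x_proj.shape)                    # (7, 2): one row per remaining column of x_demo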
def main(args, _=None):
    print("[== Loading features ==]")
    features = None
    for in_npy in args.in_npy.split(","):
        features_ = np.load(in_npy, mmap_mode="r")
        if features is None:
            features = features_
        else:
            features = np.concatenate((features, features_), axis=0)

    if args.n_hidden is not None:
        pipeline = Pipeline(
            [
                ("scale", StandardScaler()),
                ("pca", PCA(n_components=args.n_hidden, random_state=42)),
                ("normalize", Normalizer()),
            ]
        )
        print("[== Transforming features ==]")
        features = pipeline.fit_transform(features)
        np.save(args.out_npy, features)
        print(
            "[ Explained variance ratio: {ratio:.4} ]".format(
                ratio=pipeline.named_steps["pca"].explained_variance_ratio_.sum()
            )
        )
        print("[== Saving pipeline ==]")
diff = np.subtract(embeddings1, embeddings2)
dist = np.sum(np.square(diff), 1)
global max_threshold
global min_threshold
for fold_idx, (train_set, test_set) in enumerate(k_fold.split(indices)):
    # print('train_set', train_set)
    # print('test_set', test_set)
    if pca > 0:
        print('doing pca on', fold_idx)
        embed1_train = embeddings1[train_set]
        embed2_train = embeddings2[train_set]
        _embed_train = np.concatenate((embed1_train, embed2_train), axis=0)
        # print(_embed_train.shape)
        pca_model = PCA(n_components=pca)
        pca_model.fit(_embed_train)
        embed1 = pca_model.transform(embeddings1)
        embed2 = pca_model.transform(embeddings2)
        embed1 = sklearn.preprocessing.normalize(embed1)
        embed2 = sklearn.preprocessing.normalize(embed2)
        # print(embed1.shape, embed2.shape)
        diff = np.subtract(embed1, embed2)
        dist = np.sum(np.square(diff), 1)

    # Find the best threshold for the fold
    acc_train = np.zeros((nrof_thresholds))
    for threshold_idx, threshold in enumerate(thresholds):
        _, _, acc_train[threshold_idx] = calculate_accuracy(threshold, dist[train_set], actual_issame[train_set])
    best_threshold_index = np.argmax(acc_train)
    # print('best_threshold_index', best_threshold_index, acc_train[best_threshold_index])
    for threshold_idx, threshold in enumerate(thresholds):
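# calculate_accuracy is called above but not shown; a minimal sketch consistent with the
# call site (returns tpr, fpr, accuracy for one distance threshold; signature assumed):
import numpy as np

def calculate_accuracy(threshold, dist, actual_issame):
    predict_issame = np.less(dist, threshold)
    tp = np.sum(np.logical_and(predict_issame, actual_issame))
    fp = np.sum(np.logical_and(predict_issame, np.logical_not(actual_issame)))
    tn = np.sum(np.logical_and(np.logical_not(predict_issame), np.logical_not(actual_issame)))
    fn = np.sum(np.logical_and(np.logical_not(predict_issame), actual_issame))
    tpr = 0.0 if (tp + fn == 0) else float(tp) / float(tp + fn)
    fpr = 0.0 if (fp + tn == 0) else float(fp) / float(fp + tn)
    acc = float(tp + tn) / dist.size
    return tpr, fpr, acc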
if options.clean:
    #### do pca twice to remove outliers
    pca = PCA(options.nbasis)
    pout = pca.fit_transform(imgsnp)
    dst = np.linalg.norm(pout - np.mean(pout, 0), axis=1)
    outlr = dst > np.mean(dst) + np.std(dst) * 2
    np.savetxt("{}/pca_rmoutlier.txt".format(options.outpath),
               np.hstack([ptclids, pout]))
    print("Removing {} outliers...".format(np.sum(outlr)))
    imgsnp = imgsnp[outlr == False]
    ptclids = ptclids[outlr == False]

pca = PCA(options.nbasis)
pout = pca.fit_transform(imgsnp)
np.savetxt("{}/pca_ptcls.txt".format(options.outpath),
           np.hstack([ptclids, pout]))

basisfile = "{}/pca_basis.hdf".format(options.outpath)
#threed.process("math.meanshrink",{"n":options.shrink}).write_image(basisfile, 0)
l = len(data[0])
for i, c in enumerate(pca.components_):
    # Each component stores real and imaginary Fourier parts concatenated;
    # recombine them, reshape to a 3-D volume, and inverse-FFT back to real space
    eg = c[:l] + c[l:] * 1j
    egmap = eg.reshape((sz, sz, sz))
    o = np.real(np.fft.ifftn(np.fft.ifftshift(egmap)))
    m = from_numpy(o.copy())
    m.write_image(basisfile, i)

print("Classifying particles...")
kmeans = KMeans(n_clusters=options.nclass).fit(pout)
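# Hedged follow-up (not in the original script): quick sanity checks on the basis and the
# clustering, using attributes that PCA and KMeans expose after fitting:
print("PCA explained variance ratio:", pca.explained_variance_ratio_)
print("Particles per class:", np.bincount(kmeans.labels_))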
def compute_pca(data, n_components):
    """Fit a PCA with the given number of components on the descriptor matrix."""
    pca = PCA(n_components=n_components)
    pca.fit(data)
    return pca
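# Hedged usage example (random data, made-up shapes):
import numpy as np

descriptors = np.random.rand(1000, 128)                   # hypothetical descriptor matrix
pca_model = compute_pca(descriptors, n_components=64)
reduced = pca_model.transform(descriptors)                # shape (1000, 64)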
    n_components: int
        The dimension of the embedding space.
    random_state: Union[int, RandomState]
        If the value is an int, random_state is the seed used by the random
        number generator. If the value is a RandomState instance, then it will
        be used as the random number generator. If the value is None, the random
        number generator is the RandomState instance used by `np.random`.

    Returns
    -------
    initialization: np.ndarray

    """
    pca_ = PCA(
        n_components=n_components, svd_solver=svd_solver, random_state=random_state
    )
    embedding = pca_.fit_transform(X)

    # The PCA embedding may have high variance, which leads to poor convergence
    normalization = np.std(embedding[:, 0]) * 100
    embedding /= normalization

    return np.ascontiguousarray(embedding)
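# Hedged standalone illustration (not the library call itself): the normalisation above
# pins the standard deviation of the first embedding axis to 0.01, giving the downstream
# optimisation a small, well-conditioned starting layout:
import numpy as np
from sklearn.decomposition import PCA

X_demo = np.random.rand(300, 20)
emb = PCA(n_components=2, random_state=0).fit_transform(X_demo)
emb /= np.std(emb[:, 0]) * 100
print(np.std(emb[:, 0]))                                  # ~0.01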