def test_pca_inverse():
    # Test that the projection of data can be inverted
    rng = np.random.RandomState(0)
    n, p = 50, 3
    X = rng.randn(n, p)  # spherical data
    X[:, 1] *= 0.00001  # make middle component relatively small
    X += [5, 4, 3]  # make a large mean

    dX = da.from_array(X, chunks=(n // 2, p))

    # same check that we can find the original data from the transformed
    # signal (since the data is almost of rank n_components)
    pca = dd.PCA(n_components=2, svd_solver="full").fit(dX)
    Y = pca.transform(dX)
    Y_inverse = pca.inverse_transform(Y)
    assert_almost_equal(X, Y_inverse, decimal=3)

    # same as above with whitening (approximate reconstruction)
    for solver in solver_list:
        pca = dd.PCA(n_components=2, whiten=True, svd_solver=solver)
        pca.fit(dX)
        Y = pca.transform(dX)
        Y_inverse = pca.inverse_transform(Y)
        assert_eq(dX, Y_inverse, atol=1e-3)
# Delayed dataframe branch of an `if/elif form == ...` dispatch (the opening clause is not shown in this snippet)
    dfParts = []
    chunkSize = min(self.CHUNK_SIZE, self.nEvents / self.ncores)
    nPartitions = int(self.nEvents // chunkSize) + 1

    # Determine the column names
    gNames = kwds.pop('groupnames', self.getGroupNames(wexpr='Stream'))
    colNames = self.name2alias(gNames)

    for p in range(nPartitions):  # Generate partitioned dataframe
        # Calculate the starting and ending index of every chunk of events
        eventIDStart = int(p * chunkSize)
        eventIDEnd = int(min(eventIDStart + chunkSize, self.nEvents))
        dfParts.append(d.delayed(self._assembleGroups)(gNames, amin=eventIDStart, amax=eventIDEnd, **kwds))

    # Construct eda (event dask array) and edf (event dask dataframe)
    eda = da.from_array(np.concatenate(d.compute(*dfParts), axis=1).T, chunks=self.CHUNK_SIZE)
    self.edf = ddf.from_dask_array(eda, columns=colNames)

    if ret == True:
        return self.edf

# Delayed array for loading an HDF5 file of reasonable size (e.g. < 1GB)
elif form == 'darray':
    gNames = kwds.pop('groupnames', self.getGroupNames(wexpr='Stream'))
    darray = d.delayed(self._assembleGroups)(gNames, amin=None, amax=None,
                                             timeStamps=timeStamps, ret='array', **kwds)

    if ret == True:
        return darray
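The dataframe branch above assembles delayed chunk loaders, computes them, and wraps the stacked result as a Dask dataframe. Below is a minimal, self-contained sketch of the same pattern; `load_chunk` is a hypothetical stand-in for `self._assembleGroups` that returns a (columns, events) block, and the column names are made up for illustration.

import dask
import dask.array as da
import dask.dataframe as ddf
import numpy as np

def load_chunk(start, end):
    # hypothetical loader: 3 columns ('a', 'b', 'c'), one entry per event in [start, end)
    return np.vstack([np.arange(start, end, dtype=float)] * 3)

n_events, chunk_size = 10, 4
parts = [dask.delayed(load_chunk)(s, min(s + chunk_size, n_events))
         for s in range(0, n_events, chunk_size)]

# materialize the delayed blocks, stack them along the event axis, and wrap as a dask dataframe
data = np.concatenate(dask.compute(*parts), axis=1).T
eda = da.from_array(data, chunks=(chunk_size, 3))
edf = ddf.from_dask_array(eda, columns=['a', 'b', 'c'])
print(edf.compute())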
def _check_inputs(self, X, accept_sparse_negative=False, copy=False):
    kwargs = {}
    if SK_022:
        kwargs["copy"] = copy
    if isinstance(X, (pd.DataFrame, dd.DataFrame)):
        X = X.values
    if isinstance(X, np.ndarray):
        C = len(X) // min(multiprocessing.cpu_count(), 2)
        X = da.from_array(X, chunks=C)

    rng = check_random_state(self.random_state)
    # TODO: non-float dtypes?
    # TODO: sparse arrays?
    # TODO: mix of sparse, dense?
    sample = rng.uniform(size=(5, X.shape[1])).astype(X.dtype)
    super(QuantileTransformer, self)._check_inputs(
        sample, accept_sparse_negative=accept_sparse_negative, **kwargs
    )
    return X
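The NumPy branch above chunks the rows so there is roughly one block per worker process, capped at two. A standalone sketch of that heuristic, with an illustrative array shape:

import multiprocessing

import dask.array as da
import numpy as np

X = np.random.uniform(size=(1000, 4))
# at most two row blocks: len(X) // min(n_cpus, 2) rows per chunk
rows_per_chunk = len(X) // min(multiprocessing.cpu_count(), 2)
dX = da.from_array(X, chunks=rows_per_chunk)
print(dX.chunks)  # e.g. ((500, 500), (4,))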
Optimizes chunk size in different orientations to facilitate rapid
screening of algorithm output.

Returns
-------
darray : Dask Array
chunks_init : tuple (len 3)
    Chunk size before ghosting; used in select cases.
"""
# Compute chunk size and convert if not a Dask Array
if not isinstance(darray, da.core.Array):
    chunk_size = util.compute_chunk_size(darray.shape,
                                         darray.dtype.itemsize,
                                         kernel=kernel,
                                         preview=preview)
    darray = da.from_array(darray, chunks=chunk_size)
    chunks_init = darray.chunks
else:
    chunks_init = darray.chunks

# Ghost Dask Array if operation specifies a kernel
if kernel is not None:
    hw = tuple(np.array(kernel) // 2)
    darray = da.ghost.ghost(darray, depth=hw, boundary='reflect')

return darray, chunks_init
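Note that `da.ghost.ghost` has since been renamed in Dask; the same halo exchange is exposed as `dask.array.overlap.overlap`. A minimal sketch of the equivalent call on a recent Dask version, with an illustrative array shape and kernel:

import dask.array as da
import numpy as np

darray = da.random.random((64, 64, 64), chunks=(32, 32, 32))
kernel = (3, 3, 3)
hw = tuple(np.array(kernel) // 2)  # half-width of the kernel per axis

# overlap() adds a reflected halo of `hw` cells around every block,
# the modern equivalent of the deprecated da.ghost.ghost()
darray_halo = da.overlap.overlap(darray, depth=hw, boundary='reflect')
print(darray_halo.chunks)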
def test_incremental_basic(scheduler, dataframes):
    # Create observations that we know linear models can recover
    n, d = 100, 3
    rng = da.random.RandomState(42)
    X = rng.normal(size=(n, d), chunks=30)
    coef_star = rng.uniform(size=d, chunks=d)
    y = da.sign(X.dot(coef_star))
    y = (y + 1) / 2
    if dataframes:
        X = dd.from_array(X)
        y = dd.from_array(y)

    with scheduler() as (s, [_, _]):
        est1 = SGDClassifier(random_state=0, tol=1e-3, average=True)
        est2 = clone(est1)

        clf = Incremental(est1, random_state=0)
        result = clf.fit(X, y, classes=[0, 1])
        assert result is clf
def test_basic(self, output_distribution):
    rs = da.random.RandomState(0)
    a = dpp.QuantileTransformer(output_distribution=output_distribution)
    b = spp.QuantileTransformer(output_distribution=output_distribution)

    X = rs.uniform(size=(1000, 3), chunks=50)
    a.fit(X)
    b.fit(X)
    assert_estimator_equal(a, b, atol=0.02)

    # set the quantiles, so that from here out, we're exact
    a.quantiles_ = b.quantiles_
    assert_eq_ar(a.transform(X), b.transform(X), atol=1e-7)
    assert_eq_ar(X, a.inverse_transform(a.transform(X)))
def test_fit_shuffle_blocks():
    N = 10
    X = da.from_array(1 + np.arange(N).reshape(-1, 1), chunks=1)
    y = da.from_array(np.ones(N), chunks=1)
    classes = [0, 1]

    sgd = SGDClassifier(
        max_iter=5, random_state=0, fit_intercept=False, shuffle=False, tol=1e-3
    )

    # with block shuffling, different random states give different fits
    sgd1 = fit(clone(sgd), X, y, random_state=0, classes=classes)
    sgd2 = fit(clone(sgd), X, y, random_state=42, classes=classes)
    assert len(sgd1.coef_) == len(sgd2.coef_) == 1
    assert not np.allclose(sgd1.coef_, sgd2.coef_)

    # without block shuffling, the fit does not depend on random_state
    X, y = make_classification(random_state=0, chunks=20)
    sgd_a = fit(clone(sgd), X, y, random_state=0, classes=classes, shuffle_blocks=False)
    sgd_b = fit(
        clone(sgd), X, y, random_state=42, classes=classes, shuffle_blocks=False
    )
    assert np.allclose(sgd_a.coef_, sgd_b.coef_)
def huge_2d_array():
    array = np.vstack(1000 * [np.arange(0, 1000)])
    return da.from_array(array, chunks=(500, 500))
def test_dask_dataframe(self):
    with LocalCUDACluster() as cluster:
        with Client(cluster) as client:
            X, y = generate_array()

            X = dd.from_dask_array(X)
            y = dd.from_dask_array(y)

            X = X.map_partitions(cudf.from_pandas)
            y = y.map_partitions(cudf.from_pandas)

            dtrain = dxgb.DaskDMatrix(client, X, y)
            out = dxgb.train(client, {'tree_method': 'gpu_hist'},
                             dtrain=dtrain,
                             evals=[(dtrain, 'X')],
                             num_boost_round=2)

            assert isinstance(out['booster'], dxgb.Booster)
            assert len(out['history']['X']['rmse']) == 2

            predictions = dxgb.predict(client, out, dtrain).compute()
            assert isinstance(predictions, np.ndarray)
def test_example(query, expected, model):
    if model == 'dask':
        sc = {k: dd.from_pandas(df, npartitions=3) for k, df in scope.items()}
        actual = fq.execute(query, scope=sc, model=model)
        actual = actual.compute()
    else:
        actual = fq.execute(query, scope=scope, model=model)

    expected = expected()

    # set empty columns in expected to the ones in actual
    expected.columns = [e or a for a, e in zip(actual.columns, expected.columns)]

    actual = actual.reset_index(drop=True)
    expected = expected.reset_index(drop=True)

    pdt.assert_frame_equal(actual, expected, check_dtype=False)