Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def test_vectorized_math_applymap_on_large_dataframe(self):
    """Compare swifter ``applymap`` with pandas on a 1,000,000-row, two-column frame.

    ``math_vec_square`` is applied element-wise through pandas
    (``progress_applymap``) and through swifter. The two results must be
    equal, and when ``self.ncores`` is greater than 1 the swifter run must
    also have taken less time than the pandas run.
    """
    LOG.info("test_vectorized_math_applymap_on_large_dataframe")
    df = pd.DataFrame({"x": np.random.normal(size=1_000_000), "y": np.random.uniform(size=1_000_000)})

    # Baseline: plain pandas driven through the tqdm progress wrapper.
    tqdm.pandas(desc="Pandas Vec math applymap ~ DF")
    # perf_counter() is monotonic and high resolution; time.time() follows the
    # wall clock and can jump if the system time is adjusted mid-measurement.
    start_pd = time.perf_counter()
    pd_val = df.progress_applymap(math_vec_square)
    pd_time = time.perf_counter() - start_pd

    # Candidate: the same function routed through the swifter accessor.
    start_swifter = time.perf_counter()
    swifter_val = df.swifter.progress_bar(desc="Vec math applymap ~ DF").applymap(math_vec_square)
    swifter_time = time.perf_counter() - start_swifter

    # NOTE(review): assertEqual on pandas objects presumably relies on a
    # type-specific equality function registered elsewhere (e.g. in setUp) - confirm.
    self.assertEqual(pd_val, swifter_val)  # equality test
    if self.ncores > 1:  # speed test
        self.assertLess(swifter_time, pd_time)
def test_nonvectorized_math_apply_on_large_series(self):
    """Compare swifter ``apply`` with pandas on a 10,000,000-element Series.

    ``math_foo`` is applied with ``compare_to=1`` through pandas
    (``progress_apply``) and through swifter. The two results must be equal,
    and when ``self.ncores`` is greater than 1 the swifter run must also
    have taken less time than the pandas run.
    """
    LOG.info("test_nonvectorized_math_apply_on_large_series")
    df = pd.DataFrame({"x": np.random.normal(size=10_000_000)})
    series = df["x"]

    # Baseline: plain pandas driven through the tqdm progress wrapper.
    tqdm.pandas(desc="Pandas Nonvec math apply ~ Series")
    # perf_counter() is monotonic and high resolution; time.time() follows the
    # wall clock and can jump if the system time is adjusted mid-measurement.
    start_pd = time.perf_counter()
    pd_val = series.progress_apply(math_foo, compare_to=1)
    pd_time = time.perf_counter() - start_pd

    # Candidate: the same function routed through the swifter accessor.
    start_swifter = time.perf_counter()
    swifter_val = series.swifter.progress_bar(desc="Nonvec math apply ~ Series").apply(math_foo, compare_to=1)
    swifter_time = time.perf_counter() - start_swifter

    # NOTE(review): assertEqual on pandas objects presumably relies on a
    # type-specific equality function registered elsewhere (e.g. in setUp) - confirm.
    self.assertEqual(pd_val, swifter_val)  # equality test
    if self.ncores > 1:  # speed test
        self.assertLess(swifter_time, pd_time)
def test_vectorized_math_apply_on_large_dataframe(self):
    """Compare swifter row-wise ``apply`` with pandas on a 1,000,000-row frame.

    ``math_vec_multiply`` is applied with ``axis=1`` through pandas
    (``progress_apply``) and through swifter. The two results must be equal,
    and when ``self.ncores`` is greater than 1 the swifter run must also
    have taken less time than the pandas run.
    """
    LOG.info("test_vectorized_math_apply_on_large_dataframe")
    df = pd.DataFrame({"x": np.random.normal(size=1_000_000), "y": np.random.uniform(size=1_000_000)})

    # Baseline: plain pandas driven through the tqdm progress wrapper.
    tqdm.pandas(desc="Pandas Vec math apply ~ DF")
    # perf_counter() is monotonic and high resolution; time.time() follows the
    # wall clock and can jump if the system time is adjusted mid-measurement.
    start_pd = time.perf_counter()
    pd_val = df.progress_apply(math_vec_multiply, axis=1)
    pd_time = time.perf_counter() - start_pd

    # Candidate: the same function routed through the swifter accessor.
    start_swifter = time.perf_counter()
    swifter_val = df.swifter.progress_bar(desc="Vec math apply ~ DF").apply(math_vec_multiply, axis=1)
    swifter_time = time.perf_counter() - start_swifter

    # NOTE(review): assertEqual on pandas objects presumably relies on a
    # type-specific equality function registered elsewhere (e.g. in setUp) - confirm.
    self.assertEqual(pd_val, swifter_val)  # equality test
    if self.ncores > 1:  # speed test
        self.assertLess(swifter_time, pd_time)
def test_vectorized_math_apply_on_large_dataframe(self):
    """Compare swifter row-wise ``apply`` with pandas on a 1,000,000-row frame.

    ``math_vec_multiply`` is applied with ``axis=1`` through pandas
    (``progress_apply``) and through swifter. The two results must be equal,
    and when ``self.ncores`` is greater than 1 the swifter run must also
    have taken less time than the pandas run.
    """
    # NOTE(review): a method with this exact name is also defined earlier in
    # this file; if both live in the same class, this later definition replaces
    # the earlier one - confirm whether the duplicate is intended.
    LOG.info("test_vectorized_math_apply_on_large_dataframe")
    df = pd.DataFrame({"x": np.random.normal(size=1_000_000), "y": np.random.uniform(size=1_000_000)})

    # Baseline: plain pandas driven through the tqdm progress wrapper.
    tqdm.pandas(desc="Pandas Vec math apply ~ DF")
    # perf_counter() is monotonic and high resolution; time.time() follows the
    # wall clock and can jump if the system time is adjusted mid-measurement.
    start_pd = time.perf_counter()
    pd_val = df.progress_apply(math_vec_multiply, axis=1)
    pd_time = time.perf_counter() - start_pd

    # Candidate: the same function routed through the swifter accessor.
    start_swifter = time.perf_counter()
    swifter_val = df.swifter.progress_bar(desc="Vec math apply ~ DF").apply(math_vec_multiply, axis=1)
    swifter_time = time.perf_counter() - start_swifter

    # NOTE(review): assertEqual on pandas objects presumably relies on a
    # type-specific equality function registered elsewhere (e.g. in setUp) - confirm.
    self.assertEqual(pd_val, swifter_val)  # equality test
    if self.ncores > 1:  # speed test
        self.assertLess(swifter_time, pd_time)
def test_nonvectorized_math_applymap_on_large_dataframe(self):
    """Compare swifter ``applymap`` with pandas on a 5,000,000-row, two-column frame.

    ``math_foo`` is applied element-wise through pandas
    (``progress_applymap``) and through swifter. The two results must be
    equal, and when ``self.ncores`` is greater than 1 the swifter run must
    also have taken less time than the pandas run.
    """
    LOG.info("test_nonvectorized_math_applymap_on_large_dataframe")
    df = pd.DataFrame({"x": np.random.normal(size=5_000_000), "y": np.random.uniform(size=5_000_000)})

    # Baseline: plain pandas driven through the tqdm progress wrapper.
    tqdm.pandas(desc="Pandas Nonvec math applymap ~ DF")
    # perf_counter() is monotonic and high resolution; time.time() follows the
    # wall clock and can jump if the system time is adjusted mid-measurement.
    start_pd = time.perf_counter()
    pd_val = df.progress_applymap(math_foo)
    pd_time = time.perf_counter() - start_pd

    # Candidate: the same function routed through the swifter accessor.
    start_swifter = time.perf_counter()
    swifter_val = df.swifter.progress_bar(desc="Nonvec math applymap ~ DF").applymap(math_foo)
    swifter_time = time.perf_counter() - start_swifter

    # NOTE(review): assertEqual on pandas objects presumably relies on a
    # type-specific equality function registered elsewhere (e.g. in setUp) - confirm.
    self.assertEqual(pd_val, swifter_val)  # equality test
    if self.ncores > 1:  # speed test
        self.assertLess(swifter_time, pd_time)
def test_vectorized_math_apply_on_large_series(self):
    """Compare swifter ``apply`` with pandas on a 1,000,000-element Series.

    ``math_vec_square`` is applied through pandas (``progress_apply``) and
    through swifter. The two results must be equal, and when ``self.ncores``
    is greater than 1 the swifter run must also have taken less time than
    the pandas run.
    """
    LOG.info("test_vectorized_math_apply_on_large_series")
    df = pd.DataFrame({"x": np.random.normal(size=1_000_000)})
    series = df["x"]

    # Baseline: plain pandas driven through the tqdm progress wrapper.
    tqdm.pandas(desc="Pandas Vec math apply ~ Series")
    # perf_counter() is monotonic and high resolution; time.time() follows the
    # wall clock and can jump if the system time is adjusted mid-measurement.
    start_pd = time.perf_counter()
    pd_val = series.progress_apply(math_vec_square)
    pd_time = time.perf_counter() - start_pd

    # Candidate: the same function routed through the swifter accessor.
    # NOTE(review): axis=0 is passed only to the swifter call, not to the
    # pandas baseline above; kept exactly as written - confirm it is intended.
    start_swifter = time.perf_counter()
    swifter_val = series.swifter.progress_bar(desc="Vec math apply ~ Series").apply(math_vec_square, axis=0)
    swifter_time = time.perf_counter() - start_swifter

    # NOTE(review): assertEqual on pandas objects presumably relies on a
    # type-specific equality function registered elsewhere (e.g. in setUp) - confirm.
    self.assertEqual(pd_val, swifter_val)  # equality test
    if self.ncores > 1:  # speed test
        self.assertLess(swifter_time, pd_time)
np.array_equal(sample_df, tmp_df) & (sample_df.shape == tmp_df.shape),
error_message="Vectorized function sample doesn't match pandas apply sample.",
)
return func(self._obj, *args, **kwds)
except ERRORS_TO_HANDLE: # if can't vectorize, estimate time to pandas apply
wrapped = self._wrapped_apply(func, convert_dtype=convert_dtype, args=args, **kwds)
timed = timeit.timeit(wrapped, number=N_REPEATS)
sample_proc_est = timed / N_REPEATS
est_apply_duration = sample_proc_est / self._SAMPLE_SIZE * self._obj.shape[0]
# if pandas sample apply takes too long and not performing str processing, use dask
if (est_apply_duration > self._dask_threshold) and allow_dask_processing:
return self._dask_apply(func, convert_dtype, *args, **kwds)
else: # use pandas
if self._progress_bar:
tqdm.pandas(desc=self._progress_bar_desc or "Pandas Apply")
return self._obj.progress_apply(func, convert_dtype=convert_dtype, args=args, **kwds)
else:
return self._obj.apply(func, convert_dtype=convert_dtype, args=args, **kwds)
# if the transformed dataframe is empty, return early using Pandas
if not self._nrows:
return self._obj_pd.apply(func, args=args, **kwds)
# estimate time to pandas apply
wrapped = self._wrapped_apply(func, *args, **kwds)
timed = timeit.timeit(wrapped, number=N_REPEATS)
sample_proc_est = timed / N_REPEATS
est_apply_duration = sample_proc_est / self._SAMPLE_SIZE * self._nrows
# No `allow_dask_processing` variable here, because we don't know the dtypes of the transformation
if est_apply_duration > self._dask_threshold:
return self._dask_apply(func, *args, **kwds)
else: # use pandas
if self._progress_bar and hasattr(self._obj_pd, "progress_apply"):
tqdm.pandas(desc=self._progress_bar_desc or "Pandas Apply")
return self._obj_pd.progress_apply(func, *args, **kwds)
else:
return self._obj_pd.apply(func, *args, **kwds)
import sys
import json
import copy
import time
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
tqdm.pandas()
from supervised.models.learner_xgboost import XgbLearner
from supervised.iterative_learner_framework import IterativeLearner
from supervised.callbacks.early_stopping import EarlyStopping
from supervised.callbacks.metric_logger import MetricLogger
from supervised.callbacks.time_constraint import TimeConstraint
from supervised.metric import Metric
from supervised.tuner.random_parameters import RandomParameters
from supervised.tuner.registry import ModelsRegistry
from supervised.tuner.registry import BINARY_CLASSIFICATION
from supervised.tuner.preprocessing_tuner import PreprocessingTuner
from supervised.tuner.hill_climbing import HillClimbing
from supervised.models.ensemble import Ensemble
from supervised.models.compute_additional_metrics import ComputeAdditionalMetrics
from supervised.preprocessing.preprocessing_exclude_missing import (
PreprocessingExcludeMissingValues,
with TQDMDaskProgressBar(desc=self._progress_bar_desc or "Dask Apply"):
return (
dd.from_pandas(self._obj, npartitions=self._npartitions)
.apply(func, *args, axis=axis, raw=raw, result_type=result_type, meta=meta, **kwds)
.compute(scheduler=self._scheduler)
)
else:
return (
dd.from_pandas(self._obj, npartitions=self._npartitions)
.apply(func, *args, axis=axis, raw=raw, result_type=result_type, meta=meta, **kwds)
.compute(scheduler=self._scheduler)
)
except ERRORS_TO_HANDLE:
# if dask apply doesn't match pandas apply, fallback to pandas
if self._progress_bar:
tqdm.pandas(desc=self._progress_bar_desc or "Pandas Apply")
apply_func = self._obj.progress_apply
else:
apply_func = self._obj.apply
return apply_func(func, axis=axis, raw=raw, result_type=result_type, args=args, **kwds)