pyunit_utils.insert_nan_in_data(self.training_data_file_enum, self.training_data_file_enum_nans,
                                self.nan_fraction)
pyunit_utils.insert_nan_in_data(self.validation_data_file_enum, self.validation_data_file_enum_nans,
                                self.nan_fraction)
pyunit_utils.insert_nan_in_data(self.test_data_file_enum, self.test_data_file_enum_nans,
                                self.nan_fraction)
pyunit_utils.insert_nan_in_data(self.training_data_file_enum_true_one_hot,
                                self.training_data_file_enum_nans_true_one_hot, self.nan_fraction)
pyunit_utils.insert_nan_in_data(self.validation_data_file_enum_true_one_hot,
                                self.validation_data_file_enum_nans_true_one_hot, self.nan_fraction)
pyunit_utils.insert_nan_in_data(self.test_data_file_enum_true_one_hot,
                                self.test_data_file_enum_nans_true_one_hot, self.nan_fraction)
# only preload data sets that will be used for multiple tests and change the response to enums
self.training_data = h2o.import_file(pyunit_utils.locate(self.training_data_file))
# set indices for response and predictor columns in data set for H2O GLM model to use
self.y_index = self.training_data.ncol-1
self.x_indices = list(range(self.y_index))
self.training_data[self.y_index] = self.training_data[self.y_index].round().asfactor()
# check to make sure all response classes are represented, otherwise, quit
if self.training_data[self.y_index].nlevels()[0] < self.class_number:
    print("Response classes are not represented in training dataset.")
    sys.exit(0)
self.valid_data = h2o.import_file(pyunit_utils.locate(self.validation_data_file))
self.valid_data[self.y_index] = self.valid_data[self.y_index].round().asfactor()
self.test_data = h2o.import_file(pyunit_utils.locate(self.test_data_file))
self.test_data[self.y_index] = self.test_data[self.y_index].round().asfactor()
def setup_data(self):
    """
    This function performs all initializations necessary:
    load the data sets and set the training set indices and response column index
    """
    # clean out the sandbox directory first
    self.sandbox_dir = pyunit_utils.make_Rsandbox_dir(self.current_dir, self.test_name, True)
    # preload data sets
    self.training1_data = h2o.import_file(path=pyunit_utils.locate(self.training1_filename))
    # set data set indices for predictors and response
    self.y_index = self.training1_data.ncol-1
    self.x_indices = list(range(self.y_index))
    self.training1_data[self.y_index] = self.training1_data[self.y_index].round().asfactor()
    # save the training data files in case the code crashes.
    pyunit_utils.remove_csv_files(self.current_dir, ".csv", action='copy', new_dir_path=self.sandbox_dir)
def setup_data(self):
    """
    This function performs all initializations necessary:
    load the data sets and set the training set indices
    """
    # create and clean out the sandbox directory first
    self.sandbox_dir = pyunit_utils.make_Rsandbox_dir(self.current_dir, self.test_name, True)
    self.training1_data = h2o.import_file(path=pyunit_utils.locate(self.training1_filenames))
    self.x_indices = list(range(self.training1_data.ncol))
    # save the training data files in case the code crashes.
    pyunit_utils.remove_csv_files(self.current_dir, ".csv", action='copy', new_dir_path=self.sandbox_dir)
pyunit_utils.insert_nan_in_data(self.training_data_file_enum_true_one_hot,
                                self.training_data_file_enum_nans_true_one_hot, self.nan_fraction)
pyunit_utils.insert_nan_in_data(self.validation_data_file_enum_true_one_hot,
                                self.validation_data_file_enum_nans_true_one_hot, self.nan_fraction)
pyunit_utils.insert_nan_in_data(self.test_data_file_enum_true_one_hot,
                                self.test_data_file_enum_nans_true_one_hot, self.nan_fraction)
# only preload data sets that will be used for multiple tests
self.training_data = h2o.import_file(pyunit_utils.locate(self.training_data_file))
# set indices for response and predictor columns in data set for H2O GLM model to use
self.y_index = self.training_data.ncol-1
self.x_indices = list(range(self.y_index))
self.valid_data = h2o.import_file(pyunit_utils.locate(self.validation_data_file))
self.test_data = h2o.import_file(pyunit_utils.locate(self.test_data_file))
# make a bigger training set by combining the training and validation data sets
self.training_data_grid = self.training_data.rbind(self.valid_data)
# save the training data files in case the code crashes.
pyunit_utils.remove_csv_files(self.current_dir, ".csv", action='copy', new_dir_path=self.sandbox_dir)
self.sandbox_dir = pyunit_utils.make_Rsandbox_dir(self.current_dir, self.test_name, True)
# randomly choose which family of GBM algo to use
self.family = self.families[random.randint(0, len(self.families)-1)]
# preload datasets, set x_indices, y_index and change response to factor for classification
if 'multinomial' in self.family:
    self.training_metric = 'logloss'
    self.training1_data = h2o.import_file(path=pyunit_utils.locate(self.training1_filenames[1]))
    self.y_index = self.training1_data.ncol-1
    self.x_indices = list(range(self.y_index))
    self.training1_data[self.y_index] = self.training1_data[self.y_index].round().asfactor()
    self.scale_model = 1
else:
    self.training1_data = h2o.import_file(path=pyunit_utils.locate(self.training1_filenames[0]))
    self.y_index = self.training1_data.ncol-1
    self.x_indices = list(range(self.y_index))
    self.scale_model = 0.75
# save the training data files in case the code crashes.
pyunit_utils.remove_csv_files(self.current_dir, ".csv", action='copy', new_dir_path=self.sandbox_dir)
import h2o
from h2o.estimators.random_forest import H2ORandomForestEstimator
import mlflow
import mlflow.h2o
h2o.init()
wine = h2o.import_file(path="wine-quality.csv")
r = wine['quality'].runif()   # one uniform random number per row, used to split the data
train = wine[r < 0.7]         # ~70% of rows for training
test = wine[r >= 0.7]         # remaining ~30% held out for testing (no overlap with train)
def train_random_forest(ntrees):
    with mlflow.start_run():
        rf = H2ORandomForestEstimator(ntrees=ntrees)
        train_cols = [n for n in wine.col_names if n != "quality"]
        rf.train(train_cols, "quality", training_frame=train, validation_frame=test)
        mlflow.log_param("ntrees", ntrees)
        mlflow.log_metric("rmse", rf.rmse())
        mlflow.log_metric("r2", rf.r2())
        mlflow.log_metric("mae", rf.mae())
log.info("Starting H2O cluster with %s cores, %s memory.", nthreads, jvm_memory)
max_port_range = 49151
min_port_range = 1024
port = os.getpid() % (max_port_range-min_port_range) + min_port_range
h2o.init(nthreads=nthreads,
         port=port,
         min_mem_size=jvm_memory,
         max_mem_size=jvm_memory,
         strict_version_check=config.framework_params.get('_strict_version_check', True)
         # log_dir=os.path.join(config.output_dir, 'logs', config.name, str(config.fold))
         )
# Load both the train and test data sets as H2O Frames
log.debug("Loading train data from %s.", dataset.train.path)
train = h2o.import_file(dataset.train.path, destination_frame=frame_name('train', config))
# train.impute(method='mean')
log.debug("Loading test data from %s.", dataset.test.path)
test = h2o.import_file(dataset.test.path, destination_frame=frame_name('test', config))
# test.impute(method='mean')
log.info("Running model on task %s, fold %s.", config.name, config.fold)
log.debug("Running H2O AutoML with a maximum time of %ss on %s core(s), optimizing %s.",
          config.max_runtime_seconds, config.cores, sort_metric)
aml = H2OAutoML(max_runtime_secs=config.max_runtime_seconds,
                max_runtime_secs_per_model=round(config.max_runtime_seconds/2),  # to prevent timeout on ensembles
                sort_metric=sort_metric,
                seed=config.seed,
                **training_params)
monitor = (BackendMemoryMonitoring(frequency_seconds=rconfig().monitoring.frequency_seconds,
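# A minimal sketch of the training call that would typically follow the AutoML setup above;
# the response reference (dataset.target.name) and the leaderboard print are assumptions,
# not part of the original snippet.
aml.train(y=dataset.target.name, training_frame=train)
print(aml.leaderboard.head())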
import h2o
import pandas
from h2o.estimators.deepwater import H2ODeepWaterEstimator
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from h2o.estimators.glm import H2OGeneralizedLinearEstimator
from h2o.estimators.stackedensemble import H2OStackedEnsembleEstimator
project_path = "/gtc-2017"
# Connect or Start H2O
h2o.init()
# Import Data
train = h2o.import_file(project_path+"/data/train-odd.csv.gz")
valid = h2o.import_file(project_path+"/data/test-odd.csv.gz")
features = list(range(0,784))
target = 784
train[features] = train[features]/255
train[target] = train[target].asfactor()
valid[features] = valid[features]/255
valid[target] = valid[target].asfactor()
# Explore Data
print(train.head())
# Cross-Validation
nfolds = 5
# Build GBM Model
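# A minimal sketch of the GBM step announced above; the hyperparameters are illustrative
# assumptions rather than values from the original demo. Cross-validation predictions are
# kept so the model could later feed the imported H2OStackedEnsembleEstimator.
gbm_model = H2OGradientBoostingEstimator(ntrees=50,
                                         nfolds=nfolds,
                                         fold_assignment="Modulo",
                                         keep_cross_validation_predictions=True,
                                         seed=1234)
gbm_model.train(x=features, y=target, training_frame=train, validation_frame=valid)
print(gbm_model.model_performance(valid))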
# The following examples use the Covertype dataset from UC Irvine, which concerns predicting forest cover based on cartographical data.
# We import the full covertype dataset (581k rows, 13 columns, 10 numerical, 3 categorical) and then split the data 3 ways:
#
# 60% for training
# 20% for validation (hyperparameter tuning)
# 20% for final testing
#
# We will train a model on one set and use the others to test its validity, ensuring that it can predict accurately on data the model has not been shown.
#
# The second set will be used for validation most of the time.
#
# The third set will be withheld until the end, to ensure that our validation accuracy is consistent with data we have never seen during the iterative process.
# In[ ]:
covtype_df = h2o.import_file(path=os.path.realpath("../data/covtype.full.csv"))
#split the data as described above
train, valid, test = covtype_df.split_frame([0.6, 0.2], seed=1234)
# Prepare predictor and response column names
covtype_X = covtype_df.col_names[:-1]   # all columns except the last are predictors
covtype_y = covtype_df.col_names[-1]    # the last column, Cover_Type, is the response
# #### First Impressions
# Let's run our first Deep Learning model on the covtype dataset.
# We want to predict the `Cover_Type` column, a categorical feature with 7 levels, and the Deep Learning model will be tasked to perform (multi-class) classification. It uses the other 12 predictors of the dataset, of which 10 are numerical, and 2 are categorical with a total of 44 levels.
#
# We can expect the Deep Learning model to have 56 input neurons (after automatic one-hot encoding). The first run will use only one epoch to get a feel for the model construction.
# In[ ]:
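# A minimal sketch of the one-epoch Deep Learning run described above; the import, the
# hidden-layer sizes, and the model variable name are assumptions, not from the original notebook.
from h2o.estimators.deeplearning import H2ODeepLearningEstimator
dl_first = H2ODeepLearningEstimator(epochs=1, hidden=[200, 200], variable_importances=True)
dl_first.train(x=covtype_X, y=covtype_y, training_frame=train, validation_frame=valid)
print(dl_first.model_performance(valid))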
from h2o.estimators.gbm import H2OGradientBoostingEstimator
iris_data_path = "http://h2o-public-test-data.s3.amazonaws.com/smalldata/iris/iris.csv" # load demonstration data
iris_df = h2o.import_file(path=iris_data_path)
iris_df.describe()
gbm_regressor = H2OGradientBoostingEstimator(distribution="gaussian", ntrees=10, max_depth=3, min_rows=2, learn_rate=0.2)
gbm_regressor.train(x=list(range(1, iris_df.ncol)), y=0, training_frame=iris_df)
gbm_regressor
gbm_classifier = H2OGradientBoostingEstimator(distribution="multinomial", ntrees=10, max_depth=3, min_rows=2, learn_rate=0.2)
gbm_classifier.train(x=list(range(0, iris_df.ncol-1)), y=iris_df.ncol-1, training_frame=iris_df)
gbm_classifier
from h2o.estimators.glm import H2OGeneralizedLinearEstimator
prostate_data_path = h2o.system_file("prostate.csv")
prostate_df = h2o.import_file(path=prostate_data_path)
prostate_df["RACE"] = prostate_df["RACE"].asfactor()
prostate_df.describe()
glm_classifier = H2OGeneralizedLinearEstimator(family="binomial", nfolds=10, alpha=0.5)
glm_classifier.train(x=["AGE","RACE","PSA","DCAPS"],y="CAPSULE", training_frame=prostate_df)
glm_classifier
from h2o.estimators.kmeans import H2OKMeansEstimator
cluster_estimator = H2OKMeansEstimator(k=3)
cluster_estimator.train(x=[0,1,2,3], training_frame=iris_df)
cluster_estimator
from h2o.transforms.decomposition import H2OPCA
pca_decomp = H2OPCA(k=2, transform="NONE", pca_method="Power")
pca_decomp.train(x=list(range(0, 4)), training_frame=iris_df)
pca_decomp