How to use the h2o.import_file function in h2o

To help you get started, we’ve selected a few h2o.import_file examples, based on popular ways the function is used in public projects.

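A minimal sketch of a typical call, before looking at the project snippets below. The path, frame name, and parse settings here are illustrative placeholders, not taken from any of these projects; only the path argument is required.

import h2o

h2o.init()  # connect to a running H2O cluster, or start one locally

# Import a CSV (local path, directory, or URL) into the cluster as an H2OFrame.
frame = h2o.import_file(
    path="data/example.csv",            # placeholder path
    destination_frame="example_data",   # optional key for the frame inside H2O
    header=1,                           # first row holds the column names
    na_strings=["NA", ""],              # strings to treat as missing values
)

frame.describe()  # quick summary of the parsed columns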

github h2oai / h2o-3 / h2o-py / dynamic_tests / testdir_algos / glm / pyunit_glm_multinomial_large.py
        pyunit_utils.insert_nan_in_data(self.training_data_file_enum, self.training_data_file_enum_nans,
                                        self.nan_fraction)
        pyunit_utils.insert_nan_in_data(self.validation_data_file_enum, self.validation_data_file_enum_nans,
                                        self.nan_fraction)
        pyunit_utils.insert_nan_in_data(self.test_data_file_enum, self.test_data_file_enum_nans,
                                        self.nan_fraction)

        pyunit_utils.insert_nan_in_data(self.training_data_file_enum_true_one_hot,
                                        self.training_data_file_enum_nans_true_one_hot, self.nan_fraction)
        pyunit_utils.insert_nan_in_data(self.validation_data_file_enum_true_one_hot,
                                        self.validation_data_file_enum_nans_true_one_hot, self.nan_fraction)
        pyunit_utils.insert_nan_in_data(self.test_data_file_enum_true_one_hot,
                                        self.test_data_file_enum_nans_true_one_hot, self.nan_fraction)

        # only preload data sets that will be used for multiple tests and change the response to enums
        self.training_data = h2o.import_file(pyunit_utils.locate(self.training_data_file))

        # set indices for response and predictor columns in data set for H2O GLM model to use
        self.y_index = self.training_data.ncol-1
        self.x_indices = list(range(self.y_index))

        self.training_data[self.y_index] = self.training_data[self.y_index].round().asfactor()

        # check to make sure all response classes are represented, otherwise, quit
        if self.training_data[self.y_index].nlevels()[0] < self.class_number:
            print("Response classes are not represented in training dataset.")
            sys.exit(0)

        self.valid_data = h2o.import_file(pyunit_utils.locate(self.validation_data_file))
        self.valid_data[self.y_index] = self.valid_data[self.y_index].round().asfactor()
        self.test_data = h2o.import_file(pyunit_utils.locate(self.test_data_file))
        self.test_data[self.y_index] = self.test_data[self.y_index].round().asfactor()
github h2oai / h2o-3 / h2o-py / dynamic_tests / testdir_algos / glm / pyunit_glm_binomial_gridsearch_randomdiscrete_large.py
def setup_data(self):
        """
        This function performs all initializations necessary:
        load the data sets and set the training set indices and response column index
        """

        # clean out the sandbox directory first
        self.sandbox_dir = pyunit_utils.make_Rsandbox_dir(self.current_dir, self.test_name, True)

        # preload data sets
        self.training1_data = h2o.import_file(path=pyunit_utils.locate(self.training1_filename))

        # set data set indices for predictors and response
        self.y_index = self.training1_data.ncol-1
        self.x_indices = list(range(self.y_index))
        self.training1_data[self.y_index] = self.training1_data[self.y_index].round().asfactor()

        # save the training data files just in case the code crashed.
        pyunit_utils.remove_csv_files(self.current_dir, ".csv", action='copy', new_dir_path=self.sandbox_dir)
github h2oai / h2o-3 / h2o-py / dynamic_tests / testdir_algos / kmeans / pyunit_kmeans_gridsearch_over_all_params_large.py
def setup_data(self):
        """
        This function performs all initializations necessary:
        load the data sets and set the training set indices
        """

        # create and clean out the sandbox directory first
        self.sandbox_dir = pyunit_utils.make_Rsandbox_dir(self.current_dir, self.test_name, True)
        self.training1_data = h2o.import_file(path=pyunit_utils.locate(self.training1_filenames))
        self.x_indices = list(range(self.training1_data.ncol))

        # save the training data files just in case the code crashed.
        pyunit_utils.remove_csv_files(self.current_dir, ".csv", action='copy', new_dir_path=self.sandbox_dir)
github h2oai / h2o-3 / h2o-py / dynamic_tests / testdir_algos / glm / pyunit_glm_gaussian_large.py
        pyunit_utils.insert_nan_in_data(self.training_data_file_enum_true_one_hot,
                                        self.training_data_file_enum_nans_true_one_hot, self.nan_fraction)
        pyunit_utils.insert_nan_in_data(self.validation_data_file_enum_true_one_hot,
                                        self.validation_data_file_enum_nans_true_one_hot, self.nan_fraction)
        pyunit_utils.insert_nan_in_data(self.test_data_file_enum_true_one_hot,
                                        self.test_data_file_enum_nans_true_one_hot, self.nan_fraction)

        # only preload data sets that will be used for multiple tests
        self.training_data = h2o.import_file(pyunit_utils.locate(self.training_data_file))

        # set indices for response and predictor columns in data set for H2O GLM model to use
        self.y_index = self.training_data.ncol-1
        self.x_indices = list(range(self.y_index))

        self.valid_data = h2o.import_file(pyunit_utils.locate(self.validation_data_file))
        self.test_data = h2o.import_file(pyunit_utils.locate(self.test_data_file))

        # make a bigger training set by combining data from validation data set
        self.training_data_grid = self.training_data.rbind(self.valid_data)

        # save the training data files just in case the code crashed.
        pyunit_utils.remove_csv_files(self.current_dir, ".csv", action='copy', new_dir_path=self.sandbox_dir)
github h2oai / h2o-3 / h2o-py / dynamic_tests / testdir_algos / gbm / pyunit_gbm_gridsearch_over_all_params_large.py
        self.sandbox_dir = pyunit_utils.make_Rsandbox_dir(self.current_dir, self.test_name, True)

        # randomly choose which family of GBM algo to use
        self.family = self.families[random.randint(0, len(self.families)-1)]

        # preload datasets, set x_indices, y_index and change response to factor for classification
        if 'multinomial' in self.family:
            self.training_metric = 'logloss'
            self.training1_data = h2o.import_file(path=pyunit_utils.locate(self.training1_filenames[1]))
            self.y_index = self.training1_data.ncol-1
            self.x_indices = list(range(self.y_index))
            self.training1_data[self.y_index] = self.training1_data[self.y_index].round().asfactor()
            self.scale_model = 1

        else:
            self.training1_data = h2o.import_file(path=pyunit_utils.locate(self.training1_filenames[0]))
            self.y_index = self.training1_data.ncol-1
            self.x_indices = list(range(self.y_index))
            self.scale_model = 0.75

        # save the training data files just in case the code crashed.
        pyunit_utils.remove_csv_files(self.current_dir, ".csv", action='copy', new_dir_path=self.sandbox_dir)
github mlflow / mlflow / examples / h2o / random_forest.py
import h2o
from h2o.estimators.random_forest import H2ORandomForestEstimator

import mlflow
import mlflow.h2o

h2o.init()

wine = h2o.import_file(path="wine-quality.csv")
r = wine['quality'].runif()
train = wine[r < 0.7]
test = wine[0.3 <= r]


def train_random_forest(ntrees):
    with mlflow.start_run():
        rf = H2ORandomForestEstimator(ntrees=ntrees)
        train_cols = [n for n in wine.col_names if n != "quality"]
        rf.train(train_cols, "quality", training_frame=train, validation_frame=test)

        mlflow.log_param("ntrees", ntrees)

        mlflow.log_metric("rmse", rf.rmse())
        mlflow.log_metric("r2", rf.r2())
        mlflow.log_metric("mae", rf.mae())
github openml / automlbenchmark / frameworks / H2OAutoML / exec.py
log.info("Starting H2O cluster with %s cores, %s memory.", nthreads, jvm_memory)
        max_port_range = 49151
        min_port_range = 1024
        port = os.getpid() % (max_port_range-min_port_range) + min_port_range

        h2o.init(nthreads=nthreads,
                 port=port,
                 min_mem_size=jvm_memory,
                 max_mem_size=jvm_memory,
                 strict_version_check=config.framework_params.get('_strict_version_check', True)
                 # log_dir=os.path.join(config.output_dir, 'logs', config.name, str(config.fold))
                 )

        # Load train and test data as H2O Frames
        log.debug("Loading train data from %s.", dataset.train.path)
        train = h2o.import_file(dataset.train.path, destination_frame=frame_name('train', config))
        # train.impute(method='mean')
        log.debug("Loading test data from %s.", dataset.test.path)
        test = h2o.import_file(dataset.test.path, destination_frame=frame_name('test', config))
        # test.impute(method='mean')

        log.info("Running model on task %s, fold %s.", config.name, config.fold)
        log.debug("Running H2O AutoML with a maximum time of %ss on %s core(s), optimizing %s.",
                  config.max_runtime_seconds, config.cores, sort_metric)

        aml = H2OAutoML(max_runtime_secs=config.max_runtime_seconds,
                        max_runtime_secs_per_model=round(config.max_runtime_seconds/2),  # to prevent timeout on ensembles
                        sort_metric=sort_metric,
                        seed=config.seed,
                        **training_params)

        monitor = (BackendMemoryMonitoring(frequency_seconds=rconfig().monitoring.frequency_seconds,
github h2oai / h2o-tutorials / archive / gtc-2017-deep-water / scripts / deep-water-stacked-ensemble.py
import h2o
import pandas

from h2o.estimators.deepwater import H2ODeepWaterEstimator
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from h2o.estimators.glm import H2OGeneralizedLinearEstimator
from h2o.estimators.stackedensemble import H2OStackedEnsembleEstimator

project_path="/gtc-2017"

# Connect or Start H2O
h2o.init()

# Import Data
train = h2o.import_file(project_path+"/data/train-odd.csv.gz")
valid = h2o.import_file(project_path+"/data/test-odd.csv.gz")

features = list(range(0,784))
target = 784

train[features] = train[features]/255
train[target] = train[target].asfactor()
valid[features] = valid[features]/255
valid[target] = valid[target].asfactor()

# Explore Data
print(train.head())

# Cross-Validation
nfolds = 5

# Build GBM Model
github h2oai / h2o-tutorials / tutorials / deeplearning / deeplearning.py
# The following examples use the Covertype dataset from UC Irvine, which concerns predicting forest cover based on cartographical data.  
# We import the full covertype dataset (581k rows, 13 columns, 10 numerical, 3 categorical) and then split the data 3 ways:  
#   
# 60% for training  
# 20% for validation (hyperparameter tuning)  
# 20% for final testing  
# 
#  We will train a model on one set and use the others to test its validity by ensuring that it can predict accurately on data the model has not been shown.  
#  
#  The second set will be used for validation most of the time.  
#  
#  The third set will be withheld until the end, to ensure that our validation accuracy is consistent with data we have never seen during the iterative process. 

# In[ ]:

covtype_df = h2o.import_file(path = os.path.realpath("../data/covtype.full.csv"))

#split the data as described above
train, valid, test = covtype_df.split_frame([0.6, 0.2], seed=1234)

#Prepare predictors and response columns
covtype_X = covtype_df.col_names[:-1]     #last column is cover_type, 
covtype_y = covtype_df.col_names[-1]    


# ####First Impressions
# Let's run our first Deep Learning model on the covtype dataset.   
# We want to predict the `Cover_Type` column, a categorical feature with 7 levels, and the Deep Learning model will be tasked to perform (multi-class) classification. It uses the other 12 predictors of the dataset, of which 10 are numerical, and 2 are categorical with a total of 44 levels.  
# 
# We can expect the Deep Learning model to have 56 input neurons (after automatic one-hot encoding). First run will be only one epoch to get a feel for the model construction.

# In[ ]:
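# The snippet stops before the model-building cell. A minimal sketch of that
# first one-epoch run might look like the following; it reuses covtype_X,
# covtype_y, train and valid from the cells above, but the estimator call is
# illustrative rather than the tutorial's original code.
from h2o.estimators.deeplearning import H2ODeepLearningEstimator

dl_first = H2ODeepLearningEstimator(epochs=1)  # a single epoch for a quick first pass
dl_first.train(x=covtype_X, y=covtype_y, training_frame=train, validation_frame=valid)
dl_first  # show the model summary and training/validation metrics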
github h2oai / h2o-3 / h2o-docs / src / booklets / v2_2015 / source / python / ipython_machinelearning_input.py
import h2o
from h2o.estimators.gbm import H2OGradientBoostingEstimator

h2o.init()  # connect to a running H2O cluster, or start one locally

iris_data_path = "http://h2o-public-test-data.s3.amazonaws.com/smalldata/iris/iris.csv" # load demonstration data
iris_df = h2o.import_file(path=iris_data_path)
iris_df.describe()
gbm_regressor = H2OGradientBoostingEstimator(distribution="gaussian", ntrees=10, max_depth=3, min_rows=2, learn_rate=0.2)
gbm_regressor.train(x=list(range(1, iris_df.ncol)), y=0, training_frame=iris_df)
gbm_regressor

gbm_classifier = H2OGradientBoostingEstimator(distribution="multinomial", ntrees=10, max_depth=3, min_rows=2, learn_rate=0.2)
gbm_classifier.train(x=list(range(0, iris_df.ncol-1)), y=iris_df.ncol-1, training_frame=iris_df)
gbm_classifier

from h2o.estimators.glm import H2OGeneralizedLinearEstimator
prostate_data_path = h2o.system_file("prostate.csv")
prostate_df = h2o.import_file(path=prostate_data_path)
prostate_df["RACE"] = prostate_df["RACE"].asfactor()
prostate_df.describe()
glm_classifier = H2OGeneralizedLinearEstimator(family="binomial", nfolds=10, alpha=0.5)
glm_classifier.train(x=["AGE","RACE","PSA","DCAPS"],y="CAPSULE", training_frame=prostate_df)
glm_classifier

from h2o.estimators.kmeans import H2OKMeansEstimator
cluster_estimator = H2OKMeansEstimator(k=3)
cluster_estimator.train(x=[0,1,2,3], training_frame=iris_df)
cluster_estimator

from h2o.transforms.decomposition import H2OPCA
pca_decomp = H2OPCA(k=2, transform="NONE", pca_method="Power")
pca_decomp.train(x=list(range(0, 4)), training_frame=iris_df)
pca_decomp