X_train, X_test, y_train, y_test = \
    train_test_split(X, y, test_size=0.20, random_state=42)

if self.meta_algo == 'NN':
    # neural-net meta models need standardized inputs
    X_train_scaled, X_test_scaled = \
        self._scale_data(X_train, X_test, save_model)
    meta_algo.fit(X_train_scaled, y_train)
else:
    meta_algo.fit(X_train, y_train)

if save_model:
    if self.verbose >= 2:
        self.logger.info(f'Saving {self.meta_algo} to {self.meta_algo}_{self.algo}_estimator.pkl')
    model_path = f'{get_path("models")}/{self.meta_algo}_{self.algo}_estimator.pkl'
    joblib.dump(meta_algo, model_path)
    json_path = f'{get_path("models")}/{self.meta_algo}_{self.algo}_estimator.json'
    # persist the dummy-encoded and original column lists alongside the model
    with open(json_path, 'w') as outfile:
        json.dump({"dummy": list(cols), "original": list(original_cols)}, outfile)

if self.meta_algo == 'NN':
    if self.verbose >= 2:
        self.logger.info(f'R squared on train set is {r2_score(y_train, meta_algo.predict(X_train_scaled))}')
    # MAPE is the mean absolute percentage error
    # https://en.wikipedia.org/wiki/Mean_absolute_percentage_error
    # runtimes cannot be negative, so clamp NN predictions at zero
    y_pred_test = np.array([max(i, 0) for i in meta_algo.predict(X_test_scaled)])
    y_pred_train = np.array([max(i, 0) for i in meta_algo.predict(X_train_scaled)])
else:
    if self.verbose >= 2:
        self.logger.info(f'R squared on train set is {r2_score(y_train, meta_algo.predict(X_train))}')
    y_pred_test = meta_algo.predict(X_test)
    y_pred_train = meta_algo.predict(X_train)
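# The excerpt stops before the MAPE computation announced above; a minimal
# sketch of how it could be derived from the predictions (assuming the
# runtimes in y_test are strictly positive):
y_test_arr = np.array(y_test).ravel()
mape_test = np.mean(np.abs((y_test_arr - y_pred_test) / y_test_arr)) * 100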
def _scale_data(self, X_train, X_test, save_model):
    """
    standardizes the train and test inputs with a StandardScaler
    fit on the training set, and
    saves the scaler as a pkl file if specified

    :param X_train: pd.DataFrame chosen as input for the training set
    :param X_test: pd.DataFrame chosen as input for the test set
    :param save_model: boolean set to True if the model needs to be saved
    :return: X_train and X_test data scaled
    :rtype: np.ndarray
    """
    scaler = StandardScaler()
    # fit on the training set only, then apply to both splits
    scaler.fit(X_train)

    if save_model:
        if self.verbose >= 2:
            self.logger.info(f'Saving scaler model to scaler_{self.algo}_estimator.pkl')
        model_path = f'{get_path("models")}/scaler_{self.algo}_estimator.pkl'
        joblib.dump(scaler, model_path)

    X_train_scaled = scaler.transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    return X_train_scaled, X_test_scaled
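# A minimal usage sketch, assuming a scaler was persisted with save_model=True;
# the path mirrors model_path above, and the algo name 'rf' is illustrative:
# scaler = joblib.load(f'{get_path("models")}/scaler_rf_estimator.pkl')
# X_new_scaled = scaler.transform(X_new)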
def _fetch_inputs(json_path):
    """
    retrieves estimation inputs (made dummy)

    :param json_path: path of the json file listing the columns
    :return: list of inputs
    """
    with open(get_path(json_path)) as json_file:
        return json.load(json_file)
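# The json read here is the dict written during model fitting, i.e.
# {"dummy": [dummy-encoded columns], "original": [raw columns]}, so callers
# recover either column list by key (the file name below is illustrative):
# dummy_cols = _fetch_inputs('models/NN_rf_estimator.json')['dummy']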
def _transform_from_csv(self, csv_name):
    """
    takes data from a csv and returns inputs and outputs in the right
    format for model_fit; this is needed because pd.read_csv can change
    the format of ambiguous columns

    :param csv_name: name of the csv from generate data
    :return: inputs and outputs
    """
    df = pd.read_csv(get_path(csv_name))
    meta_params = self.params
    parameters_list = list(meta_params['internal_params'].keys())
    external_parameters_list = list(meta_params['external_params'].keys())
    df.columns = (meta_params['other_params'] + external_parameters_list
                  + parameters_list + ['output'])

    # semi dummy inputs may be read back as strings; coerce them to floats
    semi_dummy_inputs = self.params['semi_dummy_inputs']
    for col in semi_dummy_inputs:
        df[col] = df[col].apply(self._str_to_float)

    inputs = df.drop(['output'], axis=1)
    outputs = df[['output']]
    return inputs, outputs
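# _str_to_float is not shown in this excerpt; a plausible standalone sketch
# (hypothetical, not the library's verified implementation) of coercing
# csv-read values back to floats while leaving labels like 'auto' intact:
def _str_to_float_sketch(value):
    try:
        return float(value)
    except (TypeError, ValueError):
        return value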
# the original header of this method is missing from the excerpt; the name
# and signature below are reconstructed from the docstring (hypothetical)
def _transform_data(self, df, algo, scaled=False):
    """
    transforms the dataframe of the params of the estimated
    model before predicting runtime

    :param df: dataframe of all input parameters
    :param algo: algo whose runtime the user wants to predict
    :param scaled: scaling the input if set to True
    :return: np array of all relevant algo parameters
    and system features used to estimate algo training time
    :rtype: np.array
    """
    param_dic = self._fetch_algo_metadata(algo)
    algo_name = param_dic['name']
    algo_params = param_dic['params']
    params = param_dic['config']

    json_path = f'{get_path("models")}/{self.meta_algo}_{algo_name}_estimator.json'
    estimation_inputs = self._fetch_inputs(json_path)['dummy']
    estimation_original_inputs = self._fetch_inputs(json_path)['original']

    # first we transform semi dummy features,
    # adding columns for each semi dummy feature
    # (times the number of potential dummy values)
    semi_dummy_inputs = params['semi_dummy_inputs']
    df = self._add_semi_dummy(df, semi_dummy_inputs)

    forgotten_inputs = list(set(estimation_original_inputs) - set(df.columns))

    if len(forgotten_inputs) > 0:
        # if some params that we use to train the underlying
        # meta model do not appear, we can't predict the runtime
        raise NameError(f'{forgotten_inputs} parameters missing')
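# The excerpt ends before the dummy encoding itself; a minimal standalone
# sketch of the usual alignment step (the column names here are made up):
sample_df = pd.DataFrame({'n_samples': [1000], 'penalty': ['l2']})
training_cols = ['n_samples', 'penalty_l1', 'penalty_l2']  # like estimation_inputs
dummy_df = pd.get_dummies(sample_df)  # expand categoricals into dummy columns
for col in set(training_cols) - set(dummy_df.columns):
    dummy_df[col] = 0  # add dummy columns unseen in this sample
dummy_df = dummy_df[training_cols]  # order columns as at training time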
def _add_row_to_csv(self, row_input, row_output):
    """
    writes a row into the csv results file -
    parameters (X) and number of seconds (y)

    :param row_input: row inputs
    :param row_output: row output
    :return:
    """
    csv_name = f'{self.algo}_result.csv'
    # newline='' avoids spurious blank lines with the csv module on Windows
    with open(get_path(csv_name), 'a+', newline='') as file:
        writer = csv.writer(file)
        row = list(row_input) + [row_output]
        writer.writerow(row)
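# A usage sketch (the values are made up): append one observation pairing the
# benchmarked inputs with the measured fit time in seconds:
# self._add_row_to_csv([100000, 10, 4], 2.37)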