def setUp(self):
    # create a fresh FeaturePreprocessor instance for each test
    self.fpp = FeaturePreprocessor()
converters = {'input_features': configuration.get_default_converter()}
# Initialize the reader
reader = DataReader(file_paths, file_names, converters)
data_container = reader.read(kwargs_dict={'feature_info': {'index_col': 0}})
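# note: kwargs_dict forwards per-frame keyword arguments to the
# underlying pandas readers; here the 'feature_info' file is read
# with its first column as the index (index_col=0)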
# load the Modeler to generate the predictions
model = Modeler.load_from_file(join(experiment_output_dir,
'{}.model'.format(experiment_id)))
# Add the model to the configuration object
configuration['model'] = model
# Initialize the processor
processor = FeaturePreprocessor()
(processed_config,
processed_container) = processor.process_data(configuration,
data_container,
context='rsmpredict')
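# the `context` argument selects the preprocessing path; the snippets
# further below use context='rsmeval' and the default rsmtool context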
# save the pre-processed features to disk if we were asked to
if feats_file is not None:
logger.info('Saving pre-processed feature values to {}'.format(feats_file))
feats_dir = dirname(feats_file)
# create any directories needed for the output file
os.makedirs(feats_dir, exist_ok=True)
_, feats_filename = split(feats_file)
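# a hedged sketch of the write step that would follow; the exact
# writer call is assumed rather than taken from this snippet:
# writer.write_experiment_output(feats_dir, processed_container,
#                                file_format=file_format)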
if (length_column and
        (df_filtered['length'].isnull().any() or
         df_filtered['length'].std() <= 0)):
logging.warning("The {} column either has missing values or a standard "
"deviation <= 0. No length-based analysis will be "
"provided. The column will be renamed as ##{}## and "
"saved in *train_other_columns.csv.".format(length_column,
length_column))
df_filtered.rename(columns={'length': '##{}##'.format(length_column)},
inplace=True)
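# e.g. a length column named 'word_count' becomes '##word_count##'
# and is excluded from the length-based analyses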
# if requested, exclude the candidates with less than X responses
# left after filtering
if min_candidate_items:
(df_filtered_candidates,
df_excluded_candidates) = FeaturePreprocessor.select_candidates(df_filtered,
min_candidate_items)
# check that there are still responses left for analysis
if len(df_filtered_candidates) == 0:
raise ValueError("After filtering non-numeric scores and "
"non-numeric feature values there were "
"no candidates with {} or more responses "
"left for analysis".format(min_candidate_items))
# redefine df_filtered
df_filtered = df_filtered_candidates.copy()
# update df_excluded
df_excluded = pd.concat([df_excluded, df_excluded_candidates], sort=True)
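# For reference, a minimal runnable sketch of what a candidate filter
# like `select_candidates` does, assuming responses carry a 'candidate'
# identifier column; the column name and signature are illustrative:
import pandas as pd

def select_candidates_sketch(df, min_items, candidate_col='candidate'):
    # count responses per candidate and keep only candidates that
    # still have at least `min_items` responses after filtering
    counts = df[candidate_col].value_counts()
    kept_ids = counts[counts >= min_items].index
    df_kept = df[df[candidate_col].isin(kept_ids)].copy()
    df_excluded = df[~df[candidate_col].isin(kept_ids)].copy()
    return df_kept, df_excluded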
# create separate data frames for features and sc1, all other
# information, and responses excluded during filtering
# generate raw predictions for the training set
df_train_predictions = self.predict(df_train,
                                    int(trim_min),
                                    int(trim_max),
                                    predict_expected=predict_expected_scores)
df_test_predictions = self.predict(df_test,
int(trim_min),
int(trim_max),
predict_expected=predict_expected_scores)
# get the mean and SD of the training set predictions
train_predictions_mean = df_train_predictions['raw'].mean()
train_predictions_sd = df_train_predictions['raw'].std()
# get the mean and SD of the human labels
human_labels_mean = df_train['sc1'].mean()
human_labels_sd = df_train['sc1'].std()
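# these four statistics parameterize the z-score rescaling applied in
# process_predictions below: center on the training prediction
# distribution, then match the human score mean and SD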
logging.info('Processing train set predictions.')
df_train_predictions = FeaturePreprocessor.process_predictions(df_train_predictions,
train_predictions_mean,
train_predictions_sd,
human_labels_mean,
human_labels_sd,
trim_min,
trim_max,
trim_tolerance)
logging.info('Processing test set predictions.')
df_test_predictions = FeaturePreprocessor.process_predictions(df_test_predictions,
train_predictions_mean,
train_predictions_sd,
human_labels_mean,
human_labels_sd,
trim_min,
trim_max,
trim_tolerance)
# Use the default converter for both train and test
converters = {'train': configuration.get_default_converter(),
'test': configuration.get_default_converter()}
logger.info('Reading in all data from files.')
# Initialize the reader
reader = DataReader(file_paths, file_names, converters)
data_container = reader.read()
logger.info('Preprocessing all features.')
# Initialize the processor
processor = FeaturePreprocessor()
(processed_config,
processed_container) = processor.process_data(configuration,
data_container)
# Rename certain frames with more descriptive names
# for writing out experiment files
rename_dict = {'train_excluded': 'train_excluded_responses',
'test_excluded': 'test_excluded_responses',
'train_length': 'train_response_lengths',
'train_flagged': 'train_responses_with_excluded_flags',
'test_flagged': 'test_responses_with_excluded_flags'}
logger.info('Saving training and test set data to disk.')
# Write out files
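# a hedged sketch of the write call this step performs; the exact
# arguments are assumed by analogy with the rsmeval call further below:
# writer.write_experiment_output(csvdir, processed_container,
#                                new_names_dict=rename_dict,
#                                file_format=file_format)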
(file_names,
 file_paths) = configuration.get_names_and_paths(paths, names)
file_paths = DataReader.locate_files(file_paths, configuration.configdir)
converters = {'predictions': configuration.get_default_converter()}
logger.info('Reading predictions: {}.'.format(configuration['predictions_file']))
# Initialize the reader
reader = DataReader(file_paths, file_names, converters)
data_container = reader.read()
logger.info('Preprocessing predictions.')
# Initialize the processor
processor = FeaturePreprocessor()
(processed_config,
processed_container) = processor.process_data(configuration,
data_container,
context='rsmeval')
logger.info('Saving pre-processed predictions and metadata to disk.')
writer.write_experiment_output(csvdir,
                               processed_container,
                               new_names_dict={'pred_test': 'pred_processed',
                                               'test_excluded': 'test_excluded_responses'},
                               file_format=file_format)
# Initialize the analyzer (class name assumed from the pattern above)
analyzer = Analyzer()
Returns
-------
df_pred_process : pd.DataFrame
    Data frame containing the various trimmed
    and rounded predictions.
"""
# rescale the test set predictions by boosting
# them to match the human mean and SD
scaled_test_predictions = (df_test_predictions['raw'] -
train_predictions_mean) / train_predictions_sd
scaled_test_predictions = scaled_test_predictions * human_labels_sd + human_labels_mean
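# worked example: with training prediction mean 3.2 and SD 0.5, and
# human label mean 3.0 and SD 0.8, a raw prediction of 3.7 rescales to
# (3.7 - 3.2) / 0.5 * 0.8 + 3.0 = 3.8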
df_pred_process = df_test_predictions.copy()
df_pred_process['scale'] = scaled_test_predictions
# trim and round the predictions before running the analyses
df_pred_process['raw_trim'] = FeaturePreprocessor.trim(df_pred_process['raw'],
trim_min,
trim_max,
trim_tolerance)
df_pred_process['raw_trim_round'] = np.rint(df_pred_process['raw_trim'])
df_pred_process['raw_trim_round'] = df_pred_process['raw_trim_round'].astype('int64')
df_pred_process['scale_trim'] = FeaturePreprocessor.trim(df_pred_process['scale'],
trim_min,
trim_max,
trim_tolerance)
df_pred_process['scale_trim_round'] = np.rint(df_pred_process['scale_trim'])
df_pred_process['scale_trim_round'] = df_pred_process['scale_trim_round'].astype('int64')
return df_pred_process
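# For reference, a minimal runnable sketch of the trimming step used
# above, assuming `trim` clips values into the slightly widened range
# [trim_min - tolerance, trim_max + tolerance]; the default tolerance
# value here is illustrative, not necessarily the library's own:
import numpy as np

def trim_sketch(values, trim_min, trim_max, tolerance=0.4998):
    # clip predictions so they stay within the widened score range
    # before rounding to integer scores
    return np.clip(values, trim_min - tolerance, trim_max + tolerance)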