    return pd.Series()  # tail of a truncated helper: falls back to an empty Series

if not os.path.exists(TEMP_PATH):
    os.makedirs(TEMP_PATH)

data = pd.read_csv(DATASET_PATH, low_memory=False)

# CNPJs that already have a cached geocoding result on disk
geocoded_cnpjs = [filename[:14]
                  for filename in os.listdir(TEMP_PATH)
                  if filename.endswith('.pkl')]
is_not_geocoded = ~data['cnpj'].str.replace(CNPJ_REGEX, '').isin(geocoded_cnpjs)
remaining_companies = data[is_not_geocoded]
print('%i companies, %i to go' % (len(data), len(remaining_companies)))
geocode_companies(remaining_companies)

# append the per-row geocoding columns to the original frame, aligned on the index
data = pd.concat([data,
                  data.apply(read_geocoding_info, axis=1)], axis=1)
data.to_csv(DATASET_PATH,
            compression='xz',
            encoding='utf-8',
            index=False)
shutil.rmtree(TEMP_PATH)
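The final concat above is the column-append idiom: apply(axis=1) produces one set of new columns per row, and pd.concat(..., axis=1) glues them onto the original frame. A minimal, self-contained sketch of the same idiom with invented column names:

import pandas as pd

def split_name(row):
    # derive new columns from a single row
    first, _, last = row['name'].partition(' ')
    return pd.Series({'first_name': first, 'last_name': last})

df = pd.DataFrame({'name': ['Ada Lovelace', 'Alan Turing']})
extra = df.apply(split_name, axis=1)   # one row of new columns per input row
df = pd.concat([df, extra], axis=1)    # axis=1 aligns the two frames on their index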
    return ys + [ys[-1] + [xs[0]]]  # tail of a truncated list-building helper

xfms_to_common = (
    first_level_results
    .assign(uncomposed_xfms=suffixes(list(before.xfm))[:-1] + [None] + prefixes(list(after.xfm))[1:])
    .assign(xfm_to_common=lambda df: df.apply(axis=1, func=lambda row:
              ((lambda x: s.defer(invert_xfmhandler(x)) if row.group >= common_time_pt else x)
               (s.defer(concat_xfmhandlers(row.uncomposed_xfms,
                                           name=("%s_to_common"
                                                 if row.group < common_time_pt
                                                 else "%s_from_common") % row.group))))
              if row.uncomposed_xfms is not None else None))
    .drop('uncomposed_xfms', axis=1))  # TODO None => identity??

# TODO indexing here is not good ...
first_level_determinants = pd.concat(list(first_level_results.build_model.apply(
                                         lambda x: x.determinants.assign(first_level_avg=x.avg_img))),
                                     ignore_index=True)

resampled_determinants = (
    pd.merge(left=first_level_determinants,
             right=xfms_to_common.assign(source=lambda df: df.xfm_to_common.apply(
                 lambda x: x.source if x is not None else None)),
             left_on="first_level_avg", right_on='source')
    .assign(resampled_log_full_det=lambda df: df.apply(axis=1, func=lambda row:
              s.defer(mincresample_new(img=row.log_full_det,
                                       xfm=row.xfm_to_common.xfm,
                                       like=common_model))
              if row.xfm_to_common is not None else row.img),
            resampled_log_nlin_det=lambda df: df.apply(axis=1, func=lambda row:
              s.defer(mincresample_new(img=row.log_nlin_det,
                                       xfm=row.xfm_to_common.xfm,
                                       like=common_model))
              if row.xfm_to_common is not None else row.img)))
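The core shape here is merge-then-row-wise-assign. Reduced to a toy example; the frames, columns, and the scaling step are invented stand-ins for the transform handling above:

import pandas as pd

left = pd.DataFrame({'img': ['a', 'b'], 'value': [1.0, 2.0]})
right = pd.DataFrame({'source': ['a', 'b'], 'scale': [10.0, 0.5]})

merged = (
    pd.merge(left=left, right=right, left_on='img', right_on='source')
    .assign(scaled=lambda df: df.apply(axis=1,
                                       func=lambda row: row.value * row.scale)))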
def _read_csv_sparse(filename, chunksize=1000000, fill_value=0.0, **kwargs):
    """Read a CSV file into a pandas.SparseDataFrame."""
    chunks = pd.read_csv(filename, chunksize=chunksize, **kwargs)
    data = pd.concat(chunk.to_sparse(fill_value=fill_value)
                     for chunk in chunks)
    return data
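DataFrame.to_sparse was removed in pandas 1.0, so on current pandas the same chunked read can be sketched with sparse dtypes instead (assuming all columns are numeric; the function name is illustrative):

import pandas as pd

def read_csv_sparse(filename, chunksize=1_000_000, fill_value=0.0, **kwargs):
    chunks = pd.read_csv(filename, chunksize=chunksize, **kwargs)
    sparse = pd.SparseDtype('float', fill_value=fill_value)
    # convert each chunk to sparse columns, then stack the chunks row-wise
    return pd.concat(chunk.astype(sparse) for chunk in chunks)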
num_bin = 20
gene_mean = self._exp_mean(data_norm)
# equal width (not size) of bins
bins = pd.cut(gene_mean, num_bin)

ret = []
for _, sliced in data_norm.groupby(bins):
    # Axis 0 will act on all the ROWS in each COLUMN
    # Axis 1 will act on all the COLUMNS in each ROW
    dispersion = sliced.var(axis=1) / sliced.mean(axis=1)
    zscores = (dispersion - dispersion.mean()) / dispersion.std()
    ret.append(zscores)

ret = pd.concat(ret)
ret = ret.sort_values(ascending=False)
self.top_hvg = ret.head(self.hvg_n)
ret = np.array(self.top_hvg.index)
log_debug('Finishing hvg_seurat()')
return ret
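The pattern above is: bin rows by a summary statistic, z-score a dispersion measure within each bin, then pd.concat the per-bin Series back together. A self-contained sketch with random stand-in data:

import numpy as np
import pandas as pd

data = pd.DataFrame(np.random.rand(100, 10))   # rows as genes, columns as cells
gene_mean = data.mean(axis=1)
bins = pd.cut(gene_mean, 5)                    # equal-width bins of the means

pieces = []
for _, sliced in data.groupby(bins):
    dispersion = sliced.var(axis=1) / sliced.mean(axis=1)
    pieces.append((dispersion - dispersion.mean()) / dispersion.std())

zscores = pd.concat(pieces).sort_values(ascending=False)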
    # parallel branch: fan the per-user prediction work out to a pool of workers
    _logger.info('starting predict process with %d workers', nprocs)
    with MPRecContext(algo, model), Pool(nprocs) as pool:
        results = pool.map(_predict_worker, pairs.groupby('user'))
    results = [pd.read_msgpack(r) for r in results]  # workers return msgpack-serialized frames
    _logger.info('finished predictions')
else:
    # sequential fallback: score each user in the current process
    results = []
    for user, udf in pairs.groupby('user'):
        if pfun:
            res = pfun(user, udf['item'])
            res = pd.DataFrame({'user': user, 'item': res.index, 'prediction': res.values})
        else:
            res = _predict_user(algo, model, user, udf)
        results.append(res)

results = pd.concat(results)
if 'rating' in pairs:
    return pairs.join(results.set_index(['user', 'item']), on=('user', 'item'))
return results
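The sequential branch is the collect-then-concat idiom: build one small frame per group, concatenate once, and join the predictions back onto the request pairs. A reduced sketch with the scorer stubbed out:

import pandas as pd

pairs = pd.DataFrame({'user': [1, 1, 2], 'item': ['a', 'b', 'a']})

results = []
for user, udf in pairs.groupby('user'):
    results.append(pd.DataFrame({'user': user,
                                 'item': udf['item'],
                                 'prediction': 0.0}))   # stand-in for the real scorer

results = pd.concat(results, ignore_index=True)
scored = pairs.join(results.set_index(['user', 'item']), on=['user', 'item'])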
#print 'starting cleaning_rfe...'
#train, test = cleaning_rfe(ori_train=train.copy(), ori_test=test.copy())
#print 'done cleaning_rfe'
# make dummy variables of var3 in the threshold(>=5)
var3_cnt = train.var3.value_counts()
index_var3_th = var3_cnt[(var3_cnt>=5).values].index
train['var3_tmp'] = train.var3.apply(lambda x: x if x in index_var3_th else np.nan)
test['var3_tmp'] = test.var3.apply(lambda x: x if x in index_var3_th else np.nan)
train_test = pd.concat([train,test])
#train_test.reset_index(drop=True, inplace=True)
tmp = pd.get_dummies(train_test['var3_tmp'], prefix='ohe_var3', prefix_sep='_')
train = pd.concat([train, tmp.iloc[:len(train),:]], axis=1)
test = pd.concat([test, tmp.iloc[len(train):,:]], axis=1)
del train['var3_tmp'], test['var3_tmp']
# add feature of var38
train['var38mc'] = np.isclose(train.var38, 117310.979016)
train['logvar38'] = train.loc[~train['var38mc'], 'var38'].map(np.log)
train.loc[train['var38mc'], 'logvar38'] = 0
test['var38mc'] = np.isclose(test.var38, 117310.979016)
test['logvar38'] = test.loc[~test['var38mc'], 'var38'].map(np.log)
test.loc[test['var38mc'], 'logvar38'] = 0
train['var38mc'] = train['var38mc'].astype(int)
test['var38mc'] = test['var38mc'].astype(int)
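The one-hot step relies on concatenating train and test first so both see the same dummy columns, then slicing the encoded block back apart by position. The same pattern in miniature (values invented):

import pandas as pd

train = pd.DataFrame({'var3': [1, 2, 2, 9]})
test = pd.DataFrame({'var3': [2, 9, 9]})

both = pd.concat([train, test])                       # original indexes are kept
dummies = pd.get_dummies(both['var3'], prefix='ohe_var3')

train = pd.concat([train, dummies.iloc[:len(train)]], axis=1)
test = pd.concat([test, dummies.iloc[len(train):]], axis=1)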
        # per-zone bookkeeping: label this zone's weights and sort them by outcome
        zone_weights_df[sub_geography] = zone_id
        zone_weights_df['balanced_weight'] = weights.values
        zone_weights_df['integer_weight'] = integer_weights.astype(int).values

        if status in STATUS_SUCCESS:
            integerized_weights_list.append(zone_weights_df)
            integerized_zone_ids.append(zone_id)
        else:
            rounded_weights_list.append(zone_weights_df)
            rounded_zone_ids.append(zone_id)

    if combine_results:
        integerized_weights_df = pd.concat(integerized_weights_list + rounded_weights_list)
        return integerized_weights_df

    # pd.concat raises on an empty list, so guard with the collected zone ids
    integerized_weights_df = pd.concat(integerized_weights_list) if integerized_zone_ids else None
    rounded_weights_df = pd.concat(rounded_weights_list) if rounded_zone_ids else None

    return integerized_zone_ids, rounded_zone_ids, integerized_weights_df, rounded_weights_df
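pd.concat raises ValueError when given an empty sequence, which is why the non-combined path guards each concat. A minimal illustration of that guard (zone data invented):

import pandas as pd

integerized, rounded = [], []
for zone_id, ok in [(1, True), (2, False), (3, True)]:
    df = pd.DataFrame({'zone': zone_id, 'weight': [1, 2]})
    (integerized if ok else rounded).append(df)

# concat only when there is something to combine
integerized_df = pd.concat(integerized) if integerized else None
rounded_df = pd.concat(rounded) if rounded else None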
def dataframe(self):
    """
    Returns a pandas DataFrame where each row is a representation of the
    Game class. Rows are indexed by the boxscore string.
    """
    frames = []
    for game in self.__iter__():
        df = game.dataframe
        if df is not None:
            frames.append(df)
    if frames == []:
        return None
    return pd.concat(frames)
    # single-column input: run detection on the only column
    frame['timestamp'] = timestamp
    frame['value'] = data_to_detect.iloc[:, 0]
    output = sr_detect(frame, detect_mode, batch_size, threshold, sensitivity)
    log_plot_result(frame, output, data_to_detect.columns[0], detect_mode)
else:
    # multi-column input: detect each column and collect the results side by side
    logging.debug(f'detect {column_length} columns')
    output = pd.DataFrame()
    for col in data_to_detect.columns:
        frame = pd.DataFrame(columns=['timestamp', 'value'])
        frame['timestamp'] = timestamp
        frame['value'] = data_to_detect[col]
        result = sr_detect(frame, detect_mode, batch_size, threshold, sensitivity)
        log_plot_result(frame, result, col, detect_mode)
        result.columns = [f'{rc}_{col}' for rc in result.columns]
        output = pd.concat((output, result), axis=1)
return output
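Concatenating onto output inside the loop copies the accumulated frame on every iteration; collecting the per-column results and concatenating once at the end is the usual alternative. A sketch with stand-in data and the detector replaced by a trivial transform:

import pandas as pd

data_to_detect = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]})

pieces = []
for col in data_to_detect.columns:
    result = data_to_detect[[col]] * 2          # stand-in for sr_detect(...)
    result.columns = [f'value_{col}']
    pieces.append(result)

output = pd.concat(pieces, axis=1)              # single concat after the loop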