mine = mine.values
#mine_test = pd.merge(grams_test, test_daf,on='Id')
mine_test = grams_test
mine_test = pd.merge(mine_test, test_dll,on='Id')
mine_test_id = mine_test.Id
del mine_test['Id']
clf_se = RF(n_estimators=500, n_jobs=-1, random_state=0)
clf_se.fit(mine, mine_labels)
# keep only the features whose forest importance exceeds 1.25x the mean importance
from sklearn.feature_selection import SelectFromModel
selector = SelectFromModel(clf_se, threshold='1.25*mean', prefit=True)
mine_train = selector.transform(mine)
mine_test = selector.transform(mine_test)
train_mine = pd.DataFrame(mine_train, columns=['mine_' + str(x) for x in range(mine_train.shape[1])])
train_mine.insert(0, 'Id', np.asarray(mine_Id))
test_mine = pd.DataFrame(mine_test, columns=['mine_' + str(x) for x in range(mine_test.shape[1])])
test_mine.insert(0, 'Id', np.asarray(mine_test_id))
train = pd.merge(train, train_mine, on='Id')
test = pd.merge(test, test_mine, on='Id')
train_image = pd.read_csv("train_asm_image.csv", usecols=['Id'] + ['asm_%i' % i for i in range(800)])
test_image = pd.read_csv("test_asm_image.csv", usecols=['Id'] + ['asm_%i' % i for i in range(800)])
train = pd.merge(train, train_image, on='Id')
test = pd.merge(test, test_image, on='Id')
print "the data dimension:"
print train.shape, test.shape
return train, test
def gen_submission(model):
# Join to persons to get person_id, fp_choice
joint_tours = pandas.merge(left=joint_tours, right=persons, on=['hh_id','person_num'])
# Verify we didn't lose or add rows and that we found everyone's person id
assert(len(joint_tours) == joint_tour_participants)
assert(len(joint_tours.loc[pandas.notnull(joint_tours.person_id)]) == joint_tour_participants)
# drop tour_participants so we can merge
joint_tours.drop('tour_participants', axis=1, inplace=True)
assert(sorted(list(indiv_tours.columns.values)) == sorted(list(joint_tours.columns.values)))
tours = pandas.concat([indiv_tours, joint_tours])
# tour duration
tours['tour_duration'] = tours.end_hour - tours.start_hour
# origin county
tours = pandas.merge(left=tours, right=tazdata[['COUNTY']],
left_on='orig_taz', right_index=True)
tours.rename(columns={'COUNTY':'orig_county'}, inplace=True)
# destination county, parking costs
tours = pandas.merge(left=tours, right=tazdata,
left_on='dest_taz', right_index=True)
tours.rename(columns={'COUNTY':'dest_county'}, inplace=True)
assert(len(tours) == joint_tour_participants + indiv_tours_participants)
# make sure this is a good index
dupes = tours.duplicated(subset=['hh_id','person_id','person_num','tour_category','tour_purpose','tour_id'])
assert(dupes.sum()==0)
tours['tour_purpose2'] = tours.tour_purpose # duplicate for index
tours.set_index(['hh_id','person_id','person_num','tour_category','tour_purpose2','tour_id'], inplace=True)
# default: tour_duration * OPRKCST
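# The snippet is cut off here. A minimal sketch of the commented default, assuming the
# tazdata merge above carried an hourly parking-cost column named OPRKCST and using a
# hypothetical output column name:
tours['parking_cost'] = tours.tour_duration * tours.OPRKCST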
else LineString([x[0], x[0]]))
df2 = gp.GeoDataFrame(df2, crs=cea)
if isolation:
df2['Iso'] = grisolation
if count == 0:
self.grdist = df2.copy()
else:
self.grdist = pd.concat([self.grdist, df2])
count += 1
if routes:
self.grdist = gp.GeoDataFrame(self.grdist, crs=cea)
if export_shape:
start_pointscols = sources.columns.values
end_pointscols = destinations.columns.values
if 'geometry' in end_pointscols:
self.grdist = pd.merge(self.grdist, end_points[['ID'] + end_pointscols.tolist()].drop('geometry', axis=1), left_on='ID2', right_on='ID', how='left')
else:
self.grdist = pd.merge(self.grdist, end_points[['ID']+end_pointscols.tolist()], left_on='ID2', right_on='ID', how='left')
if 'geometry' in start_pointscols:
self.grdist = pd.merge(self.grdist, start_points[['ID']+start_pointscols.tolist()].drop('geometry', axis=1), left_on='ID1', right_on='ID', how='left',
suffixes=['_2', '_1'])
else:
self.grdist = pd.merge(self.grdist, start_points[['ID']+start_pointscols.tolist()], left_on='ID1', right_on='ID', how='left',
suffixes=['_2', '_1'])
self.grdist = gp.GeoDataFrame(self.grdist, crs=cea)
self.grdist.to_file(path+'routes.shp')
pass
def ActionFeatures(Startday, PrepareDays, PredictDays, temp, dftemp):
tempfeature = temp[(temp.a_day_series < Startday) & (temp.a_day_series >= Startday - PrepareDays)].reset_index(drop=True)
templabel = temp[(temp.a_day_series >= Startday) & (temp.a_day_series < Startday + PredictDays)].reset_index(drop=True)
dftemp = pd.merge(dftemp, templabel[['user_id','a_date']].drop_duplicates(subset = 'user_id', keep='last'), on = 'user_id',how='left').fillna(0)
Checkcnt = tempfeature[['user_id','a_date']].groupby(['user_id']).count().reset_index()
Checkcnt.columns = ['user_id', 'checkcnt']
dftemp = pd.merge(dftemp,Checkcnt, how = 'left', on = 'user_id')
monthcnt = tempfeature[['user_id','a_month_series']].drop_duplicates().groupby(['user_id']).size().reset_index()
monthcnt.columns = ['user_id', 'a_monthcnt']
dftemp = pd.merge(dftemp,monthcnt, how = 'left', on = 'user_id')
tempsorted = tempfeature.sort_values(by=['user_id', 'a_day_series'])
tempfeature['daybeforelastcheck'] = tempsorted.a_day_series - tempsorted.groupby('user_id').a_day_series.shift(1)
for f in ['daybeforelastcheck', 'price', 'para_1', 'para_2', 'para_3', 'a_num','a_type','a_month_series','a_day_series']:
a = tempfeature[['user_id',f]].groupby(['user_id']).mean().reset_index()
a.columns = ['user_id', '{}_a_ave'.format(f)]
dftemp = pd.merge(dftemp,a, how = 'left', on = 'user_id')
a = tempfeature[['user_id',f]].groupby(['user_id']).std().reset_index()
a.columns = ['user_id', '{}_a_std'.format(f)]
dftemp = pd.merge(dftemp,a, how = 'left', on = 'user_id')
a = tempfeature[['user_id',f]].groupby(['user_id']).sum().reset_index()
a.columns = ['user_id', '{}_a_sum'.format(f)]
dftemp = pd.merge(dftemp,a, how = 'left', on = 'user_id')
a = tempfeature[['user_id',f]].groupby(['user_id']).median().reset_index()
a.columns = ['user_id', '{}_a_median'.format(f)]
def clean_data(cls, x, y, on, col_name):
x.replace([np.inf, -np.inf], np.nan, inplace=True)
y.replace([np.inf, -np.inf], np.nan, inplace=True)
merged_df = pd.merge(left=x, right=y, on=on, how='outer')
clean_df = merged_df.loc[merged_df.notnull().all(axis=1), :]
df_x = pd.DataFrame()
df_y = pd.DataFrame()
df_x[on] = clean_df[on].values
df_y[on] = clean_df[on].values
df_x[col_name] = clean_df[col_name + '_x'].values
df_y[col_name] = clean_df[col_name + '_y'].values
return df_x, df_y
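# Usage sketch for clean_data, with hypothetical data and class name (the method above is
# assumed to sit on a class, here called DataCleaner, decorated with @classmethod); only
# the row present in both frames and free of inf/NaN survives:
# x = pd.DataFrame({'date': [1, 2, 3], 'price': [10.0, np.inf, 12.0]})
# y = pd.DataFrame({'date': [1, 2, 4], 'price': [9.5, 11.0, 13.0]})
# df_x, df_y = DataCleaner.clean_data(x, y, on='date', col_name='price')
# df_x -> one row (date=1, price=10.0); df_y -> one row (date=1, price=9.5)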
import fedelemflowlist
import pandas as pd
from fedelemflowlist.globals import outputpath
#Set name of mapping file. More than one mapping file can be used
mapping_to_use = ['openLCA']
if __name__ == '__main__':
mapping = fedelemflowlist.get_flowmapping(mapping_to_use)
#Get Flow UUIDs for flows used in selected mapping
mapping_flow_uuids = pd.DataFrame(pd.unique(mapping['TargetFlowUUID']),columns=["Flow UUID"])
#Get all flows
all_flows = fedelemflowlist.get_flows()
#Subset all flows to get just those used in selected mapping
flows_used_in_mapping = pd.merge(all_flows,mapping_flow_uuids)
#Now write out flows and mappings
export_name = ''
for s in mapping_to_use:
export_name = export_name + s + '_'
export_name = export_name+ 'flows_w_mappings.zip'
fedelemflowlist.write_jsonld(flows_used_in_mapping,outputpath+export_name,mapping)
def make_attorney_df():
df = pd.read_html(URLS['attorney general'])[0]
df.columns = df.iloc[0]
df = df.drop(df.index[0])
df.columns = ['county'] + list(df.columns[1:])
df_ = pd.melt(df, id_vars=['county'], value_vars=list(df.columns[1:]))
party_df = df_[pd.isnull(df_['county'])][['variable','value']]
party_df.columns = ['candidate', 'party']
df_.columns = ['county', 'candidate', 'votes']
df_ = df_.dropna(subset=['county'])
df_ = pd.merge(df_, party_df, how='left')
df_['candidate'] = df_['candidate'].str.rstrip(' *')
df_['candidate'] = df_['candidate'].str.replace(r'\((.*?)\)', '', regex=True)
df_['candidate'] = df_['candidate'].str.rstrip('()')
df_['office'] = 'Attorney General'
attorney_df = df_
return attorney_df
def extract_events_from_stream(stream_df, event_type):
""" Extracts specific event from stream.
"""
events = stream_df.loc[stream_df.EventType == event_type][['EventTime', 'Event']]
events_json = events['Event'].to_json(orient="records")
json_struct = json.loads(events_json)
# TODO : get rid of structs containing all `int` types
event_extracted = pd.json_normalize(json_struct)
event_extracted = pd.merge(events['EventTime'].reset_index(), event_extracted, left_index=True, right_index=True)
if not event_extracted.empty:
event_extracted = event_extracted[['EventTime', 'order_id', 'limit_price', 'quantity', 'is_buy_order']]
event_extracted.rename(columns={'EventTime': 'TIMESTAMP',
'order_id': 'ORDER_ID',
'limit_price': 'PRICE',
'quantity': 'SIZE',
'is_buy_order': 'BUY_SELL_FLAG'}, inplace=True)
else:
event_extracted = pd.DataFrame({
'TIMESTAMP': [],
'ORDER_ID': [],
'PRICE': [],
'SIZE': [],
'BUY_SELL_FLAG': []
})
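# Usage sketch: the snippet above is cut off before its return statement; assuming it
# ends with `return event_extracted`, and assuming 'LIMIT_ORDER' is a valid EventType
# value in the stream (an illustrative value, not confirmed by the snippet):
# limit_orders = extract_events_from_stream(stream_df, 'LIMIT_ORDER')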
def _compute_number_of_frequency_generated_stop_times(self, gtfs_source_path):
"""
Parameters
----------
Same as for "_frequency_generated_trips_rows" but for stop times table
gtfs_source_path:
table_name:
Return
------
"""
df_freq = self._frequency_generated_trips_rows(gtfs_source_path, return_df_freq=True)
df_stop_times = source_csv_to_pandas(gtfs_source_path, "stop_times")
df_stop_freq = pd.merge(df_freq, df_stop_times, how='outer', on='trip_id')
return int(df_stop_freq['n_trips'].fillna(1).sum(axis=0))
Given a graph defined as a pair of dataframes (nodes and edges), the
nodes (id, coordinates) and edges (id, source, target, weight) are
joined by node id to create a single dataframe with each source/target
of an edge (including its optional weight) replaced with the respective
coordinates. For both nodes and edges, each id column is assumed to be
the index.
We also return the dimensions of each point in the final dataframe and
the accumulator function for drawing to an image.
"""
df = pd.merge(edges, nodes, left_on=[params.source], right_index=True)
df = df.rename(columns={params.x: 'src_x', params.y: 'src_y'})
df = pd.merge(df, nodes, left_on=[params.target], right_index=True)
df = df.rename(columns={params.x: 'dst_x', params.y: 'dst_y'})
df = df.sort_index()
df = df.reset_index()
if params.include_edge_id:
df = df.rename(columns={'id': 'edge_id'})
include_weight = params.weight and params.weight in edges
if params.include_edge_id:
if include_weight:
segment_class = WeightedSegment
else:
segment_class = UnweightedSegment
else: