mine = mine.values
#mine_test = pd.merge(grams_test, test_daf,on='Id')
mine_test = grams_test
mine_test = pd.merge(mine_test, test_dll,on='Id')
mine_test_id = mine_test.Id
del mine_test['Id']
clf_se = RF(n_estimators=500, n_jobs=-1, random_state=0)
clf_se.fit(mine, mine_labels)
# keep only the features whose forest importance exceeds 1.25x the mean importance
from sklearn.feature_selection import SelectFromModel
selector = SelectFromModel(clf_se, threshold='1.25*mean', prefit=True)
mine_train = selector.transform(mine)
mine_test = selector.transform(mine_test)
train_mine = pd.DataFrame(mine_train, columns=['mine_' + str(x) for x in range(mine_train.shape[1])])
train_mine.insert(0, 'Id', np.asarray(mine_Id))
test_mine = pd.DataFrame(mine_test, columns=['mine_' + str(x) for x in range(mine_test.shape[1])])
test_mine.insert(0, 'Id', np.asarray(mine_test_id))
train = pd.merge(train, train_mine, on='Id')
test = pd.merge(test, test_mine, on='Id')
train_image = pd.read_csv("train_asm_image.csv", usecols=['Id'] + ['asm_%i' % i for i in range(800)])
test_image = pd.read_csv("test_asm_image.csv", usecols=['Id'] + ['asm_%i' % i for i in range(800)])
train = pd.merge(train, train_image, on='Id')
test = pd.merge(test, test_image, on='Id')
print "the data dimension:"
print train.shape, test.shape
return train, test
def gen_submission(model):
# Join to persons to get person_id, fp_choice
joint_tours = pandas.merge(left=joint_tours, right=persons, on=['hh_id','person_num'])
# Verify we didn't lose or add rows and that we found everyone's person id
assert(len(joint_tours) == joint_tour_participants)
assert(len(joint_tours.loc[pandas.notnull(joint_tours.person_id)]) == joint_tour_participants)
# drop tour_participants so we can merge
joint_tours.drop('tour_participants', axis=1, inplace=True)
assert(sorted(list(indiv_tours.columns.values)) == sorted(list(joint_tours.columns.values)))
tours = pandas.concat([indiv_tours, joint_tours])
# tour duration
tours['tour_duration'] = tours.end_hour - tours.start_hour
# origin county
tours = pandas.merge(left=tours, right=tazdata[['COUNTY']],
left_on='orig_taz', right_index=True)
tours.rename(columns={'COUNTY':'orig_county'}, inplace=True)
# destination county, parking costs
tours = pandas.merge(left=tours, right=tazdata,
left_on='dest_taz', right_index=True)
tours.rename(columns={'COUNTY':'dest_county'}, inplace=True)
assert(len(tours) == joint_tour_participants + indiv_tours_participants)
# make sure this is a good index
dupes = tours.duplicated(subset=['hh_id','person_id','person_num','tour_category','tour_purpose','tour_id'])
assert(dupes.sum()==0)
tours['tour_purpose2'] = tours.tour_purpose # duplicate for index
tours.set_index(['hh_id','person_id','person_num','tour_category','tour_purpose2','tour_id'], inplace=True)
# default: tour_duration * OPRKCST
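# The snippet is cut off here. A minimal sketch of the commented default, assuming the
# tazdata merge above carried an hourly parking-cost column named OPRKCST and using a
# hypothetical output column name:
tours['parking_cost'] = tours.tour_duration * tours.OPRKCST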
else LineString([x[0], x[0]]))
df2 = gp.GeoDataFrame(df2, crs=cea)
if isolation:
df2['Iso'] = grisolation
if count == 0:
self.grdist = df2.copy()
else:
self.grdist = pd.concat([self.grdist, df2])
count += 1
if routes:
self.grdist = gp.GeoDataFrame(self.grdist, crs=cea)
if export_shape:
start_pointscols = sources.columns.values
end_pointscols = destinations.columns.values
if 'geometry' in end_pointscols:
self.grdist = pd.merge(self.grdist, end_points[['ID'] + end_pointscols.tolist()].drop('geometry', axis=1), left_on='ID2', right_on='ID', how='left')
else:
self.grdist = pd.merge(self.grdist, end_points[['ID']+end_pointscols.tolist()], left_on='ID2', right_on='ID', how='left')
if 'geometry' in start_pointscols:
self.grdist = pd.merge(self.grdist, start_points[['ID']+start_pointscols.tolist()].drop('geometry', axis=1), left_on='ID1', right_on='ID', how='left',
suffixes=['_2', '_1'])
else:
self.grdist = pd.merge(self.grdist, start_points[['ID']+start_pointscols.tolist()], left_on='ID1', right_on='ID', how='left',
suffixes=['_2', '_1'])
self.grdist = gp.GeoDataFrame(self.grdist, crs=cea)
self.grdist.to_file(path+'routes.shp')
pass
def ActionFeatures(Startday, PrepareDays, PredictDays, temp, dftemp):
tempfeature = temp[(temp.a_day_series < Startday) & (temp.a_day_series >= Startday - PrepareDays)].reset_index(drop=True)
templabel = temp[(temp.a_day_series >= Startday) & (temp.a_day_series < Startday + PredictDays)].reset_index(drop=True)
dftemp = pd.merge(dftemp, templabel[['user_id','a_date']].drop_duplicates(subset = 'user_id', keep='last'), on = 'user_id',how='left').fillna(0)
Checkcnt = tempfeature[['user_id','a_date']].groupby(['user_id']).count().reset_index()
Checkcnt.columns = ['user_id', 'checkcnt']
dftemp = pd.merge(dftemp,Checkcnt, how = 'left', on = 'user_id')
monthcnt = tempfeature[['user_id','a_month_series']].drop_duplicates().groupby(['user_id']).size().reset_index()
monthcnt.columns = ['user_id', 'a_monthcnt']
dftemp = pd.merge(dftemp,monthcnt, how = 'left', on = 'user_id')
tempsorted = tempfeature.sort_values(by=['user_id', 'a_day_series'])
tempfeature['daybeforelastcheck'] = tempsorted.a_day_series - tempsorted.groupby('user_id').a_day_series.shift(1)
for f in ['daybeforelastcheck', 'price', 'para_1', 'para_2', 'para_3', 'a_num','a_type','a_month_series','a_day_series']:
a = tempfeature[['user_id',f]].groupby(['user_id']).mean().reset_index()
a.columns = ['user_id', '{}_a_ave'.format(f)]
dftemp = pd.merge(dftemp,a, how = 'left', on = 'user_id')
a = tempfeature[['user_id',f]].groupby(['user_id']).std().reset_index()
a.columns = ['user_id', '{}_a_std'.format(f)]
dftemp = pd.merge(dftemp,a, how = 'left', on = 'user_id')
a = tempfeature[['user_id',f]].groupby(['user_id']).sum().reset_index()
a.columns = ['user_id', '{}_a_sum'.format(f)]
dftemp = pd.merge(dftemp,a, how = 'left', on = 'user_id')
a = tempfeature[['user_id',f]].groupby(['user_id']).median().reset_index()
a.columns = ['user_id', '{}_a_median'.format(f)]
def clean_data(cls, x, y, on, col_name):
x.replace([np.inf, -np.inf], np.nan, inplace=True)
y.replace([np.inf, -np.inf], np.nan, inplace=True)
merged_df = pd.merge(left=x, right=y, on=on, how='outer')
clean_df = merged_df.loc[merged_df.notnull().all(axis=1), :]
df_x = pd.DataFrame()
df_y = pd.DataFrame()
df_x[on] = clean_df[on].values
df_y[on] = clean_df[on].values
df_x[col_name] = clean_df[col_name + '_x'].values
df_y[col_name] = clean_df[col_name + '_y'].values
return df_x, df_y
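# Usage sketch for clean_data, with hypothetical data and class name (the method above is
# assumed to sit on a class, here called DataCleaner, decorated with @classmethod); only
# the row present in both frames and free of inf/NaN survives:
# x = pd.DataFrame({'date': [1, 2, 3], 'price': [10.0, np.inf, 12.0]})
# y = pd.DataFrame({'date': [1, 2, 4], 'price': [9.5, 11.0, 13.0]})
# df_x, df_y = DataCleaner.clean_data(x, y, on='date', col_name='price')
# df_x -> one row (date=1, price=10.0); df_y -> one row (date=1, price=9.5)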
import fedelemflowlist
import pandas as pd
from fedelemflowlist.globals import outputpath
#Set name of mapping file. More than one mapping file can be used
mapping_to_use = ['openLCA']
if __name__ == '__main__':
mapping = fedelemflowlist.get_flowmapping(mapping_to_use)
#Get Flow UUIDs for flows used in selected mapping
mapping_flow_uuids = pd.DataFrame(pd.unique(mapping['TargetFlowUUID']),columns=["Flow UUID"])
#Get all flows
all_flows = fedelemflowlist.get_flows()
#Subset all flows to get just those used in selected mapping
flows_used_in_mapping = pd.merge(all_flows,mapping_flow_uuids)
#Now write out flows and mappings
export_name = ''
for s in mapping_to_use:
export_name = export_name + s + '_'
export_name = export_name+ 'flows_w_mappings.zip'
fedelemflowlist.write_jsonld(flows_used_in_mapping,outputpath+export_name,mapping)
def make_attorney_df():
df = pd.read_html(URLS['attorney general'])[0]
df.columns = df.iloc[0]
df = df.drop(df.index[0])
df.columns = ['county'] + list(df.columns[1:])
df_ = pd.melt(df, id_vars=['county'], value_vars=list(df.columns[1:]))
party_df = df_[pd.isnull(df_['county'])][['variable','value']]
party_df.columns = ['candidate', 'party']
df_.columns = ['county', 'candidate', 'votes']
df_ = df_.dropna(subset=['county'])
df_ = pd.merge(df_, party_df, how='left')
df_['candidate'] = df_['candidate'].str.rstrip(' *')
df_['candidate'] = df_['candidate'].str.replace(r'\((.*?)\)', '', regex=True)
df_['candidate'] = df_['candidate'].str.rstrip('()')
df_['office'] = 'Attorney General'
attorney_df = df_
return attorney_df
def extract_events_from_stream(stream_df, event_type):
""" Extracts specific event from stream.
"""
events = stream_df.loc[stream_df.EventType == event_type][['EventTime', 'Event']]
events_json = events['Event'].to_json(orient="records")
json_struct = json.loads(events_json)
# TODO : get rid of structs containing all `int` types
event_extracted = pd.json_normalize(json_struct)
event_extracted = pd.merge(events['EventTime'].reset_index(), event_extracted, left_index=True, right_index=True)
if not event_extracted.empty:
event_extracted = event_extracted[['EventTime', 'order_id', 'limit_price', 'quantity', 'is_buy_order']]
event_extracted.rename(columns={'EventTime': 'TIMESTAMP',
'order_id': 'ORDER_ID',
'limit_price': 'PRICE',
'quantity': 'SIZE',
'is_buy_order': 'BUY_SELL_FLAG'}, inplace=True)
else:
event_extracted = pd.DataFrame({
'TIMESTAMP': [],
'ORDER_ID': [],
'PRICE': [],
'SIZE': [],
'BUY_SELL_FLAG': []
})
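# Usage sketch: the snippet above is cut off before its return statement; assuming it
# ends with `return event_extracted`, and assuming 'LIMIT_ORDER' is a valid EventType
# value in the stream (an illustrative value, not confirmed by the snippet):
# limit_orders = extract_events_from_stream(stream_df, 'LIMIT_ORDER')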
def _compute_number_of_frequency_generated_stop_times(self, gtfs_source_path):
"""
Parameters
----------
Same as for "_frequency_generated_trips_rows" but for stop times table
gtfs_source_path:
table_name:
Return
------
"""
df_freq = self._frequency_generated_trips_rows(gtfs_source_path, return_df_freq=True)
df_stop_times = source_csv_to_pandas(gtfs_source_path, "stop_times")
df_stop_freq = pd.merge(df_freq, df_stop_times, how='outer', on='trip_id')
return int(df_stop_freq['n_trips'].fillna(1).sum(axis=0))
Given a graph defined as a pair of dataframes (nodes and edges), the
nodes (id, coordinates) and edges (id, source, target, weight) are
joined by node id to create a single dataframe with each source/target
of an edge (including its optional weight) replaced with the respective
coordinates. For both nodes and edges, each id column is assumed to be
the index.
We also return the dimensions of each point in the final dataframe and
the accumulator function for drawing to an image.
"""
df = pd.merge(edges, nodes, left_on=[params.source], right_index=True)
df = df.rename(columns={params.x: 'src_x', params.y: 'src_y'})
df = pd.merge(df, nodes, left_on=[params.target], right_index=True)
df = df.rename(columns={params.x: 'dst_x', params.y: 'dst_y'})
df = df.sort_index()
df = df.reset_index()
if params.include_edge_id:
df = df.rename(columns={'id': 'edge_id'})
include_weight = params.weight and params.weight in edges
if params.include_edge_id:
if include_weight:
segment_class = WeightedSegment
else:
segment_class = UnweightedSegment
else: