From 23802d4a5354ec827b2748ccb81cf29156360150 Mon Sep 17 00:00:00 2001 From: Chris Date: Sun, 20 Sep 2015 08:35:05 -0500 Subject: [PATCH] ENH: add merge indicator to DataFrame.merge --- pandas/core/frame.py | 4 +-- pandas/tools/tests/test_merge.py | 42 +++++++++++++++++++++++--------- 2 files changed, 32 insertions(+), 14 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 50fecf9b2886c..9d40cf3921b98 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4265,12 +4265,12 @@ def _join_compat(self, other, on=None, how='left', lsuffix='', rsuffix='', @Appender(_merge_doc, indents=2) def merge(self, right, how='inner', on=None, left_on=None, right_on=None, left_index=False, right_index=False, sort=False, - suffixes=('_x', '_y'), copy=True): + suffixes=('_x', '_y'), copy=True, indicator=False): from pandas.tools.merge import merge return merge(self, right, how=how, on=on, left_on=left_on, right_on=right_on, left_index=left_index, right_index=right_index, sort=sort, - suffixes=suffixes, copy=copy) + suffixes=suffixes, copy=copy, indicator=indicator) def round(self, decimals=0, out=None): """ diff --git a/pandas/tools/tests/test_merge.py b/pandas/tools/tests/test_merge.py index 24b5feb21f5ac..929a72cfd4adc 100644 --- a/pandas/tools/tests/test_merge.py +++ b/pandas/tools/tests/test_merge.py @@ -951,25 +951,27 @@ def test_indicator(self): df1 = pd.DataFrame({'col1':[0,1], 'col_left':['a','b'], 'col_conflict':[1,2]}) df1_copy = df1.copy() - df2 = pd.DataFrame({'col1':[1,2,3,4,5],'col_right':[2,2,2,2,2], + df2 = pd.DataFrame({'col1':[1,2,3,4,5],'col_right':[2,2,2,2,2], 'col_conflict':[1,2,3,4,5]}) df2_copy = df2.copy() - - df_result = pd.DataFrame({'col1':[0,1,2,3,4,5], + + df_result = pd.DataFrame({'col1':[0,1,2,3,4,5], 'col_conflict_x':[1,2,np.nan,np.nan,np.nan,np.nan], - 'col_left':['a','b', np.nan,np.nan,np.nan,np.nan], - 'col_conflict_y':[np.nan,1,2,3,4,5], + 'col_left':['a','b', np.nan,np.nan,np.nan,np.nan], + 'col_conflict_y':[np.nan,1,2,3,4,5], 'col_right':[np.nan, 2,2,2,2,2]}, dtype='float64') df_result['_merge'] = pd.Categorical(['left_only','both','right_only', 'right_only','right_only','right_only'] , categories=['left_only', 'right_only', 'both']) - df_result = df_result[['col1', 'col_conflict_x', 'col_left', + df_result = df_result[['col1', 'col_conflict_x', 'col_left', 'col_conflict_y', 'col_right', '_merge' ]] test = pd.merge(df1, df2, on='col1', how='outer', indicator=True) assert_frame_equal(test, df_result) + test = df1.merge(df2, on='col1', how='outer', indicator=True) + assert_frame_equal(test, df_result) # No side effects assert_frame_equal(df1, df1_copy) @@ -981,49 +983,65 @@ def test_indicator(self): test_custom_name = pd.merge(df1, df2, on='col1', how='outer', indicator='custom_name') assert_frame_equal(test_custom_name, df_result_custom_name) + test_custom_name = df1.merge(df2, on='col1', how='outer', indicator='custom_name') + assert_frame_equal(test_custom_name, df_result_custom_name) # Check only accepts strings and booleans with tm.assertRaises(ValueError): pd.merge(df1, df2, on='col1', how='outer', indicator=5) + with tm.assertRaises(ValueError): + df1.merge(df2, on='col1', how='outer', indicator=5) # Check result integrity - + test2 = pd.merge(df1, df2, on='col1', how='left', indicator=True) self.assertTrue((test2._merge != 'right_only').all()) + test2 = df1.merge(df2, on='col1', how='left', indicator=True) + self.assertTrue((test2._merge != 'right_only').all()) test3 = pd.merge(df1, df2, on='col1', how='right', indicator=True) self.assertTrue((test3._merge != 'left_only').all()) + test3 = df1.merge(df2, on='col1', how='right', indicator=True) + self.assertTrue((test3._merge != 'left_only').all()) test4 = pd.merge(df1, df2, on='col1', how='inner', indicator=True) self.assertTrue((test4._merge == 'both').all()) + test4 = df1.merge(df2, on='col1', how='inner', indicator=True) + self.assertTrue((test4._merge == 'both').all()) # Check if working name in df for i in ['_right_indicator', '_left_indicator', '_merge']: df_badcolumn = pd.DataFrame({'col1':[1,2], i:[2,2]}) - + with tm.assertRaises(ValueError): pd.merge(df1, df_badcolumn, on='col1', how='outer', indicator=True) + with tm.assertRaises(ValueError): + df1.merge(df_badcolumn, on='col1', how='outer', indicator=True) # Check for name conflict with custom name df_badcolumn = pd.DataFrame({'col1':[1,2], 'custom_column_name':[2,2]}) - + with tm.assertRaises(ValueError): pd.merge(df1, df_badcolumn, on='col1', how='outer', indicator='custom_column_name') + with tm.assertRaises(ValueError): + df1.merge(df_badcolumn, on='col1', how='outer', indicator='custom_column_name') # Merge on multiple columns df3 = pd.DataFrame({'col1':[0,1], 'col2':['a','b']}) df4 = pd.DataFrame({'col1':[1,1,3], 'col2':['b','x','y']}) - hand_coded_result = pd.DataFrame({'col1':[0,1,1,3.0], + hand_coded_result = pd.DataFrame({'col1':[0,1,1,3.0], 'col2':['a','b','x','y']}) hand_coded_result['_merge'] = pd.Categorical( ['left_only','both','right_only','right_only'] , categories=['left_only', 'right_only', 'both']) - + test5 = pd.merge(df3, df4, on=['col1', 'col2'], how='outer', indicator=True) assert_frame_equal(test5, hand_coded_result) - + test5 = df3.merge(df4, on=['col1', 'col2'], how='outer', indicator=True) + assert_frame_equal(test5, hand_coded_result) + def _check_merge(x, y): for how in ['inner', 'left', 'outer']: