Skip to content

Commit 8faf51d

Browse files
author
Nick Eubank
committed
Create indicator for obs from left, right, or both
1 parent b7c3271 commit 8faf51d

File tree

2 files changed

+81
-3
lines changed

2 files changed

+81
-3
lines changed

pandas/tools/merge.py

+40-3
Original file line numberDiff line numberDiff line change
@@ -30,11 +30,11 @@
3030
@Appender(_merge_doc, indents=0)
3131
def merge(left, right, how='inner', on=None, left_on=None, right_on=None,
3232
left_index=False, right_index=False, sort=False,
33-
suffixes=('_x', '_y'), copy=True):
33+
suffixes=('_x', '_y'), copy=True, indicator=False):
3434
op = _MergeOperation(left, right, how=how, on=on, left_on=left_on,
3535
right_on=right_on, left_index=left_index,
3636
right_index=right_index, sort=sort, suffixes=suffixes,
37-
copy=copy)
37+
copy=copy, indicator=indicator)
3838
return op.get_result()
3939
if __debug__:
4040
merge.__doc__ = _merge_doc % '\nleft : DataFrame'
@@ -160,7 +160,7 @@ class _MergeOperation(object):
160160
def __init__(self, left, right, how='inner', on=None,
161161
left_on=None, right_on=None, axis=1,
162162
left_index=False, right_index=False, sort=True,
163-
suffixes=('_x', '_y'), copy=True):
163+
suffixes=('_x', '_y'), copy=True, indicator=False):
164164
self.left = self.orig_left = left
165165
self.right = self.orig_right = right
166166
self.how = how
@@ -177,12 +177,18 @@ def __init__(self, left, right, how='inner', on=None,
177177
self.left_index = left_index
178178
self.right_index = right_index
179179

180+
self.indicator = indicator
181+
180182
# note this function has side effects
181183
(self.left_join_keys,
182184
self.right_join_keys,
183185
self.join_names) = self._get_merge_keys()
184186

185187
def get_result(self):
188+
if self.indicator:
189+
# Has side-effects. Most cleaned up in `self._indicator_post_merge`
190+
self._indicator_pre_merge(self.left, self.right)
191+
186192
join_index, left_indexer, right_indexer = self._get_join_info()
187193

188194
ldata, rdata = self.left._data, self.right._data
@@ -202,10 +208,41 @@ def get_result(self):
202208
typ = self.left._constructor
203209
result = typ(result_data).__finalize__(self, method='merge')
204210

211+
if self.indicator:
212+
# Has side-effects
213+
self._indicator_post_merge(result, self.left, self.right)
214+
205215
self._maybe_add_join_keys(result, left_indexer, right_indexer)
206216

207217
return result
208218

219+
def _indicator_pre_merge(self, left, right):
    """
    Tag both inputs with a helper column ahead of an ``indicator=True`` merge.

    The left frame gets an int8 column ``_left_indicator`` filled with 1
    and the right frame gets an int8 column ``_right_indicator`` filled
    with 2.

    The helper columns are written to private copies, which are then
    stored on ``self.left`` and ``self.right``; the frames passed in are
    never modified.  Tagging the inputs in place breaks a self-merge
    (``left is right``), because both helper columns end up on the same
    object, and leaves stray columns on the caller's data whenever the
    merge fails before the cleanup step runs.

    Parameters
    ----------
    left, right : DataFrame
        The two frames about to be merged.

    Raises
    ------
    ValueError
        If either frame already has a column named ``_left_indicator``,
        ``_right_indicator`` or ``_merge``.
    """
    reserved_names = ('_left_indicator', '_right_indicator', '_merge')
    columns = left.columns.values.tolist() + right.columns.values.tolist()

    # Validate before copying so a rejected call does no extra work.
    for name in reserved_names:
        if name in columns:
            raise ValueError("Cannot use `indicator=True` option when "
                             "data contains a column named {}".format(name))

    # Work on copies: the helper columns must never reach the caller.
    left = left.copy()
    right = right.copy()

    left['_left_indicator'] = 1
    left['_left_indicator'] = left['_left_indicator'].astype('int8')

    right['_right_indicator'] = 2
    right['_right_indicator'] = right['_right_indicator'].astype('int8')

    # The merge itself reads its inputs from these attributes.
    self.left = left
    self.right = right
232+
233+
234+
def _indicator_post_merge(self, result, left, right):
    """
    Collapse the helper columns of a merged frame into a ``_merge`` column.

    ``result`` must contain ``_left_indicator`` and ``_right_indicator``.
    A missing value in either column means the row had no match on that
    side.  Their sum is turned into a categorical ``_merge`` column with
    categories ``left_only``, ``right_only`` and ``both``.

    Parameters
    ----------
    result : DataFrame
        The merged frame.  Modified in place: ``_merge`` is added and the
        two helper columns are removed.
    left : DataFrame
        Frame carrying ``_left_indicator``; the column is dropped in place.
    right : DataFrame
        Frame carrying ``_right_indicator``; the column is dropped in place.
    """
    # Assign the filled columns back explicitly.  ``result[col]`` may hand
    # out a temporary object, in which case ``fillna(..., inplace=True)``
    # on it would be silently lost.
    for helper in ('_left_indicator', '_right_indicator'):
        result[helper] = result[helper].fillna(0)

    # The left helper holds 1 and the right helper holds 2, so the sum is
    # 1 (left only), 2 (right only) or 3 (both).
    codes = result['_left_indicator'] + result['_right_indicator']
    source = Categorical(codes, categories=[1, 2, 3])

    # Rename on the Categorical itself and assign the result; the
    # ``inplace`` keyword of ``rename_categories`` is not available in
    # newer pandas releases.
    result['_merge'] = source.rename_categories(
        ['left_only', 'right_only', 'both'])

    # Cleanup
    result.drop(labels=['_left_indicator', '_right_indicator'], axis=1,
                inplace=True)
    left.drop(labels=['_left_indicator'], axis=1, inplace=True)
    right.drop(labels=['_right_indicator'], axis=1, inplace=True)
245+
209246
def _maybe_add_join_keys(self, result, left_indexer, right_indexer):
210247
# insert group keys
211248

pandas/tools/tests/test_merge.py

+41
Original file line numberDiff line numberDiff line change
@@ -856,6 +856,47 @@ def test_overlapping_columns_error_message(self):
856856

857857
self.assertRaises(ValueError, merge, df, df2)
858858

859+
def test_indicator(self):
    """Check ``merge(..., indicator=True)`` for each join type."""
    left = pd.DataFrame({'col1': [0, 1], 'col_left': ['a', 'b']})
    left_before = left.copy()

    right = pd.DataFrame({'col1': [1, 2, 3, 4, 5],
                          'col_right': [2, 2, 2, 2, 2]})
    right_before = right.copy()

    expected = pd.DataFrame(
        {'col1': [0, 1, 2, 3, 4, 5],
         'col_left': ['a', 'b', np.nan, np.nan, np.nan, np.nan],
         'col_right': [np.nan, 2, 2, 2, 2, 2]},
        dtype='float64')
    expected['_merge'] = pd.Categorical(
        ['left_only', 'both',
         'right_only', 'right_only', 'right_only', 'right_only'],
        categories=['left_only', 'right_only', 'both'])

    # Outer join: every category can appear.
    outer = pd.merge(left, right, on='col1', how='outer', indicator=True)
    assert_frame_equal(outer, expected)

    # The inputs must come back exactly as they went in.
    assert_frame_equal(left, left_before)
    assert_frame_equal(right, right_before)

    # Each restricted join type rules out some categories.
    left_join = pd.merge(left, right, on='col1', how='left',
                         indicator=True)
    assert (left_join['_merge'] != 'right_only').all()

    right_join = pd.merge(left, right, on='col1', how='right',
                          indicator=True)
    assert (right_join['_merge'] != 'left_only').all()

    inner_join = pd.merge(left, right, on='col1', how='inner',
                          indicator=True)
    assert (inner_join['_merge'] == 'both').all()

    # Column names used internally by the indicator must be rejected.
    reserved_names = ['_right_indicator', '_left_indicator', '_merge']
    for reserved in reserved_names:
        clashing = pd.DataFrame({'col1': [1, 2], reserved: [2, 2]})

        with tm.assertRaises(ValueError):
            pd.merge(left, clashing, on='col1', how='outer',
                     indicator=True)
898+
899+
859900
def _check_merge(x, y):
860901
for how in ['inner', 'left', 'outer']:
861902
result = x.join(y, how=how)

0 commit comments

Comments
 (0)