Skip to content

Commit 5f61765

Browse files
author
Nick Eubank
committed
Create indicator for obs from left, right, or both
1 parent fe735be commit 5f61765

File tree

5 files changed

+111
-4
lines changed

5 files changed

+111
-4
lines changed

doc/source/merging.rst

+7-1
Original file line numberDiff line numberDiff line change
@@ -492,7 +492,7 @@ standard database join operations between DataFrame objects:
492492

493493
merge(left, right, how='inner', on=None, left_on=None, right_on=None,
494494
left_index=False, right_index=False, sort=True,
495-
suffixes=('_x', '_y'), copy=True)
495+
suffixes=('_x', '_y'), copy=True, indicator=False)
496496

497497
Here's a description of what each argument is for:
498498

@@ -523,6 +523,12 @@ Here's a description of what each argument is for:
523523
cases but may improve performance / memory usage. The cases where copying
524524
can be avoided are somewhat pathological but this option is provided
525525
nonetheless.
526+
- ``indicator``: If ``True``, add a column to the output DataFrame called ``_merge``
527+
with information on the source of each row. ``_merge`` is Categorical-type
528+
and takes on a value of ``left_only`` for observations whose merge key
529+
only appears in the ``'left'`` DataFrame, ``right_only`` for observations whose
530+
merge key only appears in the ``'right'`` DataFrame, and ``both`` if the
531+
observation's merge key is found in both.
526532

527533
The return type will be the same as ``left``. If ``left`` is a ``DataFrame``
528534
and ``right`` is a subclass of DataFrame, the return type will still be

doc/source/whatsnew/v0.17.0.txt

+7
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,13 @@ Check the :ref:`API Changes <whatsnew_0170.api>` and :ref:`deprecations <whatsne
2020

2121
New features
2222
~~~~~~~~~~~~
23+
- ``merge`` now accepts the argument ``indicator``. If ``True``, it adds a column called ``_merge`` to the output DataFrame with information on the source of each row. ``_merge`` is Categorical-type and takes on a value of ``left_only`` for observations whose merge key only appears in the ``'left'`` DataFrame, ``right_only`` for observations whose merge key only appears in the ``'right'`` DataFrame, and ``both`` if the observation's merge key is found in both. (:issue:`7412` and :issue:`8790`.)
24+
25+
.. ipython:: python
26+
27+
df1 = pd.DataFrame({'col1':[0,1], 'col_left':['a','b']})
28+
df2 = pd.DataFrame({'col1':[1,2,2],'col_right':[2,2,2]})
29+
pd.merge(df1, df2, on='col1', how='outer', indicator=True)
2330

2431
.. _whatsnew_0170.enhancements.other:
2532

pandas/core/frame.py

+7
Original file line numberDiff line numberDiff line change
@@ -114,6 +114,13 @@
114114
side, respectively
115115
copy : boolean, default True
116116
If False, do not copy data unnecessarily
117+
indicator : boolean, default False
118+
If True, adds a column to output DataFrame called "_merge" with
119+
information on the source of each row. "_merge" is Categorical-type
120+
and takes on a value of "left_only" for observations whose merge key
121+
only appears in 'left' DataFrame, "right_only" for observations whose
122+
merge key only appears in 'right' DataFrame, and "both" if the
123+
observation's merge key is found in both.
117124
118125
Examples
119126
--------

pandas/tools/merge.py

+42-3
Original file line numberDiff line numberDiff line change
@@ -30,11 +30,11 @@
3030
@Appender(_merge_doc, indents=0)
3131
def merge(left, right, how='inner', on=None, left_on=None, right_on=None,
3232
left_index=False, right_index=False, sort=False,
33-
suffixes=('_x', '_y'), copy=True):
33+
suffixes=('_x', '_y'), copy=True, indicator=False):
3434
op = _MergeOperation(left, right, how=how, on=on, left_on=left_on,
3535
right_on=right_on, left_index=left_index,
3636
right_index=right_index, sort=sort, suffixes=suffixes,
37-
copy=copy)
37+
copy=copy, indicator=indicator)
3838
return op.get_result()
3939
if __debug__:
4040
merge.__doc__ = _merge_doc % '\nleft : DataFrame'
@@ -160,7 +160,7 @@ class _MergeOperation(object):
160160
def __init__(self, left, right, how='inner', on=None,
161161
left_on=None, right_on=None, axis=1,
162162
left_index=False, right_index=False, sort=True,
163-
suffixes=('_x', '_y'), copy=True):
163+
suffixes=('_x', '_y'), copy=True, indicator=False):
164164
self.left = self.orig_left = left
165165
self.right = self.orig_right = right
166166
self.how = how
@@ -177,12 +177,17 @@ def __init__(self, left, right, how='inner', on=None,
177177
self.left_index = left_index
178178
self.right_index = right_index
179179

180+
self.indicator = indicator
181+
180182
# note this function has side effects
181183
(self.left_join_keys,
182184
self.right_join_keys,
183185
self.join_names) = self._get_merge_keys()
184186

185187
def get_result(self):
188+
if self.indicator:
189+
self.left, self.right = self._indicator_pre_merge(self.left, self.right)
190+
186191
join_index, left_indexer, right_indexer = self._get_join_info()
187192

188193
ldata, rdata = self.left._data, self.right._data
@@ -202,10 +207,44 @@ def get_result(self):
202207
typ = self.left._constructor
203208
result = typ(result_data).__finalize__(self, method='merge')
204209

210+
if self.indicator:
211+
result = self._indicator_post_merge(result)
212+
205213
self._maybe_add_join_keys(result, left_indexer, right_indexer)
206214

207215
return result
208216

217+
def _indicator_pre_merge(self, left, right):
218+
219+
columns = left.columns.values.tolist() + right.columns.values.tolist()
220+
221+
for i in ['_left_indicator', '_right_indicator', '_merge']:
222+
if i in columns:
223+
raise ValueError("Cannot use `indicator=True` option when data contains a column named {}".format(i))
224+
225+
left = left.copy()
226+
right = right.copy()
227+
228+
left['_left_indicator'] = 1
229+
left['_left_indicator'] = left['_left_indicator'].astype('int8')
230+
231+
right['_right_indicator'] = 2
232+
right['_right_indicator'] = right['_right_indicator'].astype('int8')
233+
234+
return left, right
235+
236+
def _indicator_post_merge(self, result):
237+
238+
result['_left_indicator'] = result['_left_indicator'].fillna(0)
239+
result['_right_indicator'] = result['_right_indicator'].fillna(0)
240+
241+
result['_merge'] = Categorical((result['_left_indicator'] + result['_right_indicator']), categories=[1,2,3])
242+
result['_merge'] = result['_merge'].cat.rename_categories(['left_only', 'right_only', 'both'])
243+
244+
result = result.drop(labels=['_left_indicator', '_right_indicator'], axis=1)
245+
246+
return result
247+
209248
def _maybe_add_join_keys(self, result, left_indexer, right_indexer):
210249
# insert group keys
211250

pandas/tools/tests/test_merge.py

+48
Original file line numberDiff line numberDiff line change
@@ -856,6 +856,54 @@ def test_overlapping_columns_error_message(self):
856856

857857
self.assertRaises(ValueError, merge, df, df2)
858858

859+
def test_indicator(self):
    """Check ``pd.merge(..., indicator=True)``.

    Covers the full expected frame for an outer merge, that the inputs
    are left unmodified, which ``_merge`` labels can occur for left,
    right and inner merges, and that a ValueError is raised when an
    input already has one of the working column names.
    """
    left = pd.DataFrame({'col1': [0, 1],
                         'col_left': ['a', 'b'],
                         'col_conflict': [1, 2]})
    left_snapshot = left.copy()

    right = pd.DataFrame({'col1': [1, 2, 3, 4, 5],
                          'col_right': [2, 2, 2, 2, 2],
                          'col_conflict': [1, 2, 3, 4, 5]})
    right_snapshot = right.copy()

    expected = pd.DataFrame(
        {'col1': [0, 1, 2, 3, 4, 5],
         'col_conflict_x': [1, 2] + [np.nan] * 4,
         'col_left': ['a', 'b'] + [np.nan] * 4,
         'col_conflict_y': [np.nan, 1, 2, 3, 4, 5],
         'col_right': [np.nan] + [2] * 5},
        dtype='float64')
    expected['_merge'] = pd.Categorical(
        ['left_only', 'both'] + ['right_only'] * 4,
        categories=['left_only', 'right_only', 'both'])
    expected = expected[['col1', 'col_conflict_x', 'col_left',
                         'col_conflict_y', 'col_right', '_merge']]

    outer = pd.merge(left, right, on='col1', how='outer', indicator=True)
    assert_frame_equal(outer, expected)

    # No side effects
    assert_frame_equal(left, left_snapshot)
    assert_frame_equal(right, right_snapshot)

    # A left merge can never produce a row that exists only on the right.
    left_joined = pd.merge(left, right, on='col1', how='left',
                           indicator=True)
    assert (left_joined['_merge'] != 'right_only').all()

    # A right merge can never produce a row that exists only on the left.
    right_joined = pd.merge(left, right, on='col1', how='right',
                            indicator=True)
    assert (right_joined['_merge'] != 'left_only').all()

    # An inner merge keeps only keys found on both sides.
    inner_joined = pd.merge(left, right, on='col1', how='inner',
                            indicator=True)
    assert (inner_joined['_merge'] == 'both').all()

    # Check if working name in df
    for working_name in ['_right_indicator', '_left_indicator', '_merge']:
        clashing = pd.DataFrame({'col1': [1, 2], working_name: [2, 2]})

        with tm.assertRaises(ValueError):
            pd.merge(left, clashing, on='col1', how='outer',
                     indicator=True)
905+
906+
859907
def _check_merge(x, y):
860908
for how in ['inner', 'left', 'outer']:
861909
result = x.join(y, how=how)

0 commit comments

Comments
 (0)