Merge pull request #10054 from nickeubank/merge_indicator

jreback · jreback · commit 4b9606bf62de · 2015-09-03T20:51:47.000-04:00
ENH: Create merge indicator for obs from left, right, or both
diff --git a/doc/source/merging.rst b/doc/source/merging.rst
@@ -506,9 +506,9 @@ standard database join operations between DataFrame objects:
 
 ::
 
-    pd.merge(left, right, how='inner', on=None, left_on=None, right_on=None,
-             left_index=False, right_index=False, sort=True,
-             suffixes=('_x', '_y'), copy=True)
+    merge(left, right, how='inner', on=None, left_on=None, right_on=None,
+          left_index=False, right_index=False, sort=False,
+          suffixes=('_x', '_y'), copy=True, indicator=False)
 
 Here's a description of what each argument is for:
 
@@ -539,6 +539,15 @@ Here's a description of what each argument is for:
     cases but may improve performance / memory usage. The cases where copying
     can be avoided are somewhat pathological but this option is provided
     nonetheless.
+  - ``indicator``: Add a column to the output DataFrame called ``_merge``
+    with information on the source of each row. ``_merge`` is Categorical-type
+    and takes on a value of ``left_only`` for observations whose merge key
+    only appears in the ``'left'`` DataFrame, ``right_only`` for observations
+    whose merge key only appears in the ``'right'`` DataFrame, and ``both`` if
+    the observation's merge key is found in both.
+    
+    .. versionadded:: 0.17.0
+
 
 The return type will be the same as ``left``. If ``left`` is a ``DataFrame``
 and ``right`` is a subclass of DataFrame, the return type will still be
@@ -684,6 +693,36 @@ either the left or right tables, the values in the joined table will be
           labels=['left', 'right'], vertical=False);
    plt.close('all');
 
+.. _merging.indicator:
+
+The merge indicator
+~~~~~~~~~~~~~~~~~~~
+
+.. versionadded:: 0.17.0
+
+``merge`` now accepts the argument ``indicator``. If ``True``, a Categorical-type column called ``_merge`` will be added to the output object that takes on values:
+
+  ===================================   ================
+  Observation Origin                    ``_merge`` value
+  ===================================   ================
+  Merge key only in ``'left'`` frame    ``left_only``
+  Merge key only in ``'right'`` frame   ``right_only``
+  Merge key in both frames              ``both``
+  ===================================   ================
+
+.. ipython:: python
+
+   df1 = DataFrame({'col1':[0,1], 'col_left':['a','b']})
+   df2 = DataFrame({'col1':[1,2,2],'col_right':[2,2,2]})
+   merge(df1, df2, on='col1', how='outer', indicator=True)
+
+The ``indicator`` argument also accepts a string, in which case the indicator column is named with the value of the passed string.
+
+.. ipython:: python
+
+   merge(df1, df2, on='col1', how='outer', indicator='indicator_column')
+
+
 .. _merging.join.index:
 
 Joining on index
diff --git a/doc/source/whatsnew/v0.17.0.txt b/doc/source/whatsnew/v0.17.0.txt
@@ -51,6 +51,27 @@ Check the :ref:`API Changes <whatsnew_0170.api>` and :ref:`deprecations <whatsne
 New features
 ~~~~~~~~~~~~
 
+- ``merge`` now accepts the argument ``indicator`` which adds a Categorical-type column (by default called ``_merge``) to the output object that takes on the values:
+
+  ===================================   ================
+  Observation Origin                    ``_merge`` value
+  ===================================   ================
+  Merge key only in ``'left'`` frame    ``left_only``
+  Merge key only in ``'right'`` frame   ``right_only``
+  Merge key in both frames              ``both``
+  ===================================   ================
+
+  For more, see the :ref:`updated docs <merging.indicator>`.
+
+  .. ipython:: python
+
+    df1 = pd.DataFrame({'col1':[0,1], 'col_left':['a','b']})
+    df2 = pd.DataFrame({'col1':[1,2,2],'col_right':[2,2,2]})
+    pd.merge(df1, df2, on='col1', how='outer', indicator=True)
+
+
+
+
 - ``DataFrame`` has the ``nlargest`` and ``nsmallest`` methods (:issue:`10393`)
 - SQL io functions now accept a SQLAlchemy connectable. (:issue:`7877`)
 - Enable writing complex values to HDF stores when using table format (:issue:`10447`)
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -115,6 +115,17 @@
     side, respectively
 copy : boolean, default True
     If False, do not copy data unnecessarily
+indicator : boolean or string, default False
+    If True, adds a column to the output DataFrame called "_merge" with
+    information on the source of each row.
+    If string, a column with information on the source of each row is added
+    to the output DataFrame, named with the value of the string.
+    The information column is Categorical-type and takes on a value of
+    "left_only" for observations whose merge key only appears in the 'left'
+    DataFrame, "right_only" for observations whose merge key only appears in
+    the 'right' DataFrame, and "both" if the merge key is found in both.
+
+    .. versionadded:: 0.17.0
 
 Examples
 --------
diff --git a/pandas/tools/merge.py b/pandas/tools/merge.py
@@ -27,11 +27,11 @@
 @Appender(_merge_doc, indents=0)
 def merge(left, right, how='inner', on=None, left_on=None, right_on=None,
           left_index=False, right_index=False, sort=False,
-          suffixes=('_x', '_y'), copy=True):
+          suffixes=('_x', '_y'), copy=True, indicator=False):
     op = _MergeOperation(left, right, how=how, on=on, left_on=left_on,
                          right_on=right_on, left_index=left_index,
                          right_index=right_index, sort=sort, suffixes=suffixes,
-                         copy=copy)
+                         copy=copy, indicator=indicator)
     return op.get_result()
 if __debug__:
     merge.__doc__ = _merge_doc % '\nleft : DataFrame'
@@ -157,7 +157,7 @@ class _MergeOperation(object):
     def __init__(self, left, right, how='inner', on=None,
                  left_on=None, right_on=None, axis=1,
                  left_index=False, right_index=False, sort=True,
-                 suffixes=('_x', '_y'), copy=True):
+                 suffixes=('_x', '_y'), copy=True, indicator=False):
         self.left = self.orig_left = left
         self.right = self.orig_right = right
         self.how = how
@@ -174,12 +174,25 @@ def __init__(self, left, right, how='inner', on=None,
         self.left_index = left_index
         self.right_index = right_index
 
+        self.indicator = indicator
+
+        if isinstance(self.indicator, compat.string_types):
+            self.indicator_name = self.indicator
+        elif isinstance(self.indicator, bool):
+            self.indicator_name = '_merge' if self.indicator else None
+        else:
+            raise ValueError('indicator option can only accept boolean or string arguments')
+
+
         # note this function has side effects
         (self.left_join_keys,
          self.right_join_keys,
          self.join_names) = self._get_merge_keys()
 
     def get_result(self):
+        if self.indicator:
+            self.left, self.right = self._indicator_pre_merge(self.left, self.right)
+
         join_index, left_indexer, right_indexer = self._get_join_info()
 
         ldata, rdata = self.left._data, self.right._data
@@ -199,10 +212,46 @@ def get_result(self):
         typ = self.left._constructor
         result = typ(result_data).__finalize__(self, method='merge')
 
+        if self.indicator:
+            result = self._indicator_post_merge(result)
+
         self._maybe_add_join_keys(result, left_indexer, right_indexer)
 
         return result
 
+    def _indicator_pre_merge(self, left, right):
+        # Tag copies of both inputs with temporary marker columns before joining.
+        columns = left.columns.union(right.columns)  
+        # Refuse to run if a temporary or final indicator name is already in use.
+        for i in ['_left_indicator', '_right_indicator']:
+            if i in columns:
+                raise ValueError("Cannot use `indicator=True` option when data contains a column named {}".format(i))
+        if self.indicator_name in columns:
+            raise ValueError("Cannot use name of an existing column for indicator column")
+        # Work on copies so the marker columns are never added to the caller's frames.
+        left = left.copy()
+        right = right.copy()
+        # Left rows are marked 1 and right rows 2; both markers are cast to int8.
+        left['_left_indicator'] = 1  
+        left['_left_indicator'] = left['_left_indicator'].astype('int8')  
+        
+        right['_right_indicator'] = 2     
+        right['_right_indicator'] = right['_right_indicator'].astype('int8') 
+        
+        return left, right
+
+    def _indicator_post_merge(self, result):
+        # Fill missing marker values with 0 so the two markers can be summed.
+        result['_left_indicator'] = result['_left_indicator'].fillna(0)
+        result['_right_indicator'] = result['_right_indicator'].fillna(0)
+        # The sum is 1, 2 or 3; those categories become left_only, right_only, both.
+        result[self.indicator_name] = Categorical((result['_left_indicator'] + result['_right_indicator']), categories=[1,2,3])
+        result[self.indicator_name] = result[self.indicator_name].cat.rename_categories(['left_only', 'right_only', 'both'])        
+        # Drop the temporary marker columns from the returned frame.
+        result = result.drop(labels=['_left_indicator', '_right_indicator'], axis=1)
+
+        return result
+
     def _maybe_add_join_keys(self, result, left_indexer, right_indexer):
         # insert group keys
 
diff --git a/pandas/tools/tests/test_merge.py b/pandas/tools/tests/test_merge.py
@@ -946,6 +946,85 @@ def test_overlapping_columns_error_message(self):
         df2.columns = ['key1', 'foo', 'foo']
         self.assertRaises(ValueError, merge, df, df2)
 
+    def test_indicator(self):
+        # PR #10054. xref #7412 and closes #8790.
+        df1 = pd.DataFrame({'col1':[0,1], 'col_left':['a','b'], 'col_conflict':[1,2]})
+        df1_copy = df1.copy()
+
+        df2 = pd.DataFrame({'col1':[1,2,3,4,5],'col_right':[2,2,2,2,2], 
+                            'col_conflict':[1,2,3,4,5]})
+        df2_copy = df2.copy()
+        # Expected outer merge; the shared 'col_conflict' column appears with _x/_y suffixes.
+        df_result = pd.DataFrame({'col1':[0,1,2,3,4,5], 
+                'col_conflict_x':[1,2,np.nan,np.nan,np.nan,np.nan],
+                'col_left':['a','b', np.nan,np.nan,np.nan,np.nan], 
+                'col_conflict_y':[np.nan,1,2,3,4,5], 
+                'col_right':[np.nan, 2,2,2,2,2]},
+                dtype='float64')
+        df_result['_merge'] = pd.Categorical(['left_only','both','right_only',
+            'right_only','right_only','right_only']
+            , categories=['left_only', 'right_only', 'both'])
+
+        df_result = df_result[['col1', 'col_conflict_x', 'col_left', 
+                               'col_conflict_y', 'col_right', '_merge' ]]
+        # indicator=True names the indicator column '_merge'.
+        test = pd.merge(df1, df2, on='col1', how='outer', indicator=True)
+        assert_frame_equal(test, df_result)
+
+        # No side effects: the input frames must be unchanged by the merge
+        assert_frame_equal(df1, df1_copy)
+        assert_frame_equal(df2, df2_copy)
+
+        # Check with custom name
+        df_result_custom_name = df_result
+        df_result_custom_name = df_result_custom_name.rename(columns={'_merge':'custom_name'})
+
+        test_custom_name = pd.merge(df1, df2, on='col1', how='outer', indicator='custom_name')
+        assert_frame_equal(test_custom_name, df_result_custom_name)
+
+        # Check that indicator only accepts strings and booleans
+        with tm.assertRaises(ValueError):
+            pd.merge(df1, df2, on='col1', how='outer', indicator=5)
+
+        # Check result integrity
+        # left join: no 'right_only'; right join: no 'left_only'; inner join: only 'both'
+        test2 = pd.merge(df1, df2, on='col1', how='left', indicator=True)
+        self.assertTrue((test2._merge != 'right_only').all())
+
+        test3 = pd.merge(df1, df2, on='col1', how='right', indicator=True)
+        self.assertTrue((test3._merge != 'left_only').all())
+
+        test4 = pd.merge(df1, df2, on='col1', how='inner', indicator=True)
+        self.assertTrue((test4._merge == 'both').all())
+
+        # Check that an input column named like a temporary or default indicator column raises
+        for i in ['_right_indicator', '_left_indicator', '_merge']:
+            df_badcolumn = pd.DataFrame({'col1':[1,2], i:[2,2]})
+        
+            with tm.assertRaises(ValueError):
+                pd.merge(df1, df_badcolumn, on='col1', how='outer', indicator=True)
+
+        # Check for name conflict with custom name
+        df_badcolumn = pd.DataFrame({'col1':[1,2], 'custom_column_name':[2,2]})
+        
+        with tm.assertRaises(ValueError):
+            pd.merge(df1, df_badcolumn, on='col1', how='outer', indicator='custom_column_name')
+
+        # Merge on multiple columns
+        df3 = pd.DataFrame({'col1':[0,1], 'col2':['a','b']})
+
+        df4 = pd.DataFrame({'col1':[1,1,3], 'col2':['b','x','y']})
+
+        hand_coded_result = pd.DataFrame({'col1':[0,1,1,3.0], 
+                                         'col2':['a','b','x','y']})
+        hand_coded_result['_merge'] = pd.Categorical(
+            ['left_only','both','right_only','right_only']
+            , categories=['left_only', 'right_only', 'both'])
+ 
+        test5 = pd.merge(df3, df4, on=['col1', 'col2'], how='outer', indicator=True)
+        assert_frame_equal(test5, hand_coded_result)
+    
+
 def _check_merge(x, y):
     for how in ['inner', 'left', 'outer']:
         result = x.join(y, how=how)