diff --git a/doc/source/merging.rst b/doc/source/merging.rst index c62647010a131..6aa42dbdd3374 100644 --- a/doc/source/merging.rst +++ b/doc/source/merging.rst @@ -489,9 +489,9 @@ standard database join operations between DataFrame objects: :: - pd.merge(left, right, how='inner', on=None, left_on=None, right_on=None, - left_index=False, right_index=False, sort=True, - suffixes=('_x', '_y'), copy=True) + merge(left, right, how='inner', on=None, left_on=None, right_on=None, + left_index=False, right_index=False, sort=False, + suffixes=('_x', '_y'), copy=True, indicator=False) Here's a description of what each argument is for: @@ -522,6 +522,15 @@ Here's a description of what each argument is for: cases but may improve performance / memory usage. The cases where copying can be avoided are somewhat pathological but this option is provided nonetheless. + - ``indicator``: Add a column to the output DataFrame called ``_merge`` + with information on the source of each row. ``_merge`` is Categorical-type + and takes on a value of ``left_only`` for observations whose merge key + appears only in the ``'left'`` DataFrame, ``right_only`` for observations whose + merge key appears only in the ``'right'`` DataFrame, and ``both`` if the + observation's merge key is found in both. + + .. versionadded:: 0.17.0 + The return type will be the same as ``left``. If ``left`` is a ``DataFrame`` and ``right`` is a subclass of DataFrame, the return type will still be @@ -667,6 +676,36 @@ either the left or right tables, the values in the joined table will be labels=['left', 'right'], vertical=False); plt.close('all'); +.. _merging.indicator: + +The merge indicator +~~~~~~~~~~~~~~~~~~~ + +.. versionadded:: 0.17.0 + +``merge`` now accepts the argument ``indicator``. 
If ``True``, a Categorical-type column called ``_merge`` will be added to the output object that takes on values: + + =================================== ================ + Observation Origin ``_merge`` value + =================================== ================ + Merge key only in ``'left'`` frame ``left_only`` + Merge key only in ``'right'`` frame ``right_only`` + Merge key in both frames ``both`` + =================================== ================ + +.. ipython:: python + + df1 = DataFrame({'col1':[0,1], 'col_left':['a','b']}) + df2 = DataFrame({'col1':[1,2,2],'col_right':[2,2,2]}) + merge(df1, df2, on='col1', how='outer', indicator=True) + +The ``indicator`` argument also accepts a string, in which case the passed string is used as the name of the indicator column instead of ``_merge``. + +.. ipython:: python + + merge(df1, df2, on='col1', how='outer', indicator='indicator_column') + + .. _merging.join.index: Joining on index diff --git a/doc/source/whatsnew/v0.17.0.txt b/doc/source/whatsnew/v0.17.0.txt index eae33bc80be32..98b65c288234e 100644 --- a/doc/source/whatsnew/v0.17.0.txt +++ b/doc/source/whatsnew/v0.17.0.txt @@ -51,6 +51,27 @@ Check the :ref:`API Changes ` and :ref:`deprecations ` + + .. ipython:: python + + df1 = pd.DataFrame({'col1':[0,1], 'col_left':['a','b']}) + df2 = pd.DataFrame({'col1':[1,2,2],'col_right':[2,2,2]}) + pd.merge(df1, df2, on='col1', how='outer', indicator=True) + + + + - ``DataFrame`` has the ``nlargest`` and ``nsmallest`` methods (:issue:`10393`) - SQL io functions now accept a SQLAlchemy connectable. 
(:issue:`7877`) - Enable writing complex values to HDF stores when using table format (:issue:`10447`) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 997dfeb728ade..74397bb95091c 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -115,6 +115,17 @@ side, respectively copy : boolean, default True If False, do not copy data unnecessarily +indicator : boolean or string, default False + If True, adds a column to output DataFrame called "_merge" with + information on the source of each row. + If string, column with information on source of each row will be added to + output DataFrame, and column will be named value of string. + Information column is Categorical-type and takes on a value of "left_only" + for observations whose merge key only appears in 'left' DataFrame, + "right_only" for observations whose merge key only appears in 'right' + DataFrame, and "both" if the observation's merge key is found in both. + + .. versionadded:: 0.17.0 Examples -------- diff --git a/pandas/tools/merge.py b/pandas/tools/merge.py index a8b0d37b55bfe..144fef4914c09 100644 --- a/pandas/tools/merge.py +++ b/pandas/tools/merge.py @@ -27,11 +27,11 @@ @Appender(_merge_doc, indents=0) def merge(left, right, how='inner', on=None, left_on=None, right_on=None, left_index=False, right_index=False, sort=False, - suffixes=('_x', '_y'), copy=True): + suffixes=('_x', '_y'), copy=True, indicator=False): op = _MergeOperation(left, right, how=how, on=on, left_on=left_on, right_on=right_on, left_index=left_index, right_index=right_index, sort=sort, suffixes=suffixes, - copy=copy) + copy=copy, indicator=indicator) return op.get_result() if __debug__: merge.__doc__ = _merge_doc % '\nleft : DataFrame' @@ -157,7 +157,7 @@ class _MergeOperation(object): def __init__(self, left, right, how='inner', on=None, left_on=None, right_on=None, axis=1, left_index=False, right_index=False, sort=True, - suffixes=('_x', '_y'), copy=True): + suffixes=('_x', '_y'), copy=True, indicator=False): 
self.left = self.orig_left = left self.right = self.orig_right = right self.how = how @@ -174,12 +174,25 @@ def __init__(self, left, right, how='inner', on=None, self.left_index = left_index self.right_index = right_index + self.indicator = indicator + + if isinstance(self.indicator, compat.string_types): + self.indicator_name = self.indicator + elif isinstance(self.indicator, bool): + self.indicator_name = '_merge' if self.indicator else None + else: + raise ValueError('indicator option can only accept boolean or string arguments') + + # note this function has side effects (self.left_join_keys, self.right_join_keys, self.join_names) = self._get_merge_keys() def get_result(self): + if self.indicator: + self.left, self.right = self._indicator_pre_merge(self.left, self.right) + join_index, left_indexer, right_indexer = self._get_join_info() ldata, rdata = self.left._data, self.right._data @@ -199,10 +212,46 @@ def get_result(self): typ = self.left._constructor result = typ(result_data).__finalize__(self, method='merge') + if self.indicator: + result = self._indicator_post_merge(result) + self._maybe_add_join_keys(result, left_indexer, right_indexer) return result + def _indicator_pre_merge(self, left, right): + + columns = left.columns.union(right.columns) + + for i in ['_left_indicator', '_right_indicator']: + if i in columns: + raise ValueError("Cannot use `indicator=True` option when data contains a column named {}".format(i)) + if self.indicator_name in columns: + raise ValueError("Cannot use name of an existing column for indicator column") + + left = left.copy() + right = right.copy() + + left['_left_indicator'] = 1 + left['_left_indicator'] = left['_left_indicator'].astype('int8') + + right['_right_indicator'] = 2 + right['_right_indicator'] = right['_right_indicator'].astype('int8') + + return left, right + + def _indicator_post_merge(self, result): + + result['_left_indicator'] = result['_left_indicator'].fillna(0) + result['_right_indicator'] = 
result['_right_indicator'].fillna(0) + + result[self.indicator_name] = Categorical((result['_left_indicator'] + result['_right_indicator']), categories=[1,2,3]) + result[self.indicator_name] = result[self.indicator_name].cat.rename_categories(['left_only', 'right_only', 'both']) + + result = result.drop(labels=['_left_indicator', '_right_indicator'], axis=1) + + return result + def _maybe_add_join_keys(self, result, left_indexer, right_indexer): # insert group keys diff --git a/pandas/tools/tests/test_merge.py b/pandas/tools/tests/test_merge.py index b7b7dd20a2045..9a7888b1b1c47 100644 --- a/pandas/tools/tests/test_merge.py +++ b/pandas/tools/tests/test_merge.py @@ -946,6 +946,85 @@ def test_overlapping_columns_error_message(self): df2.columns = ['key1', 'foo', 'foo'] self.assertRaises(ValueError, merge, df, df2) + def test_indicator(self): + # PR #10054. xref #7412 and closes #8790. + df1 = pd.DataFrame({'col1':[0,1], 'col_left':['a','b'], 'col_conflict':[1,2]}) + df1_copy = df1.copy() + + df2 = pd.DataFrame({'col1':[1,2,3,4,5],'col_right':[2,2,2,2,2], + 'col_conflict':[1,2,3,4,5]}) + df2_copy = df2.copy() + + df_result = pd.DataFrame({'col1':[0,1,2,3,4,5], + 'col_conflict_x':[1,2,np.nan,np.nan,np.nan,np.nan], + 'col_left':['a','b', np.nan,np.nan,np.nan,np.nan], + 'col_conflict_y':[np.nan,1,2,3,4,5], + 'col_right':[np.nan, 2,2,2,2,2]}, + dtype='float64') + df_result['_merge'] = pd.Categorical(['left_only','both','right_only', + 'right_only','right_only','right_only'] + , categories=['left_only', 'right_only', 'both']) + + df_result = df_result[['col1', 'col_conflict_x', 'col_left', + 'col_conflict_y', 'col_right', '_merge' ]] + + test = pd.merge(df1, df2, on='col1', how='outer', indicator=True) + assert_frame_equal(test, df_result) + + # No side effects + assert_frame_equal(df1, df1_copy) + assert_frame_equal(df2, df2_copy) + + # Check with custom name + df_result_custom_name = df_result + df_result_custom_name = 
df_result_custom_name.rename(columns={'_merge':'custom_name'}) + + test_custom_name = pd.merge(df1, df2, on='col1', how='outer', indicator='custom_name') + assert_frame_equal(test_custom_name, df_result_custom_name) + + # Check only accepts strings and booleans + with tm.assertRaises(ValueError): + pd.merge(df1, df2, on='col1', how='outer', indicator=5) + + # Check result integrity + + test2 = pd.merge(df1, df2, on='col1', how='left', indicator=True) + self.assertTrue((test2._merge != 'right_only').all()) + + test3 = pd.merge(df1, df2, on='col1', how='right', indicator=True) + self.assertTrue((test3._merge != 'left_only').all()) + + test4 = pd.merge(df1, df2, on='col1', how='inner', indicator=True) + self.assertTrue((test4._merge == 'both').all()) + + # Check if working name in df + for i in ['_right_indicator', '_left_indicator', '_merge']: + df_badcolumn = pd.DataFrame({'col1':[1,2], i:[2,2]}) + + with tm.assertRaises(ValueError): + pd.merge(df1, df_badcolumn, on='col1', how='outer', indicator=True) + + # Check for name conflict with custom name + df_badcolumn = pd.DataFrame({'col1':[1,2], 'custom_column_name':[2,2]}) + + with tm.assertRaises(ValueError): + pd.merge(df1, df_badcolumn, on='col1', how='outer', indicator='custom_column_name') + + # Merge on multiple columns + df3 = pd.DataFrame({'col1':[0,1], 'col2':['a','b']}) + + df4 = pd.DataFrame({'col1':[1,1,3], 'col2':['b','x','y']}) + + hand_coded_result = pd.DataFrame({'col1':[0,1,1,3.0], + 'col2':['a','b','x','y']}) + hand_coded_result['_merge'] = pd.Categorical( + ['left_only','both','right_only','right_only'] + , categories=['left_only', 'right_only', 'both']) + + test5 = pd.merge(df3, df4, on=['col1', 'col2'], how='outer', indicator=True) + assert_frame_equal(test5, hand_coded_result) + + def _check_merge(x, y): for how in ['inner', 'left', 'outer']: result = x.join(y, how=how)