add validate argument to merge

Nick Eubank · Nick Eubank · commit b2e87ea10cb4 · 2017-05-21T09:58:22.000-07:00
diff --git a/doc/source/merging.rst b/doc/source/merging.rst
@@ -513,7 +513,8 @@ standard database join operations between DataFrame objects:
 
     pd.merge(left, right, how='inner', on=None, left_on=None, right_on=None,
              left_index=False, right_index=False, sort=True,
-             suffixes=('_x', '_y'), copy=True, indicator=False)
+             suffixes=('_x', '_y'), copy=True, indicator=False,
+             validate=None)
 
 - ``left``: A DataFrame object
 - ``right``: Another DataFrame object
@@ -551,6 +552,21 @@ standard database join operations between DataFrame objects:
 
   .. versionadded:: 0.17.0
 
+- ``validate`` : string, default None
+  If specified, checks if merge is of specified type.
+
+  * "one_to_one" or "1:1": checks if merge keys are unique in both
+    left and right datasets.
+  * "one_to_many" or "1:m": checks if merge keys are unique in left
+    dataset.
+  * "many_to_one" or "m:1": checks if merge keys are unique in right
+    dataset.
+  * "many_to_many" or "m:m": allowed, but does not result in checks. 
+
+
+  .. versionadded:: 0.21.0
+
+
 The return type will be the same as ``left``. If ``left`` is a ``DataFrame``
 and ``right`` is a subclass of DataFrame, the return type will still be
 ``DataFrame``.
@@ -711,10 +727,42 @@ Here is another example with duplicate join keys in DataFrames:
           labels=['left', 'right'], vertical=False);
    plt.close('all');
 
+
 .. warning::
 
-  Joining / merging on duplicate keys can cause a returned frame that is the multiplication of the row dimensions,
-  may result in memory overflow. It is the user' s responsibility to manage duplicate values in keys before joining large DataFrames.
+  Joining / merging on duplicate keys can return a frame whose number of rows is the product of the row dimensions, which may result in memory overflow. It is the user's responsibility to manage duplicate values in keys before joining large DataFrames.
+
+.. _merging.validation:
+
+Checking for duplicate keys
+~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. versionadded:: 0.21.0
+
+Users can use the ``validate`` argument to automatically check whether there are unexpected duplicates in their merge keys. Key uniqueness is checked before the merge is performed, so it should protect against memory overflows. Checking key uniqueness is also a good way to ensure user data structures are as expected.
+
+In the following example, there are duplicate values of ``B`` in the right DataFrame. As this is not a one-to-one merge -- as specified in the ``validate`` argument -- an exception will be raised.
+
+
+.. ipython:: python
+
+  left = pd.DataFrame({'A' : [1,2], 'B' : [1, 2]})
+  right = pd.DataFrame({'A' : [4,5,6], 'B': [2, 2, 2]})
+
+.. code-block:: python
+
+  In  [53]: result = pd.merge(left, right, on='B', how='outer', validate="one_to_one")  
+  Out [53]:
+  ---------------------------------------------------------------------------
+  
+  ValueError: Merge keys are not unique in right dataset; not a one-to-one merge
+
+If the user is aware of the duplicates in the right ``DataFrame`` but wants to ensure there are no duplicates in the left ``DataFrame``, one can use the ``validate='one_to_many'`` argument instead, which will not raise an exception.
+
+.. ipython:: python
+
+   pd.merge(left, right, on='B', how='outer', validate="one_to_many")
+
 
 .. _merging.indicator:
 
diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt
@@ -25,16 +25,15 @@ New features
 - Added `__fspath__` method to :class`:pandas.HDFStore`, :class:`pandas.ExcelFile`,
   and :class:`pandas.ExcelWriter` to work properly with the file system path protocol (:issue:`13823`)
 
-
 .. _whatsnew_0210.enhancements.other:
 
 Other Enhancements
 ^^^^^^^^^^^^^^^^^^
+
+- :func:`merge` has gained a ``validate`` argument that checks whether a merge is one-to-one, one-to-many, many-to-one, or many-to-many. If the merge is found not to be of the specified type, an exception will be raised. For more, see :ref:`here <merging.validation>` (:issue:`16270`)
 - ``Series.to_dict()`` and ``DataFrame.to_dict()`` now support an ``into`` keyword which allows you to specify the ``collections.Mapping`` subclass that you would like returned.  The default is ``dict``, which is backwards compatible. (:issue:`16122`)
 - ``RangeIndex.append`` now returns a ``RangeIndex`` object when possible (:issue:`16212`)
-
-- :func:`to_pickle` has gained a protocol parameter (:issue:`16252`). By default,
-this parameter is set to `HIGHEST_PROTOCOL <https://docs.python.org/3/library/pickle.html#data-stream-format>`__
+- :func:`to_pickle` has gained a protocol parameter (:issue:`16252`). By default, this parameter is set to `HIGHEST_PROTOCOL <https://docs.python.org/3/library/pickle.html#data-stream-format>`__
 
 .. _whatsnew_0210.api_breaking:
 
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -175,6 +175,19 @@
 
     .. versionadded:: 0.17.0
 
+validate : string, default None
+    If specified, checks if merge is of specified type.
+
+    * "one_to_one" or "1:1": check if merge keys are unique in both
+      left and right datasets.
+    * "one_to_many" or "1:m": check if merge keys are unique in left
+      dataset.
+    * "many_to_one" or "m:1": check if merge keys are unique in right
+      dataset.
+    * "many_to_many" or "m:m": allowed, but does not result in checks.
+
+    .. versionadded:: 0.21.0
+
 Examples
 --------
 
@@ -4868,12 +4881,13 @@ def _join_compat(self, other, on=None, how='left', lsuffix='', rsuffix='',
     @Appender(_merge_doc, indents=2)
     def merge(self, right, how='inner', on=None, left_on=None, right_on=None,
               left_index=False, right_index=False, sort=False,
-              suffixes=('_x', '_y'), copy=True, indicator=False):
+              suffixes=('_x', '_y'), copy=True, indicator=False,
+              validate=None):
         from pandas.core.reshape.merge import merge
         return merge(self, right, how=how, on=on, left_on=left_on,
                      right_on=right_on, left_index=left_index,
                      right_index=right_index, sort=sort, suffixes=suffixes,
-                     copy=copy, indicator=indicator)
+                     copy=copy, indicator=indicator, validate=validate)
 
     def round(self, decimals=0, *args, **kwargs):
         """
diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py
@@ -46,11 +46,13 @@
 @Appender(_merge_doc, indents=0)
 def merge(left, right, how='inner', on=None, left_on=None, right_on=None,
           left_index=False, right_index=False, sort=False,
-          suffixes=('_x', '_y'), copy=True, indicator=False):
+          suffixes=('_x', '_y'), copy=True, indicator=False,
+          validate=None):
     op = _MergeOperation(left, right, how=how, on=on, left_on=left_on,
                          right_on=right_on, left_index=left_index,
                          right_index=right_index, sort=sort, suffixes=suffixes,
-                         copy=copy, indicator=indicator)
+                         copy=copy, indicator=indicator,
+                         validate=validate)
     return op.get_result()
 
 
@@ -341,6 +343,7 @@ def merge_asof(left, right, on=None,
 
         .. versionadded:: 0.20.0
 
+
     Returns
     -------
     merged : DataFrame
@@ -504,7 +507,8 @@ class _MergeOperation(object):
     def __init__(self, left, right, how='inner', on=None,
                  left_on=None, right_on=None, axis=1,
                  left_index=False, right_index=False, sort=True,
-                 suffixes=('_x', '_y'), copy=True, indicator=False):
+                 suffixes=('_x', '_y'), copy=True, indicator=False,
+                 validate=None):
         self.left = self.orig_left = left
         self.right = self.orig_right = right
         self.how = how
@@ -567,6 +571,12 @@ def __init__(self, left, right, how='inner', on=None,
         # to avoid incompat dtypes
         self._maybe_coerce_merge_keys()
 
+        # If argument passed to validate,
+        # check if columns specified as unique
+        # are in fact unique.
+        if validate is not None:
+            self._validate(validate)
+
     def get_result(self):
         if self.indicator:
             self.left, self.right = self._indicator_pre_merge(
@@ -958,6 +968,49 @@ def _validate_specification(self):
         if len(self.right_on) != len(self.left_on):
             raise ValueError("len(right_on) must equal len(left_on)")
 
+    def _validate(self, validate):
+
+        # Check uniqueness of the merge keys on each side
+        if self.left_index:
+            left_unique = self.orig_left.index.is_unique
+        else:
+            left_unique = MultiIndex.from_arrays(self.left_join_keys
+                                                 ).is_unique
+
+        if self.right_index:
+            right_unique = self.orig_right.index.is_unique
+        else:
+            right_unique = MultiIndex.from_arrays(self.right_join_keys
+                                                  ).is_unique
+
+        # Check data integrity
+        if validate in ["one_to_one", "1:1"]:
+            if not left_unique and not right_unique:
+                raise ValueError("Merge keys are not unique in either left"
+                                 " or right dataset; not a one-to-one merge")
+            elif not left_unique:
+                raise ValueError("Merge keys are not unique in left dataset;"
+                                 " not a one-to-one merge")
+            elif not right_unique:
+                raise ValueError("Merge keys are not unique in right dataset;"
+                                 " not a one-to-one merge")
+
+        elif validate in ["one_to_many", "1:m"]:
+            if not left_unique:
+                raise ValueError("Merge keys are not unique in left dataset;"
+                                 " not a one-to-many merge")
+
+        elif validate in ["many_to_one", "m:1"]:
+            if not right_unique:
+                raise ValueError("Merge keys are not unique in right dataset;"
+                                 " not a many-to-one merge")
+
+        elif validate in ['many_to_many', 'm:m']:
+            pass
+
+        else:
+            raise ValueError("Not a valid argument for validate")
+
 
 def _get_join_indexers(left_keys, right_keys, sort=False, how='inner',
                        **kwargs):
diff --git a/pandas/tests/reshape/test_merge.py b/pandas/tests/reshape/test_merge.py
@@ -724,6 +724,130 @@ def test_indicator(self):
                           how='outer', indicator=True)
         assert_frame_equal(test5, hand_coded_result)
 
+    def test_validation(self):
+        left = DataFrame({'a': ['a', 'b', 'c', 'd'],
+                          'b': ['cat', 'dog', 'weasel', 'horse']},
+                         index=range(4))
+
+        right = DataFrame({'a': ['a', 'b', 'c', 'd', 'e'],
+                           'c': ['meow', 'bark', 'um... weasel noise?',
+                                 'nay', 'chirp']},
+                          index=range(5))
+
+        # Make sure no side effects.
+        left_copy = left.copy()
+        right_copy = right.copy()
+
+        result = merge(left, right, left_index=True, right_index=True,
+                       validate='1:1')
+        assert_frame_equal(left, left_copy)
+        assert_frame_equal(right, right_copy)
+
+        # make sure merge still correct
+        expected = DataFrame({'a_x': ['a', 'b', 'c', 'd'],
+                              'b': ['cat', 'dog', 'weasel', 'horse'],
+                              'a_y': ['a', 'b', 'c', 'd'],
+                              'c': ['meow', 'bark', 'um... weasel noise?',
+                                    'nay']},
+                             index=range(4),
+                             columns=['a_x', 'b', 'a_y', 'c'])
+
+        result = merge(left, right, left_index=True, right_index=True,
+                       validate='one_to_one')
+        assert_frame_equal(result, expected)
+
+        expected_2 = DataFrame({'a': ['a', 'b', 'c', 'd'],
+                                'b': ['cat', 'dog', 'weasel', 'horse'],
+                                'c': ['meow', 'bark', 'um... weasel noise?',
+                                      'nay']},
+                               index=range(4))
+
+        result = merge(left, right, on='a', validate='1:1')
+        assert_frame_equal(left, left_copy)
+        assert_frame_equal(right, right_copy)
+        assert_frame_equal(result, expected_2)
+
+        result = merge(left, right, on='a', validate='one_to_one')
+        assert_frame_equal(result, expected_2)
+
+        # One index, one column
+        expected_3 = DataFrame({'b': ['cat', 'dog', 'weasel', 'horse'],
+                                'a': ['a', 'b', 'c', 'd'],
+                                'c': ['meow', 'bark', 'um... weasel noise?',
+                                      'nay']},
+                               columns=['b', 'a', 'c'],
+                               index=range(4))
+
+        left_index_reset = left.set_index('a')
+        result = merge(left_index_reset, right, left_index=True,
+                       right_on='a', validate='one_to_one')
+        assert_frame_equal(result, expected_3)
+
+        # Dups on right
+        right_w_dups = right.append(pd.DataFrame({'a': ['e'], 'c': ['moo']},
+                                    index=[4]))
+        merge(left, right_w_dups, left_index=True, right_index=True,
+              validate='one_to_many')
+
+        with pytest.raises(ValueError):
+            merge(left, right_w_dups, left_index=True, right_index=True,
+                  validate='one_to_one')
+
+        with pytest.raises(ValueError):
+            merge(left, right_w_dups, on='a', validate='one_to_one')
+
+        # Dups on left
+        left_w_dups = left.append(pd.DataFrame({'a': ['a'], 'c': ['cow']},
+                                               index=[3]))
+        merge(left_w_dups, right, left_index=True, right_index=True,
+              validate='many_to_one')
+
+        with pytest.raises(ValueError):
+            merge(left_w_dups, right, left_index=True, right_index=True,
+                  validate='one_to_one')
+
+        with pytest.raises(ValueError):
+            merge(left_w_dups, right, on='a', validate='one_to_one')
+
+        # Dups on both
+        merge(left_w_dups, right_w_dups, on='a', validate='many_to_many')
+
+        with pytest.raises(ValueError):
+            merge(left_w_dups, right_w_dups, left_index=True,
+                  right_index=True, validate='many_to_one')
+
+        with pytest.raises(ValueError):
+            merge(left_w_dups, right_w_dups, on='a',
+                  validate='one_to_many')
+
+        # Check invalid arguments
+        with pytest.raises(ValueError):
+            merge(left, right, on='a', validate='jibberish')
+
+        # Two column merge, dups in both, but jointly no dups.
+        left = DataFrame({'a': ['a', 'a', 'b', 'b'],
+                          'b': [0, 1, 0, 1],
+                          'c': ['cat', 'dog', 'weasel', 'horse']},
+                         index=range(4))
+
+        right = DataFrame({'a': ['a', 'a', 'b'],
+                           'b': [0, 1, 0],
+                           'd': ['meow', 'bark', 'um... weasel noise?']},
+                          index=range(3))
+
+        expected_multi = DataFrame({'a': ['a', 'a', 'b'],
+                                    'b': [0, 1, 0],
+                                    'c': ['cat', 'dog', 'weasel'],
+                                    'd': ['meow', 'bark',
+                                          'um... weasel noise?']},
+                                   index=range(3))
+
+        with pytest.raises(ValueError):
+            merge(left, right, on='a', validate='1:1')
+
+        result = merge(left, right, on=['a', 'b'], validate='1:1')
+        assert_frame_equal(result, expected_multi)
+
 
 def _check_merge(x, y):
     for how in ['inner', 'left', 'outer']: