add validate argument to merge

Nick Eubank · Nick Eubank · commit 496e915645d8 · 2017-05-07T18:12:40.000-07:00
diff --git a/doc/source/merging.rst b/doc/source/merging.rst
@@ -513,7 +513,8 @@ standard database join operations between DataFrame objects:
 
     pd.merge(left, right, how='inner', on=None, left_on=None, right_on=None,
              left_index=False, right_index=False, sort=True,
-             suffixes=('_x', '_y'), copy=True, indicator=False)
+             suffixes=('_x', '_y'), copy=True, indicator=False,
+             validate=None)
 
 - ``left``: A DataFrame object
 - ``right``: Another DataFrame object
@@ -551,6 +552,18 @@ standard database join operations between DataFrame objects:
 
   .. versionadded:: 0.17.0
 
+- ``validate`` : {None, '1:1', '1:m', 'm:1', 'm:m', "one_to_one", "one_to_many", "many_to_one", "many_to_many"}, default None
+  If specified, checks if merge is of specified type.
+
+  * "one_to_one" or "1:1": check if merge keys are unique in both
+    left and right datasets.
+  * "one_to_many" or "1:m": check if merge keys are unique in left dataset.
+  * "many_to_one" or "m:1": check if merge keys are unique in right dataset.
+  * "many_to_many" or "m:m": allowed, but does not result in checks.
+
+  .. versionadded:: 0.21.0
+
+
 The return type will be the same as ``left``. If ``left`` is a ``DataFrame``
 and ``right`` is a subclass of DataFrame, the return type will still be
 ``DataFrame``.
@@ -711,10 +724,43 @@ Here is another example with duplicate join keys in DataFrames:
           labels=['left', 'right'], vertical=False);
    plt.close('all');
 
+
 .. warning::
 
-  Joining / merging on duplicate keys can cause a returned frame that is the multiplication of the row dimensions,
-  may result in memory overflow. It is the user' s responsibility to manage duplicate values in keys before joining large DataFrames.
+  Joining / merging on duplicate keys can cause a returned frame that is the multiplication of the row dimensions, which may result in memory overflow. It is the user's responsibility to manage duplicate values in keys before joining large DataFrames.
+
+.. _merging.validation:
+
+Checking for duplicate keys
+~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. versionadded:: 0.21.0
+
+Users can use the ``validate`` argument to automatically check whether there are unexpected duplicates in their merge keys. Key uniqueness is checked before merge operations and so should protect against memory overflows. Checking key uniqueness is also a good way to ensure user data structures are as expected. 
+
+In the following example, there are duplicate values of ``B`` in the right DataFrame. As this is not a one-to-one merge -- as specified in the ``validate`` argument -- an exception will be raised.
+
+.. code-block:: python
+
+  left = pd.DataFrame({'A' : [1,2], 'B' : [1, 2]})
+  right = pd.DataFrame({'A' : [4,5,6], 'B': [2, 2, 2]})
+  result = pd.merge(left, right, on='B', how='outer', validate="one_to_one");
+  
+  ValueError: Merge keys are not unique in either left or right dataset; not a one-to-one merge
+
+
+If the user is aware of the duplicates in the right ``DataFrame`` but wants to ensure there are no duplicates in the left ``DataFrame``, one can use the ``validate='one_to_many'`` argument instead, which will not raise an exception.
+
+.. ipython:: python
+   :suppress:
+
+   left = pd.DataFrame({'A' : [1,2], 'B' : [1, 2]})
+   right = pd.DataFrame({'A' : [4,5,6], 'B': [2, 2, 2]})
+
+.. ipython:: python
+
+   pd.merge(left, right, on='B', how='outer', validate="one_to_many")
+
 
 .. _merging.indicator:
 
diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt
@@ -20,13 +20,20 @@ Check the :ref:`API Changes <whatsnew_0210.api_breaking>` and :ref:`deprecations
 New features
 ~~~~~~~~~~~~
 
-
-
 .. _whatsnew_0210.enhancements.other:
 
 Other Enhancements
 ^^^^^^^^^^^^^^^^^^
 
+.. _whatsnew_0210.enhancements.other.merge_validate:
+
+``validate`` argument checks merge key uniqueness
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The new ``validate`` argument of :func:`merge` checks whether a merge is
+one-to-one, one-to-many, many-to-one, or many-to-many. If the merge is found not
+to be of the specified type, a ``ValueError`` is raised. (:issue:`16270`)
+
 
 
 .. _whatsnew_0210.api_breaking:
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -174,6 +174,18 @@
 
     .. versionadded:: 0.17.0
 
+validate : {None, '1:1', '1:m', 'm:1', 'm:m', "one_to_one", "one_to_many",
+            "many_to_one", "many_to_many"}, default None
+    If specified, checks if merge is of specified type.
+
+    * "one_to_one" or "1:1": check if merge keys are unique in both
+      left and right datasets.
+    * "one_to_many" or "1:m": check if merge keys are unique in left dataset.
+    * "many_to_one" or "m:1": check if merge keys are unique in right dataset.
+    * "many_to_many" or "m:m": allowed, but does not result in checks.
+
+    .. versionadded:: 0.21.0
+
 Examples
 --------
 
@@ -4812,12 +4824,13 @@ def _join_compat(self, other, on=None, how='left', lsuffix='', rsuffix='',
     @Appender(_merge_doc, indents=2)
     def merge(self, right, how='inner', on=None, left_on=None, right_on=None,
               left_index=False, right_index=False, sort=False,
-              suffixes=('_x', '_y'), copy=True, indicator=False):
+              suffixes=('_x', '_y'), copy=True, indicator=False,
+              validate=None):
         from pandas.core.reshape.merge import merge
         return merge(self, right, how=how, on=on, left_on=left_on,
                      right_on=right_on, left_index=left_index,
                      right_index=right_index, sort=sort, suffixes=suffixes,
-                     copy=copy, indicator=indicator)
+                     copy=copy, indicator=indicator, validate=validate)
 
     def round(self, decimals=0, *args, **kwargs):
         """
diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py
@@ -46,11 +46,13 @@
 @Appender(_merge_doc, indents=0)
 def merge(left, right, how='inner', on=None, left_on=None, right_on=None,
           left_index=False, right_index=False, sort=False,
-          suffixes=('_x', '_y'), copy=True, indicator=False):
+          suffixes=('_x', '_y'), copy=True, indicator=False,
+          validate=None):
     op = _MergeOperation(left, right, how=how, on=on, left_on=left_on,
                          right_on=right_on, left_index=left_index,
                          right_index=right_index, sort=sort, suffixes=suffixes,
-                         copy=copy, indicator=indicator)
+                         copy=copy, indicator=indicator,
+                         validate=validate)
     return op.get_result()
 
 
@@ -263,7 +265,8 @@ def merge_asof(left, right, on=None,
                suffixes=('_x', '_y'),
                tolerance=None,
                allow_exact_matches=True,
-               direction='backward'):
+               direction='backward',
+               validate=None):
     """Perform an asof merge. This is similar to a left-join except that we
     match on nearest key rather than equal keys.
 
@@ -341,6 +344,19 @@ def merge_asof(left, right, on=None,
 
         .. versionadded:: 0.20.0
 
+    validate : {None, '1:1', '1:m', 'm:1', 'm:m', "one_to_one", "one_to_many",
+                "many_to_one", "many_to_many"}, default None
+        If specified, checks if merge is of specified type.
+
+        * "one_to_one" or "1:1": check if merge keys are unique in both
+          left and right datasets.
+        * "one_to_many" or "1:m": check if merge keys are unique in left
+          dataset.
+        * "many_to_one" or "m:1": check if merge keys are unique in right
+          dataset.
+
+        .. versionadded:: 0.21.0
+
     Returns
     -------
     merged : DataFrame
@@ -482,7 +498,7 @@ def merge_asof(left, right, on=None,
                     suffixes=suffixes,
                     how='asof', tolerance=tolerance,
                     allow_exact_matches=allow_exact_matches,
-                    direction=direction)
+                    direction=direction, validate=validate)
     return op.get_result()
 
 
@@ -498,7 +514,8 @@ class _MergeOperation(object):
     def __init__(self, left, right, how='inner', on=None,
                  left_on=None, right_on=None, axis=1,
                  left_index=False, right_index=False, sort=True,
-                 suffixes=('_x', '_y'), copy=True, indicator=False):
+                 suffixes=('_x', '_y'), copy=True, indicator=False,
+                 validate=None):
         self.left = self.orig_left = left
         self.right = self.orig_right = right
         self.how = how
@@ -561,6 +578,12 @@ def __init__(self, left, right, how='inner', on=None,
         # to avoid incompat dtypes
         self._maybe_coerce_merge_keys()
 
+        # If argument passed to validate,
+        # check if columns specified as unique
+        # are in fact unique.
+        if validate is not None:
+            self._validate(validate)
+
     def get_result(self):
         if self.indicator:
             self.left, self.right = self._indicator_pre_merge(
@@ -952,6 +975,51 @@ def _validate_specification(self):
         if len(self.right_on) != len(self.left_on):
             raise ValueError("len(right_on) must equal len(left_on)")
 
+    def _validate(self, validate):
+        """Raise ValueError unless the merge keys satisfy ``validate``."""
+        # Check uniqueness of each side's merge keys (index or join keys)
+        if self.left_index:
+            left_unique = not (self.orig_left.index.duplicated()).any()
+        else:
+            left_unique = MultiIndex.from_arrays(self.left_join_keys
+                                                 ).is_unique
+
+        if self.right_index:
+            right_unique = not (self.orig_right.index.duplicated()).any()
+        else:
+            right_unique = MultiIndex.from_arrays(self.right_join_keys
+                                                  ).is_unique
+
+        # Check valid arg ('many_to_many' / 'm:m' is valid but checks nothing)
+        if validate not in ['one_to_one', '1:1',
+                            'one_to_many', '1:m',
+                            'many_to_one', 'm:1',
+                            'many_to_many', 'm:m']:
+            raise ValueError("Not a valid argument for validate")
+
+        # Check data integrity. For 1:1, use the combined message only when
+        # both sides have duplicates; otherwise the offending side is named.
+        if validate in ["one_to_one", "1:1"]:
+            if not left_unique and not right_unique:
+                raise ValueError("Merge keys are not unique in either left"
+                                 " or right dataset; not a one-to-one merge")
+            if not left_unique:
+                raise ValueError("Merge keys are not unique in left dataset;"
+                                 " not a one-to-one merge")
+            if not right_unique:
+                raise ValueError("Merge keys are not unique in right dataset;"
+                                 " not a one-to-one merge")
+
+        if validate in ["one_to_many", "1:m"]:
+            if not left_unique:
+                raise ValueError("Merge keys are not unique in left dataset;"
+                                 " not a one-to-many merge")
+
+        if validate in ["many_to_one", "m:1"]:
+            if not right_unique:
+                raise ValueError("Merge keys are not unique in right dataset;"
+                                 " not a many-to-one merge")
+
 
 def _get_join_indexers(left_keys, right_keys, sort=False, how='inner',
                        **kwargs):
@@ -1004,15 +1072,17 @@ class _OrderedMerge(_MergeOperation):
     def __init__(self, left, right, on=None, left_on=None, right_on=None,
                  left_index=False, right_index=False, axis=1,
                  suffixes=('_x', '_y'), copy=True,
-                 fill_method=None, how='outer'):
+                 fill_method=None, how='outer',
+                 validate=None):
 
         self.fill_method = fill_method
         _MergeOperation.__init__(self, left, right, on=on, left_on=left_on,
                                  left_index=left_index,
                                  right_index=right_index,
                                  right_on=right_on, axis=axis,
                                  how=how, suffixes=suffixes,
-                                 sort=True  # factorize sorts
+                                 sort=True,  # factorize sorts
+                                 validate=validate
                                  )
 
     def get_result(self):
@@ -1109,7 +1179,7 @@ def __init__(self, left, right, on=None, left_on=None, right_on=None,
                  fill_method=None,
                  how='asof', tolerance=None,
                  allow_exact_matches=True,
-                 direction='backward'):
+                 direction='backward', validate=None):
 
         self.by = by
         self.left_by = left_by
@@ -1122,7 +1192,8 @@ def __init__(self, left, right, on=None, left_on=None, right_on=None,
                                right_on=right_on, left_index=left_index,
                                right_index=right_index, axis=axis,
                                how=how, suffixes=suffixes,
-                               fill_method=fill_method)
+                               fill_method=fill_method,
+                               validate=validate)
 
     def _validate_specification(self):
         super(_AsOfMerge, self)._validate_specification()
diff --git a/pandas/tests/reshape/test_merge.py b/pandas/tests/reshape/test_merge.py
@@ -724,6 +724,63 @@ def test_indicator(self):
                           how='outer', indicator=True)
         assert_frame_equal(test5, hand_coded_result)
 
+    def test_validation(self):
+        left = DataFrame({'a': ['a', 'b', 'c', 'd'],
+                          'b': ['cat', 'dog', 'weasel', 'horse']},
+                         index=range(4))
+
+        right = DataFrame({'a': ['a', 'b', 'c', 'd', 'e'],
+                           'c': ['meow', 'bark', 'um... weasel noise?',
+                                 'nay', 'chirp']},
+                          index=range(5))
+
+        merge(left, right, left_index=True, right_index=True, validate='1:1')
+        merge(left, right, left_index=True, right_index=True,
+              validate='one_to_one')
+        merge(left, right, on='a', validate='1:1')
+        merge(left, right, on='a', validate='one_to_one')
+
+        # Dups on right: key 'e' and index label 4 now each appear twice
+        right_w_dups = right.append(pd.DataFrame({'a': ['e'], 'c': ['moo']},
+                                    index=[4]))
+        merge(left, right_w_dups, left_index=True, right_index=True,
+              validate='one_to_many')
+
+        with pytest.raises(ValueError):
+            merge(left, right_w_dups, left_index=True, right_index=True,
+                  validate='one_to_one')
+
+        with pytest.raises(ValueError):
+            merge(left, right_w_dups, on='a', validate='one_to_one')
+
+        # Dups on left: key 'a' and index label 3 now each appear twice
+        left_w_dups = left.append(pd.DataFrame({'a': ['a'], 'c': ['cow']},
+                                               index=[3]))
+        merge(left_w_dups, right, left_index=True, right_index=True,
+              validate='many_to_one')
+
+        with pytest.raises(ValueError):
+            merge(left_w_dups, right, left_index=True, right_index=True,
+                  validate='one_to_one')
+
+        with pytest.raises(ValueError):
+            merge(left_w_dups, right, on='a', validate='one_to_one')
+
+        # Dups on both: many_to_many is accepted, the stricter specs raise
+        merge(left_w_dups, right_w_dups, on='a', validate='many_to_many')
+
+        with pytest.raises(ValueError):
+            merge(left_w_dups, right_w_dups, left_index=True,
+                  right_index=True, validate='many_to_one')
+
+        with pytest.raises(ValueError):
+            merge(left_w_dups, right_w_dups, on='a',
+                  validate='one_to_many')
+
+        # Check invalid arguments: an unrecognised spec must raise ValueError
+        with pytest.raises(ValueError):
+            merge(left, right, on='a', validate='jibberish')
+
 
 def _check_merge(x, y):
     for how in ['inner', 'left', 'outer']:
diff --git a/pandas/tests/reshape/test_merge_asof.py b/pandas/tests/reshape/test_merge_asof.py
@@ -973,3 +973,29 @@ def test_on_float_by_int(self):
             columns=['symbol', 'exch', 'price', 'mpv'])
 
         assert_frame_equal(result, expected)
+
+    def test_validate(self):
+
+        left = pd.DataFrame({'a': [1, 5, 10],
+                             'left_val': ['a', 'b', 'c']})
+        right = pd.DataFrame({'a': [1, 2, 3, 6, 7],
+                              'right_val': [1, 2, 3, 6, 7]})
+        # Simple run 1:1 (keys unique on both sides, so no error expected)
+        pd.merge_asof(left, right, on='a', validate="1:1")
+
+        # Dups on right (a=7 twice), re-sorted on the 'a' key
+        right_w_dups = right.append(pd.DataFrame({'a': [7],
+                                                  'right_val': [-2]}))
+        right_w_dups = right_w_dups.sort_values('a')
+
+        pd.merge_asof(left, right_w_dups, on='a', validate="1:m")
+        with pytest.raises(ValueError):
+            pd.merge_asof(left, right_w_dups, on='a', validate="1:1")
+
+        # Dups on left (a=1 twice); last 1:1 check has dups on both sides
+        left_w_dups = left.append(pd.DataFrame({'a': [1],
+                                                'left_val': [-2]}))
+        left_w_dups = left_w_dups.sort_values('a')
+        pd.merge_asof(left_w_dups, right, on='a', validate="m:1")
+        with pytest.raises(ValueError):
+            pd.merge_asof(left_w_dups, right_w_dups, on='a', validate="1:1")