add docs, integrate with merge_asof, add tests

Nick Eubank · Nick Eubank · commit 4e3c138ede4e · 2017-05-07T16:11:52.000-07:00
diff --git a/doc/source/merging.rst b/doc/source/merging.rst
@@ -513,7 +513,8 @@ standard database join operations between DataFrame objects:
 
     pd.merge(left, right, how='inner', on=None, left_on=None, right_on=None,
              left_index=False, right_index=False, sort=True,
-             suffixes=('_x', '_y'), copy=True, indicator=False)
+             suffixes=('_x', '_y'), copy=True, indicator=False,
+             validate=None)
 
 - ``left``: A DataFrame object
 - ``right``: Another DataFrame object
@@ -551,6 +552,18 @@ standard database join operations between DataFrame objects:
 
   .. versionadded:: 0.17.0
 
+- ``validate`` : {None, '1:1', '1:m', 'm:1', 'm:m', "one_to_one", "one_to_many", "many_to_one", "many_to_many"}, default None
+  If specified, checks if merge is of specified type.
+
+  * "one_to_one" or "1:1": check if merge keys are unique in both
+    left and right dataset.
+  * "one_to_many" or "1:m": check if merge keys are unique in left
+    dataset.
+  * "many_to_one" or "m:1": check if merge keys are unique in right
+    dataset.
+
+  .. versionadded:: 0.21.0
+
 The return type will be the same as ``left``. If ``left`` is a ``DataFrame``
 and ``right`` is a subclass of DataFrame, the return type will still be
 ``DataFrame``.
@@ -711,10 +724,32 @@ Here is another example with duplicate join keys in DataFrames:
           labels=['left', 'right'], vertical=False);
    plt.close('all');
 
+
 .. warning::
 
-  Joining / merging on duplicate keys can cause a returned frame that is the multiplication of the row dimensions,
-  may result in memory overflow. It is the user' s responsibility to manage duplicate values in keys before joining large DataFrames.
+  Joining / merging on duplicate keys can cause a returned frame that is the multiplication of the row dimensions, which may result in memory overflow. It is the user's responsibility to manage duplicate values in keys before joining large DataFrames.
+
+Checking for duplicate keys
+~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Users can use the ``validate`` argument to automatically check whether there are unexpected duplicates in their merge keys. Key uniqueness is checked before merge operations and so should protect against memory overflows. Checking key uniqueness is also a good way to ensure user data structures are as expected.
+
+In the following example, there are duplicate values of ``B`` in the right DataFrame. As this is not a one-to-one merge -- as specified in the ``validate`` argument -- an exception will be raised.
+
+.. ipython:: python
+   :okexcept:
+
+   left = pd.DataFrame({'A': [1, 2], 'B': [1, 2]})
+   right = pd.DataFrame({'A': [4, 5, 6], 'B': [2, 2, 2]})
+
+   result = pd.merge(left, right, on='B', how='outer', validate="one_to_one")
+
+If the user is aware of the duplicates in the right ``DataFrame`` but wants to ensure there are no duplicates in the left ``DataFrame``, one can use the ``validate='one_to_many'`` argument instead, which will not raise an exception.
+
+.. ipython:: python
+
+   result = pd.merge(left, right, on='B', how='outer', validate="one_to_many")
+
 
 .. _merging.indicator:
 
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -174,13 +174,14 @@
 
     .. versionadded:: 0.17.0
 
-validate: None or string, default None
-    If specified, checks to ensure merge is of specified type.
-    If "one_to_one" or "1:1", checks merge keys are unique in both
+validate : {None, '1:1', '1:m', 'm:1', 'm:m', "one_to_one", "one_to_many",
+            "many_to_one", "many_to_many"}, default None
+    If specified, checks if merge is of specified type.
+    * "one_to_one" or "1:1": check if merge keys are unique in both
     left and right dataset.
-    If "one_to_many" or "1:m", checks merge keys are unique in left
+    * "one_to_many" or "1:m": check if merge keys are unique in left
     dataset.
-    If "many_to_one" or "m:1", checks merge keys are unique in right
+    * "many_to_one" or "m:1": check if merge keys are unique in right
     dataset.
 
     .. versionadded:: 0.21.0
diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py
@@ -265,7 +265,8 @@ def merge_asof(left, right, on=None,
                suffixes=('_x', '_y'),
                tolerance=None,
                allow_exact_matches=True,
-               direction='backward'):
+               direction='backward',
+               validate=None):
     """Perform an asof merge. This is similar to a left-join except that we
     match on nearest key rather than equal keys.
 
@@ -343,6 +344,19 @@ def merge_asof(left, right, on=None,
 
         .. versionadded:: 0.20.0
 
+    validate : {None, '1:1', '1:m', 'm:1', 'm:m', "one_to_one", "one_to_many",
+                "many_to_one", "many_to_many"}, default None
+        If specified, checks if merge is of specified type.
+
+        * "one_to_one" or "1:1": check if merge keys are unique in both
+          left and right dataset.
+        * "one_to_many" or "1:m": check if merge keys are unique in left
+          dataset.
+        * "many_to_one" or "m:1": check if merge keys are unique in right
+          dataset.
+
+        .. versionadded:: 0.21.0
+
     Returns
     -------
     merged : DataFrame
@@ -484,7 +498,7 @@ def merge_asof(left, right, on=None,
                     suffixes=suffixes,
                     how='asof', tolerance=tolerance,
                     allow_exact_matches=allow_exact_matches,
-                    direction=direction)
+                    direction=direction, validate=validate)
     return op.get_result()
 
 
@@ -519,7 +533,6 @@ def __init__(self, left, right, how='inner', on=None,
         self.right_index = right_index
 
         self.indicator = indicator
-        self.validate = validate
 
         if isinstance(self.indicator, compat.string_types):
             self.indicator_name = self.indicator
@@ -565,8 +578,11 @@ def __init__(self, left, right, how='inner', on=None,
         # to avoid incompat dtypes
         self._maybe_coerce_merge_keys()
 
-        if self.validate is not None:
-            self._validate()
+        # If argument passed to validate,
+        # check if columns specified as unique
+        # are in fact unique.
+        if validate is not None:
+            self._validate(validate)
 
     def get_result(self):
         if self.indicator:
@@ -959,11 +975,13 @@ def _validate_specification(self):
         if len(self.right_on) != len(self.left_on):
             raise ValueError("len(right_on) must equal len(left_on)")
 
-    def _validate(self):
-        # Get merging series:
+    def _validate(self, validate):
+
+        # Get merge key names for each side
         left_key = self.left_on if self.left_on is not None else self.on
         right_key = self.right_on if self.right_on is not None else self.on
 
+        # Check uniqueness of each
         if self.left_index:
             left_unique = not (self.orig_left.index.duplicated()).any()
         else:
@@ -975,15 +993,15 @@ def _validate(self):
             right_unique = not (self.orig_right[right_key].duplicated()).any()
 
         # Check valid arg
-        if self.validate not in ['one_to_one', '1:1',
-                                 'one_to_many', '1:m',
-                                 'many_to_one', 'm:1',
-                                 'many_to_many', 'm:m']:
+        if validate not in ['one_to_one', '1:1',
+                            'one_to_many', '1:m',
+                            'many_to_one', 'm:1',
+                            'many_to_many', 'm:m']:
 
             raise ValueError("Not a valid argument for validate")
 
         # Check data integrity
-        if self.validate in ["one_to_one", "1:1"]:
+        if validate in ["one_to_one", "1:1"]:
             if not left_unique or not right_unique:
                 raise ValueError("Merge keys are not unique in either left"
                                  " or right dataset; not a one-to-one merge")
@@ -994,12 +1012,12 @@ def _validate(self):
                 raise ValueError("Merge keys are not unique in right dataset;"
                                  " not a one-to-one merge")
 
-        if self.validate in ["one_to_many", "1:m"]:
+        if validate in ["one_to_many", "1:m"]:
             if not left_unique:
                 raise ValueError("Merge keys are not unique in left dataset;"
                                  "not a one-to-many merge")
 
-        if self.validate in ["many_to_one", "m:1"]:
+        if validate in ["many_to_one", "m:1"]:
             if not right_unique:
                 raise ValueError("Merge keys are not unique in right dataset;"
                                  " not a many-to-one merge")
@@ -1056,15 +1074,17 @@ class _OrderedMerge(_MergeOperation):
     def __init__(self, left, right, on=None, left_on=None, right_on=None,
                  left_index=False, right_index=False, axis=1,
                  suffixes=('_x', '_y'), copy=True,
-                 fill_method=None, how='outer'):
+                 fill_method=None, how='outer',
+                 validate=None):
 
         self.fill_method = fill_method
         _MergeOperation.__init__(self, left, right, on=on, left_on=left_on,
                                  left_index=left_index,
                                  right_index=right_index,
                                  right_on=right_on, axis=axis,
                                  how=how, suffixes=suffixes,
-                                 sort=True  # factorize sorts
+                                 sort=True,  # factorize sorts
+                                 validate=validate
                                  )
 
     def get_result(self):
@@ -1161,7 +1181,7 @@ def __init__(self, left, right, on=None, left_on=None, right_on=None,
                  fill_method=None,
                  how='asof', tolerance=None,
                  allow_exact_matches=True,
-                 direction='backward'):
+                 direction='backward', validate=None):
 
         self.by = by
         self.left_by = left_by
@@ -1174,7 +1194,8 @@ def __init__(self, left, right, on=None, left_on=None, right_on=None,
                                right_on=right_on, left_index=left_index,
                                right_index=right_index, axis=axis,
                                how=how, suffixes=suffixes,
-                               fill_method=fill_method)
+                               fill_method=fill_method,
+                               validate=validate)
 
     def _validate_specification(self):
         super(_AsOfMerge, self)._validate_specification()
diff --git a/pandas/tests/reshape/test_merge_asof.py b/pandas/tests/reshape/test_merge_asof.py
@@ -973,3 +973,29 @@ def test_on_float_by_int(self):
             columns=['symbol', 'exch', 'price', 'mpv'])
 
         assert_frame_equal(result, expected)
+
+    def test_validate(self):
+
+        left = pd.DataFrame({'a': [1, 5, 10],
+                             'left_val': ['a', 'b', 'c']})
+        right = pd.DataFrame({'a': [1, 2, 3, 6, 7],
+                              'right_val': [1, 2, 3, 6, 7]})
+        # Simple run 1:1
+        pd.merge_asof(left, right, on='a', validate="1:1")
+
+        # Dups on right
+        right_w_dups = right.append(pd.DataFrame({'a': [7],
+                                                  'right_val': [-2]}))
+        right_w_dups = right_w_dups.sort_values('a')
+
+        pd.merge_asof(left, right_w_dups, on='a', validate="1:m")
+        with pytest.raises(ValueError):
+            pd.merge_asof(left, right_w_dups, on='a', validate="1:1")
+
+        # Dups on left
+        left_w_dups = left.append(pd.DataFrame({'a': [1],
+                                                'left_val': [-2]}))
+        left_w_dups = left_w_dups.sort_values('a')
+        pd.merge_asof(left_w_dups, right, on='a', validate="m:1")
+        with pytest.raises(ValueError):
+            pd.merge_asof(left_w_dups, right_w_dups, on='a', validate="1:1")