Skip to content

Commit 4e3c138

Browse files
author
Nick Eubank
committed
add docs, integrate with merge_asof, add tests
1 parent 95fadb7 commit 4e3c138

File tree

4 files changed

+109
-26
lines changed

4 files changed

+109
-26
lines changed

doc/source/merging.rst

+38-3
Original file line numberDiff line numberDiff line change
@@ -513,7 +513,8 @@ standard database join operations between DataFrame objects:
513513

514514
pd.merge(left, right, how='inner', on=None, left_on=None, right_on=None,
515515
left_index=False, right_index=False, sort=True,
516-
suffixes=('_x', '_y'), copy=True, indicator=False)
516+
suffixes=('_x', '_y'), copy=True, indicator=False,
517+
validate=None)
517518

518519
- ``left``: A DataFrame object
519520
- ``right``: Another DataFrame object
@@ -551,6 +552,18 @@ standard database join operations between DataFrame objects:
551552

552553
.. versionadded:: 0.17.0
553554

555+
- ``validate`` : {None, '1:1', '1:m', 'm:1', 'm:m', "one_to_one", "one_to_many", "many_to_one", "many_to_many"}, default None
556+
If specified, checks whether the merge is of the specified type.
557+
* "one_to_one" or "1:1": check if merge keys are unique in both
558+
left and right datasets.
559+
* "one_to_many" or "1:m": check if merge keys are unique in left
560+
dataset.
561+
* "many_to_one" or "m:1": check if merge keys are unique in right
562+
dataset.
563+
564+
.. versionadded:: 0.21.0
565+
566+
554567
The return type will be the same as ``left``. If ``left`` is a ``DataFrame``
555568
and ``right`` is a subclass of DataFrame, the return type will still be
556569
``DataFrame``.
@@ -711,10 +724,32 @@ Here is another example with duplicate join keys in DataFrames:
711724
labels=['left', 'right'], vertical=False);
712725
plt.close('all');
713726
727+
714728
.. warning::
715729

716-
Joining / merging on duplicate keys can cause a returned frame that is the multiplication of the row dimensions,
717-
may result in memory overflow. It is the user' s responsibility to manage duplicate values in keys before joining large DataFrames.
730+
Joining / merging on duplicate keys can return a frame whose number of rows is the product of the row dimensions, which may result in memory overflow. It is the user's responsibility to manage duplicate values in keys before joining large DataFrames.
731+
732+
Checking for duplicate keys
733+
~~~~~~~~~~~~~~~~~~~~~~~~~~~
734+
735+
Users can use the ``validate`` argument to automatically check whether there are unexpected duplicates in their merge keys. Key uniqueness is checked before merge operations and so should protect against memory overflows. Checking key uniqueness is also a good way to ensure user data structures are as expected.
736+
737+
In the following example, there are duplicate values of ``B`` in the right ``DataFrame``. Because ``validate="one_to_one"`` requires the merge keys to be unique in both the left and the right ``DataFrame``, this is not a one-to-one merge and an exception will be raised.
738+
739+
.. ipython:: python
740+
741+
left = pd.DataFrame({'A' : [1,2], 'B' : [1, 2]})
742+
743+
right = pd.DataFrame({'A' : [4,5,6], 'B': [2, 2, 2]})
744+
745+
result = pd.merge(left, right, on='B', how='outer', validate="one_to_one");
746+
747+
If the user is aware of the duplicates in the right ``DataFrame`` but wants to ensure there are no duplicates in the left ``DataFrame``, one can pass ``validate="one_to_many"`` instead, which will not raise an exception.
748+
749+
.. ipython:: python
750+
751+
result = pd.merge(left, right, on='B', how='outer', validate="one_to_many")
752+
718753
719754
.. _merging.indicator:
720755

pandas/core/frame.py

+6-5
Original file line numberDiff line numberDiff line change
@@ -174,13 +174,14 @@
174174
175175
.. versionadded:: 0.17.0
176176
177-
validate: None or string, default None
178-
If specified, checks to ensure merge is of specified type.
179-
If "one_to_one" or "1:1", checks merge keys are unique in both
177+
validate : {None, '1:1', '1:m', 'm:1', 'm:m', "one_to_one", "one_to_many",
178+
"many_to_one", "many_to_many"}, default None
179+
If specified, checks if merge is of specified type.
180+
* "one_to_one" or "1:1": check if merge keys are unique in both
180181
left and right dataset.
181-
If "one_to_many" or "1:m", checks merge keys are unique in left
182+
* "one_to_many" or "1:m": check if merge keys are unique in left
182183
dataset.
183-
If "many_to_one" or "m:1", checks merge keys are unique in right
184+
* "many_to_one" or "m:1": check if merge keys are unique in right
184185
dataset.
185186
186187
.. versionadded:: 0.21.0

pandas/core/reshape/merge.py

+39-18
Original file line numberDiff line numberDiff line change
@@ -265,7 +265,8 @@ def merge_asof(left, right, on=None,
265265
suffixes=('_x', '_y'),
266266
tolerance=None,
267267
allow_exact_matches=True,
268-
direction='backward'):
268+
direction='backward',
269+
validate=None):
269270
"""Perform an asof merge. This is similar to a left-join except that we
270271
match on nearest key rather than equal keys.
271272
@@ -343,6 +344,19 @@ def merge_asof(left, right, on=None,
343344
344345
.. versionadded:: 0.20.0
345346
347+
validate : {None, '1:1', '1:m', 'm:1', 'm:m', "one_to_one", "one_to_many",
348+
"many_to_one", "many_to_many"}, default None
349+
If specified, checks if merge is of specified type.
350+
* "one_to_one" or "1:1": check if merge keys are unique in both
351+
left and right dataset.
352+
* "one_to_many" or "1:m": check if merge keys are unique in left
353+
dataset.
354+
* "many_to_one" or "m:1": check if merge keys are unique in right
355+
dataset.
356+
357+
.. versionadded:: 0.21.0
358+
359+
346360
Returns
347361
-------
348362
merged : DataFrame
@@ -484,7 +498,7 @@ def merge_asof(left, right, on=None,
484498
suffixes=suffixes,
485499
how='asof', tolerance=tolerance,
486500
allow_exact_matches=allow_exact_matches,
487-
direction=direction)
501+
direction=direction, validate=validate)
488502
return op.get_result()
489503

490504

@@ -519,7 +533,6 @@ def __init__(self, left, right, how='inner', on=None,
519533
self.right_index = right_index
520534

521535
self.indicator = indicator
522-
self.validate = validate
523536

524537
if isinstance(self.indicator, compat.string_types):
525538
self.indicator_name = self.indicator
@@ -565,8 +578,11 @@ def __init__(self, left, right, how='inner', on=None,
565578
# to avoid incompat dtypes
566579
self._maybe_coerce_merge_keys()
567580

568-
if self.validate is not None:
569-
self._validate()
581+
# If argument passed to validate,
582+
# check if columns specified as unique
583+
# are in fact unique.
584+
if validate is not None:
585+
self._validate(validate)
570586

571587
def get_result(self):
572588
if self.indicator:
@@ -959,11 +975,13 @@ def _validate_specification(self):
959975
if len(self.right_on) != len(self.left_on):
960976
raise ValueError("len(right_on) must equal len(left_on)")
961977

962-
def _validate(self):
963-
# Get merging series:
978+
def _validate(self, validate):
979+
980+
# Get axes
964981
left_key = self.left_on if self.left_on is not None else self.on
965982
right_key = self.right_on if self.right_on is not None else self.on
966983

984+
# Check uniqueness of each
967985
if self.left_index:
968986
left_unique = not (self.orig_left.index.duplicated()).any()
969987
else:
@@ -975,15 +993,15 @@ def _validate(self):
975993
right_unique = not (self.orig_right[right_key].duplicated()).any()
976994

977995
# Check valid arg
978-
if self.validate not in ['one_to_one', '1:1',
979-
'one_to_many', '1:m',
980-
'many_to_one', 'm:1',
981-
'many_to_many', 'm:m']:
996+
if validate not in ['one_to_one', '1:1',
997+
'one_to_many', '1:m',
998+
'many_to_one', 'm:1',
999+
'many_to_many', 'm:m']:
9821000

9831001
raise ValueError("Not a valid argument for validate")
9841002

9851003
# Check data integrity
986-
if self.validate in ["one_to_one", "1:1"]:
1004+
if validate in ["one_to_one", "1:1"]:
9871005
if not left_unique or not right_unique:
9881006
raise ValueError("Merge keys are not unique in either left"
9891007
" or right dataset; not a one-to-one merge")
@@ -994,12 +1012,12 @@ def _validate(self):
9941012
raise ValueError("Merge keys are not unique in right dataset;"
9951013
" not a one-to-one merge")
9961014

997-
if self.validate in ["one_to_many", "1:m"]:
1015+
if validate in ["one_to_many", "1:m"]:
9981016
if not left_unique:
9991017
raise ValueError("Merge keys are not unique in left dataset;"
10001018
"not a one-to-many merge")
10011019

1002-
if self.validate in ["many_to_one", "m:1"]:
1020+
if validate in ["many_to_one", "m:1"]:
10031021
if not right_unique:
10041022
raise ValueError("Merge keys are not unique in right dataset;"
10051023
" not a many-to-one merge")
@@ -1056,15 +1074,17 @@ class _OrderedMerge(_MergeOperation):
10561074
def __init__(self, left, right, on=None, left_on=None, right_on=None,
10571075
left_index=False, right_index=False, axis=1,
10581076
suffixes=('_x', '_y'), copy=True,
1059-
fill_method=None, how='outer'):
1077+
fill_method=None, how='outer',
1078+
validate=None):
10601079

10611080
self.fill_method = fill_method
10621081
_MergeOperation.__init__(self, left, right, on=on, left_on=left_on,
10631082
left_index=left_index,
10641083
right_index=right_index,
10651084
right_on=right_on, axis=axis,
10661085
how=how, suffixes=suffixes,
1067-
sort=True # factorize sorts
1086+
sort=True, # factorize sorts
1087+
validate=validate
10681088
)
10691089

10701090
def get_result(self):
@@ -1161,7 +1181,7 @@ def __init__(self, left, right, on=None, left_on=None, right_on=None,
11611181
fill_method=None,
11621182
how='asof', tolerance=None,
11631183
allow_exact_matches=True,
1164-
direction='backward'):
1184+
direction='backward', validate=None):
11651185

11661186
self.by = by
11671187
self.left_by = left_by
@@ -1174,7 +1194,8 @@ def __init__(self, left, right, on=None, left_on=None, right_on=None,
11741194
right_on=right_on, left_index=left_index,
11751195
right_index=right_index, axis=axis,
11761196
how=how, suffixes=suffixes,
1177-
fill_method=fill_method)
1197+
fill_method=fill_method,
1198+
validate=validate)
11781199

11791200
def _validate_specification(self):
11801201
super(_AsOfMerge, self)._validate_specification()

pandas/tests/reshape/test_merge_asof.py

+26
Original file line numberDiff line numberDiff line change
@@ -973,3 +973,29 @@ def test_on_float_by_int(self):
973973
columns=['symbol', 'exch', 'price', 'mpv'])
974974

975975
assert_frame_equal(result, expected)
976+
977+
def test_validate(self):
978+
979+
left = pd.DataFrame({'a': [1, 5, 10],
980+
'left_val': ['a', 'b', 'c']})
981+
right = pd.DataFrame({'a': [1, 2, 3, 6, 7],
982+
'right_val': [1, 2, 3, 6, 7]})
983+
# Simple run 1:1
984+
pd.merge_asof(left, right, on='a', validate="1:1")
985+
986+
# Dups on right
987+
right_w_dups = right.append(pd.DataFrame({'a': [7],
988+
'right_val': [-2]}))
989+
right_w_dups = right_w_dups.sort_values('a')
990+
991+
pd.merge_asof(left, right_w_dups, on='a', validate="1:m")
992+
with pytest.raises(ValueError):
993+
pd.merge_asof(left, right_w_dups, on='a', validate="1:1")
994+
995+
# Dups on left
996+
left_w_dups = left.append(pd.DataFrame({'a': [1],
997+
'left_val': [-2]}))
998+
left_w_dups = left_w_dups.sort_values('a')
999+
pd.merge_asof(left_w_dups, right, on='a', validate="m:1")
1000+
with pytest.raises(ValueError):
1001+
pd.merge_asof(left_w_dups, right_w_dups, on='a', validate="1:1")

0 commit comments

Comments
 (0)