Skip to content

Commit e5a6ee1

Browse files
author
Nick Eubank
committed
add validate argument to merge
1 parent ba60321 commit e5a6ee1

File tree

4 files changed

+122
-5
lines changed

4 files changed

+122
-5
lines changed

doc/source/whatsnew/v0.21.0.txt

+8
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,14 @@ Check the :ref:`API Changes <whatsnew_0210.api_breaking>` and :ref:`deprecations
2020
New features
2121
~~~~~~~~~~~~
2222

23+
.. _whatsnew_0210.enhancements.merge_validate:
24+
25+
``validate`` argument checks merge key uniqueness
26+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
27+
28+
The new ``validate`` argument of :func:`merge` checks whether a merge is
29+
one-to-one, one-to-many, many-to-one, or many-to-many. If a merge is found to not
30+
be an example of the specified merge type, a ``ValueError`` will be raised.
2331

2432

2533
.. _whatsnew_0210.enhancements.other:

pandas/core/frame.py

+14-2
Original file line numberDiff line numberDiff line change
@@ -174,6 +174,17 @@
174174
175175
.. versionadded:: 0.17.0
176176
177+
validate : string, default None
178+
If specified, checks to ensure merge is of specified type.
179+
If "one_to_one" or "1:1", checks merge keys are unique in both
180+
left and right datasets.
181+
If "one_to_many" or "1:m", checks merge keys are unique in left
182+
dataset.
183+
If "many_to_one" or "m:1", checks merge keys are unique in right
184+
dataset.
If "many_to_many" or "m:m", allowed, but does not result in checks.
185+
186+
.. versionadded:: 0.21.0
187+
177188
Examples
178189
--------
179190
@@ -4812,12 +4823,13 @@ def _join_compat(self, other, on=None, how='left', lsuffix='', rsuffix='',
48124823
@Appender(_merge_doc, indents=2)
48134824
def merge(self, right, how='inner', on=None, left_on=None, right_on=None,
48144825
left_index=False, right_index=False, sort=False,
4815-
suffixes=('_x', '_y'), copy=True, indicator=False):
4826+
suffixes=('_x', '_y'), copy=True, indicator=False,
4827+
validate=None):
48164828
from pandas.core.reshape.merge import merge
48174829
return merge(self, right, how=how, on=on, left_on=left_on,
48184830
right_on=right_on, left_index=left_index,
48194831
right_index=right_index, sort=sort, suffixes=suffixes,
4820-
copy=copy, indicator=indicator)
4832+
copy=copy, indicator=indicator, validate=validate)
48214833

48224834
def round(self, decimals=0, *args, **kwargs):
48234835
"""

pandas/core/reshape/merge.py

+51-3
Original file line numberDiff line numberDiff line change
@@ -46,11 +46,13 @@
4646
@Appender(_merge_doc, indents=0)
4747
def merge(left, right, how='inner', on=None, left_on=None, right_on=None,
4848
left_index=False, right_index=False, sort=False,
49-
suffixes=('_x', '_y'), copy=True, indicator=False):
49+
suffixes=('_x', '_y'), copy=True, indicator=False,
50+
validate=None):
5051
op = _MergeOperation(left, right, how=how, on=on, left_on=left_on,
5152
right_on=right_on, left_index=left_index,
5253
right_index=right_index, sort=sort, suffixes=suffixes,
53-
copy=copy, indicator=indicator)
54+
copy=copy, indicator=indicator,
55+
validate=validate)
5456
return op.get_result()
5557

5658

@@ -498,7 +500,8 @@ class _MergeOperation(object):
498500
def __init__(self, left, right, how='inner', on=None,
499501
left_on=None, right_on=None, axis=1,
500502
left_index=False, right_index=False, sort=True,
501-
suffixes=('_x', '_y'), copy=True, indicator=False):
503+
suffixes=('_x', '_y'), copy=True, indicator=False,
504+
validate=None):
502505
self.left = self.orig_left = left
503506
self.right = self.orig_right = right
504507
self.how = how
@@ -516,6 +519,7 @@ def __init__(self, left, right, how='inner', on=None,
516519
self.right_index = right_index
517520

518521
self.indicator = indicator
522+
self.validate = validate
519523

520524
if isinstance(self.indicator, compat.string_types):
521525
self.indicator_name = self.indicator
@@ -561,6 +565,9 @@ def __init__(self, left, right, how='inner', on=None,
561565
# to avoid incompat dtypes
562566
self._maybe_coerce_merge_keys()
563567

568+
if self.validate is not None:
569+
self._validate()
570+
564571
def get_result(self):
565572
if self.indicator:
566573
self.left, self.right = self._indicator_pre_merge(
@@ -952,6 +959,47 @@ def _validate_specification(self):
952959
if len(self.right_on) != len(self.left_on):
953960
raise ValueError("len(right_on) must equal len(left_on)")
954961

962+
def _validate(self):
    """Check that the merge keys match the relationship in ``self.validate``.

    Accepted values of ``self.validate``:

    * ``'one_to_one'`` / ``'1:1'``: keys must be unique in both left and
      right.
    * ``'one_to_many'`` / ``'1:m'``: keys must be unique in left.
    * ``'many_to_one'`` / ``'m:1'``: keys must be unique in right.
    * ``'many_to_many'`` / ``'m:m'``: accepted, no check is performed.

    Uniqueness is evaluated on the index when ``left_index`` /
    ``right_index`` is set, otherwise on the ``left_on`` / ``right_on``
    columns, falling back to ``on``.

    Raises
    ------
    ValueError
        If ``self.validate`` is not one of the accepted values, or if the
        merge keys violate the requested relationship.
    """
    one_to_one = ('one_to_one', '1:1')
    one_to_many = ('one_to_many', '1:m')
    many_to_one = ('many_to_one', 'm:1')
    many_to_many = ('many_to_many', 'm:m')

    # Reject unknown specifications before scanning any data.
    if self.validate not in (one_to_one + one_to_many +
                             many_to_one + many_to_many):
        raise ValueError("Not a valid argument for validate")

    # Many-to-many places no constraint on the keys.
    if self.validate in many_to_many:
        return

    # Get merging series:
    left_key = self.left_on if self.left_on is not None else self.on
    right_key = self.right_on if self.right_on is not None else self.on

    if self.left_index:
        left_unique = not self.orig_left.index.duplicated().any()
    else:
        left_unique = not self.orig_left[left_key].duplicated().any()

    if self.right_index:
        right_unique = not self.orig_right.index.duplicated().any()
    else:
        right_unique = not self.orig_right[right_key].duplicated().any()

    # Check data integrity
    if self.validate in one_to_one:
        # Both sides duplicated is reported first; this must be ``and`` so
        # that the side-specific messages below remain reachable.
        if not left_unique and not right_unique:
            raise ValueError("Merge keys are not unique in either left "
                             "or right dataset; not a one-to-one merge")
        if not left_unique:
            raise ValueError("Merge keys are not unique in left dataset; "
                             "not a one-to-one merge")
        if not right_unique:
            raise ValueError("Merge keys are not unique in right dataset; "
                             "not a one-to-one merge")

    elif self.validate in one_to_many:
        if not left_unique:
            raise ValueError("Merge keys are not unique in left dataset; "
                             "not a one-to-many merge")

    elif self.validate in many_to_one:
        if not right_unique:
            raise ValueError("Merge keys are not unique in right dataset; "
                             "not a many-to-one merge")
1002+
9551003

9561004
def _get_join_indexers(left_keys, right_keys, sort=False, how='inner',
9571005
**kwargs):

pandas/tests/reshape/test_merge.py

+49
Original file line numberDiff line numberDiff line change
@@ -725,6 +725,55 @@ def test_indicator(self):
725725
assert_frame_equal(test5, hand_coded_result)
726726

727727

728+
729+
def test_validation(self):
    """Exercise the ``validate`` argument of ``merge``.

    Covers index-based and column-based merges with unique keys, with
    duplicates on the right, on the left and on both sides, plus an
    unrecognised ``validate`` value.
    """
    left = DataFrame({'a': ['a', 'b', 'c', 'd'],
                      'b': ['cat', 'dog', 'weasel', 'horse']},
                     index=range(4))
    right = DataFrame({'a': ['a', 'b', 'c', 'd', 'e'],
                       'c': ['meow', 'bark', 'um... weasel noise?',
                             'nay', 'chirp']},
                      index=range(5))

    # Keys unique on both sides: every one-to-one spelling must pass.
    merge(left, right, left_index=True, right_index=True, validate='1:1')
    merge(left, right, left_index=True, right_index=True,
          validate='one_to_one')
    merge(left, right, on='a', validate='1:1')
    merge(left, right, on='a', validate='one_to_one')

    # Dups on right (both in column 'a' and in the index).
    # pd.concat is used rather than DataFrame.append, which newer pandas
    # releases no longer provide.
    right_w_dups = pd.concat(
        [right, pd.DataFrame({'a': ['e'], 'c': ['moo']}, index=[4])])
    merge(left, right_w_dups, left_index=True, right_index=True,
          validate='one_to_many')

    with pytest.raises(ValueError):
        merge(left, right_w_dups, left_index=True, right_index=True,
              validate='one_to_one')

    with pytest.raises(ValueError):
        merge(left, right_w_dups, on='a', validate='one_to_one')

    # Dups on left (both in column 'a' and in the index).
    left_w_dups = pd.concat(
        [left, pd.DataFrame({'a': ['a'], 'c': ['cow']}, index=[3])])
    merge(left_w_dups, right, left_index=True, right_index=True,
          validate='many_to_one')

    with pytest.raises(ValueError):
        merge(left_w_dups, right, left_index=True, right_index=True,
              validate='one_to_one')

    with pytest.raises(ValueError):
        merge(left_w_dups, right, on='a', validate='one_to_one')

    # Dups on both
    merge(left_w_dups, right_w_dups, on='a', validate='many_to_many')

    with pytest.raises(ValueError):
        merge(left_w_dups, right_w_dups, left_index=True,
              right_index=True, validate='many_to_one')

    with pytest.raises(ValueError):
        merge(left_w_dups, right_w_dups, on='a', validate='one_to_many')

    # Check invalid arguments
    with pytest.raises(ValueError):
        merge(left, right, on='a', validate='jibberish')
775+
776+
728777
def _check_merge(x, y):
729778
for how in ['inner', 'left', 'outer']:
730779
result = x.join(y, how=how)

0 commit comments

Comments
 (0)