Skip to content

Commit 95fadb7

Browse files
author
Nick Eubank
committed
add validate argument to merge
1 parent ba60321 commit 95fadb7

File tree

4 files changed

+134
-5
lines changed

4 files changed

+134
-5
lines changed

doc/source/whatsnew/v0.21.0.txt

+8
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,14 @@ Check the :ref:`API Changes <whatsnew_0210.api_breaking>` and :ref:`deprecations
2020
New features
2121
~~~~~~~~~~~~
2222

23+
.. _whatsnew_0210.enhancements.merge_validate:
24+
25+
``validate`` argument checks merge key uniqueness
26+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
27+
28+
The new ``validate`` argument of the :func:`merge` function checks whether a merge is
29+
one-to-one, one-to-many, many-to-one, or many-to-many. If a merge is found to not
30+
be an example of the specified merge type, a ``ValueError`` will be raised.
2331

2432

2533
.. _whatsnew_0210.enhancements.other:

pandas/core/frame.py

+14-2
Original file line numberDiff line numberDiff line change
@@ -174,6 +174,17 @@
174174
175175
.. versionadded:: 0.17.0
176176
177+
validate : string, default None
178+
If specified, checks to ensure merge is of specified type.
179+
If "one_to_one" or "1:1", checks merge keys are unique in both
180+
left and right dataset.
181+
If "one_to_many" or "1:m", checks merge keys are unique in left
182+
dataset.
183+
If "many_to_one" or "m:1", checks merge keys are unique in right
184+
dataset.
If "many_to_many" or "m:m", allowed, but does not result in checks.
185+
186+
.. versionadded:: 0.21.0
187+
177188
Examples
178189
--------
179190
@@ -4812,12 +4823,13 @@ def _join_compat(self, other, on=None, how='left', lsuffix='', rsuffix='',
48124823
@Appender(_merge_doc, indents=2)
48134824
def merge(self, right, how='inner', on=None, left_on=None, right_on=None,
48144825
left_index=False, right_index=False, sort=False,
4815-
suffixes=('_x', '_y'), copy=True, indicator=False):
4826+
suffixes=('_x', '_y'), copy=True, indicator=False,
4827+
validate=None):
48164828
from pandas.core.reshape.merge import merge
48174829
return merge(self, right, how=how, on=on, left_on=left_on,
48184830
right_on=right_on, left_index=left_index,
48194831
right_index=right_index, sort=sort, suffixes=suffixes,
4820-
copy=copy, indicator=indicator)
4832+
copy=copy, indicator=indicator, validate=validate)
48214833

48224834
def round(self, decimals=0, *args, **kwargs):
48234835
"""

pandas/core/reshape/merge.py

+55-3
Original file line numberDiff line numberDiff line change
@@ -46,11 +46,13 @@
4646
@Appender(_merge_doc, indents=0)
4747
def merge(left, right, how='inner', on=None, left_on=None, right_on=None,
4848
left_index=False, right_index=False, sort=False,
49-
suffixes=('_x', '_y'), copy=True, indicator=False):
49+
suffixes=('_x', '_y'), copy=True, indicator=False,
50+
validate=None):
5051
op = _MergeOperation(left, right, how=how, on=on, left_on=left_on,
5152
right_on=right_on, left_index=left_index,
5253
right_index=right_index, sort=sort, suffixes=suffixes,
53-
copy=copy, indicator=indicator)
54+
copy=copy, indicator=indicator,
55+
validate=validate)
5456
return op.get_result()
5557

5658

@@ -498,7 +500,8 @@ class _MergeOperation(object):
498500
def __init__(self, left, right, how='inner', on=None,
499501
left_on=None, right_on=None, axis=1,
500502
left_index=False, right_index=False, sort=True,
501-
suffixes=('_x', '_y'), copy=True, indicator=False):
503+
suffixes=('_x', '_y'), copy=True, indicator=False,
504+
validate=None):
502505
self.left = self.orig_left = left
503506
self.right = self.orig_right = right
504507
self.how = how
@@ -516,6 +519,7 @@ def __init__(self, left, right, how='inner', on=None,
516519
self.right_index = right_index
517520

518521
self.indicator = indicator
522+
self.validate = validate
519523

520524
if isinstance(self.indicator, compat.string_types):
521525
self.indicator_name = self.indicator
@@ -561,6 +565,9 @@ def __init__(self, left, right, how='inner', on=None,
561565
# to avoid incompat dtypes
562566
self._maybe_coerce_merge_keys()
563567

568+
if self.validate is not None:
569+
self._validate()
570+
564571
def get_result(self):
565572
if self.indicator:
566573
self.left, self.right = self._indicator_pre_merge(
@@ -952,6 +959,51 @@ def _validate_specification(self):
952959
if len(self.right_on) != len(self.left_on):
953960
raise ValueError("len(right_on) must equal len(left_on)")
954961

962+
def _validate(self):
    """
    Check that the merge keys satisfy the constraint named by
    ``self.validate``.

    Accepted specifications are "one_to_one"/"1:1", "one_to_many"/"1:m",
    "many_to_one"/"m:1" and "many_to_many"/"m:m".  Uniqueness is measured
    on the original (pre-merge) left and right objects, using the index
    when ``left_index``/``right_index`` is set and the key column(s)
    otherwise.

    Raises
    ------
    ValueError
        If ``self.validate`` is not one of the accepted specifications,
        or if the merge keys are duplicated on a side that the
        specification requires to be unique.
    """
    validate = self.validate

    # Check valid arg first, so a typo is reported before any duplicate
    # detection is run on the data.
    if validate not in ('one_to_one', '1:1',
                        'one_to_many', '1:m',
                        'many_to_one', 'm:1',
                        'many_to_many', 'm:m'):
        raise ValueError("Not a valid argument for validate")

    # Many-to-many places no constraint on either side: nothing to check.
    if validate in ('many_to_many', 'm:m'):
        return

    # Get merging series:
    left_key = self.left_on if self.left_on is not None else self.on
    right_key = self.right_on if self.right_on is not None else self.on

    if self.left_index:
        left_unique = not self.orig_left.index.duplicated().any()
    else:
        left_unique = not self.orig_left[left_key].duplicated().any()

    if self.right_index:
        right_unique = not self.orig_right.index.duplicated().any()
    else:
        right_unique = not self.orig_right[right_key].duplicated().any()

    # Check data integrity
    if validate in ('one_to_one', '1:1'):
        # The combined message is only for the case where BOTH sides
        # contain duplicates; otherwise name the offending side.
        if not left_unique and not right_unique:
            raise ValueError("Merge keys are not unique in either left"
                             " or right dataset; not a one-to-one merge")
        if not left_unique:
            raise ValueError("Merge keys are not unique in left dataset;"
                             " not a one-to-one merge")
        if not right_unique:
            raise ValueError("Merge keys are not unique in right dataset;"
                             " not a one-to-one merge")

    elif validate in ('one_to_many', '1:m'):
        if not left_unique:
            raise ValueError("Merge keys are not unique in left dataset;"
                             " not a one-to-many merge")

    else:  # 'many_to_one' / 'm:1'
        if not right_unique:
            raise ValueError("Merge keys are not unique in right dataset;"
                             " not a many-to-one merge")
1006+
9551007

9561008
def _get_join_indexers(left_keys, right_keys, sort=False, how='inner',
9571009
**kwargs):

pandas/tests/reshape/test_merge.py

+57
Original file line numberDiff line numberDiff line change
@@ -724,6 +724,63 @@ def test_indicator(self):
724724
how='outer', indicator=True)
725725
assert_frame_equal(test5, hand_coded_result)
726726

727+
def test_validation(self):
    # Exercise the ``validate`` argument of ``merge``: each specification
    # must pass when its uniqueness requirement holds and raise
    # ValueError when it does not, for both index and key-column merges.
    left = DataFrame({'a': ['a', 'b', 'c', 'd'],
                      'b': ['cat', 'dog', 'weasel', 'horse']},
                     index=range(4))

    right = DataFrame({'a': ['a', 'b', 'c', 'd', 'e'],
                       'c': ['meow', 'bark', 'um... weasel noise?',
                             'nay', 'chirp']},
                      index=range(5))

    # Unique keys on both sides: every specification is satisfied.
    merge(left, right, left_index=True, right_index=True, validate='1:1')
    merge(left, right, left_index=True, right_index=True,
          validate='one_to_one')
    merge(left, right, on='a', validate='1:1')
    merge(left, right, on='a', validate='one_to_one')

    # Dups on right (duplicated key 'e' and duplicated index label 4)
    right_w_dups = right.append(pd.DataFrame({'a': ['e'], 'c': ['moo']},
                                             index=[4]))
    merge(left, right_w_dups, left_index=True, right_index=True,
          validate='one_to_many')
    merge(left, right_w_dups, on='a', validate='1:m')

    with pytest.raises(ValueError):
        merge(left, right_w_dups, left_index=True, right_index=True,
              validate='one_to_one')

    with pytest.raises(ValueError):
        merge(left, right_w_dups, on='a', validate='one_to_one')

    with pytest.raises(ValueError):
        merge(left, right_w_dups, on='a', validate='many_to_one')

    # Dups on left (duplicated key 'a' and duplicated index label 3).
    # The extra row goes in column 'b', the column ``left`` actually has.
    left_w_dups = left.append(pd.DataFrame({'a': ['a'], 'b': ['cow']},
                                           index=[3]))
    merge(left_w_dups, right, left_index=True, right_index=True,
          validate='many_to_one')
    merge(left_w_dups, right, on='a', validate='m:1')

    with pytest.raises(ValueError):
        merge(left_w_dups, right, left_index=True, right_index=True,
              validate='one_to_one')

    with pytest.raises(ValueError):
        merge(left_w_dups, right, on='a', validate='one_to_one')

    with pytest.raises(ValueError):
        merge(left_w_dups, right, on='a', validate='one_to_many')

    # Dups on both
    merge(left_w_dups, right_w_dups, on='a', validate='many_to_many')
    merge(left_w_dups, right_w_dups, on='a', validate='m:m')

    with pytest.raises(ValueError):
        merge(left_w_dups, right_w_dups, left_index=True,
              right_index=True, validate='many_to_one')

    with pytest.raises(ValueError):
        merge(left_w_dups, right_w_dups, on='a',
              validate='one_to_many')

    # Check invalid arguments
    with pytest.raises(ValueError):
        merge(left, right, on='a', validate='jibberish')
783+
727784

728785
def _check_merge(x, y):
729786
for how in ['inner', 'left', 'outer']:

0 commit comments

Comments
 (0)