-
-
Notifications
You must be signed in to change notification settings - Fork 18.4k
ENH: add validate argument to merge #16275
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -46,11 +46,13 @@ | |
@Appender(_merge_doc, indents=0) | ||
def merge(left, right, how='inner', on=None, left_on=None, right_on=None, | ||
left_index=False, right_index=False, sort=False, | ||
suffixes=('_x', '_y'), copy=True, indicator=False): | ||
suffixes=('_x', '_y'), copy=True, indicator=False, | ||
validate=None): | ||
op = _MergeOperation(left, right, how=how, on=on, left_on=left_on, | ||
right_on=right_on, left_index=left_index, | ||
right_index=right_index, sort=sort, suffixes=suffixes, | ||
copy=copy, indicator=indicator) | ||
copy=copy, indicator=indicator, | ||
validate=validate) | ||
return op.get_result() | ||
|
||
|
||
|
@@ -341,6 +343,7 @@ def merge_asof(left, right, on=None, | |
|
||
.. versionadded:: 0.20.0 | ||
|
||
|
||
Returns | ||
------- | ||
merged : DataFrame | ||
|
@@ -504,7 +507,8 @@ class _MergeOperation(object): | |
def __init__(self, left, right, how='inner', on=None, | ||
left_on=None, right_on=None, axis=1, | ||
left_index=False, right_index=False, sort=True, | ||
suffixes=('_x', '_y'), copy=True, indicator=False): | ||
suffixes=('_x', '_y'), copy=True, indicator=False, | ||
validate=None): | ||
self.left = self.orig_left = left | ||
self.right = self.orig_right = right | ||
self.how = how | ||
|
@@ -567,6 +571,12 @@ def __init__(self, left, right, how='inner', on=None, | |
# to avoid incompat dtypes | ||
self._maybe_coerce_merge_keys() | ||
|
||
# If argument passed to validate, | ||
# check if columns specified as unique | ||
# are in fact unique. | ||
if validate is not None: | ||
self._validate(validate) | ||
|
||
def get_result(self): | ||
if self.indicator: | ||
self.left, self.right = self._indicator_pre_merge( | ||
|
@@ -958,6 +968,49 @@ def _validate_specification(self): | |
if len(self.right_on) != len(self.left_on): | ||
raise ValueError("len(right_on) must equal len(left_on)") | ||
|
||
def _validate(self, validate): | ||
|
||
# Check uniqueness of each | ||
if self.left_index: | ||
left_unique = self.orig_left.index.is_unique | ||
else: | ||
left_unique = MultiIndex.from_arrays(self.left_join_keys | ||
).is_unique | ||
|
||
if self.right_index: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. same |
||
right_unique = self.orig_right.index.is_unique | ||
else: | ||
right_unique = MultiIndex.from_arrays(self.right_join_keys | ||
).is_unique | ||
|
||
# Check data integrity | ||
if validate in ["one_to_one", "1:1"]: | ||
if not left_unique and not right_unique: | ||
raise ValueError("Merge keys are not unique in either left" | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. we might want to raise a MergeError (inherits from ValueError) instead here (as this is what we do for the other operations). Have a look and see if this is inconsistent There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Converted. I think that's fair. |
||
" or right dataset; not a one-to-one merge") | ||
elif not left_unique: | ||
raise ValueError("Merge keys are not unique in left dataset;" | ||
" not a one-to-one merge") | ||
elif not right_unique: | ||
raise ValueError("Merge keys are not unique in right dataset;" | ||
" not a one-to-one merge") | ||
|
||
elif validate in ["one_to_many", "1:m"]: | ||
if not left_unique: | ||
raise ValueError("Merge keys are not unique in left dataset;" | ||
"not a one-to-many merge") | ||
|
||
elif validate in ["many_to_one", "m:1"]: | ||
if not right_unique: | ||
raise ValueError("Merge keys are not unique in right dataset;" | ||
" not a many-to-one merge") | ||
|
||
elif validate in ['many_to_many', 'm:m']: | ||
pass | ||
|
||
else: | ||
raise ValueError("Not a valid argument for validate") | ||
|
||
|
||
def _get_join_indexers(left_keys, right_keys, sort=False, how='inner', | ||
**kwargs): | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -724,6 +724,130 @@ def test_indicator(self): | |
how='outer', indicator=True) | ||
assert_frame_equal(test5, hand_coded_result) | ||
|
||
def test_validation(self): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. rather than repeating tests I think pull these out to another test file, There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The only issue I see is that there are requirements for There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. maybe just do some simple validation then (and leave them where they are). having completely duplicated tests is not that useful. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ok. Added some simple validations to |
||
left = DataFrame({'a': ['a', 'b', 'c', 'd'], | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. add tests for There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. added |
||
'b': ['cat', 'dog', 'weasel', 'horse']}, | ||
index=range(4)) | ||
|
||
right = DataFrame({'a': ['a', 'b', 'c', 'd', 'e'], | ||
'c': ['meow', 'bark', 'um... weasel noise?', | ||
'nay', 'chirp']}, | ||
index=range(5)) | ||
|
||
# Make sure no side effects. | ||
left_copy = left.copy() | ||
right_copy = right.copy() | ||
|
||
result = merge(left, right, left_index=True, right_index=True, | ||
validate='1:1') | ||
assert_frame_equal(left, left_copy) | ||
assert_frame_equal(right, right_copy) | ||
|
||
# make sure merge still correct | ||
expected = DataFrame({'a_x': ['a', 'b', 'c', 'd'], | ||
'b': ['cat', 'dog', 'weasel', 'horse'], | ||
'a_y': ['a', 'b', 'c', 'd'], | ||
'c': ['meow', 'bark', 'um... weasel noise?', | ||
'nay']}, | ||
index=range(4), | ||
columns=['a_x', 'b', 'a_y', 'c']) | ||
|
||
result = merge(left, right, left_index=True, right_index=True, | ||
validate='one_to_one') | ||
assert_frame_equal(result, expected) | ||
|
||
expected_2 = DataFrame({'a': ['a', 'b', 'c', 'd'], | ||
'b': ['cat', 'dog', 'weasel', 'horse'], | ||
'c': ['meow', 'bark', 'um... weasel noise?', | ||
'nay']}, | ||
index=range(4)) | ||
|
||
result = merge(left, right, on='a', validate='1:1') | ||
assert_frame_equal(left, left_copy) | ||
assert_frame_equal(right, right_copy) | ||
assert_frame_equal(result, expected_2) | ||
|
||
result = merge(left, right, on='a', validate='one_to_one') | ||
assert_frame_equal(result, expected_2) | ||
|
||
# One index, one column | ||
expected_3 = DataFrame({'b': ['cat', 'dog', 'weasel', 'horse'], | ||
'a': ['a', 'b', 'c', 'd'], | ||
'c': ['meow', 'bark', 'um... weasel noise?', | ||
'nay']}, | ||
columns=['b', 'a', 'c'], | ||
index=range(4)) | ||
|
||
left_index_reset = left.set_index('a') | ||
result = merge(left_index_reset, right, left_index=True, | ||
right_on='a', validate='one_to_one') | ||
assert_frame_equal(result, expected_3) | ||
|
||
# Dups on right | ||
right_w_dups = right.append(pd.DataFrame({'a': ['e'], 'c': ['moo']}, | ||
index=[4])) | ||
merge(left, right_w_dups, left_index=True, right_index=True, | ||
validate='one_to_many') | ||
|
||
with pytest.raises(ValueError): | ||
merge(left, right_w_dups, left_index=True, right_index=True, | ||
validate='one_to_one') | ||
|
||
with pytest.raises(ValueError): | ||
merge(left, right_w_dups, on='a', validate='one_to_one') | ||
|
||
# Dups on left | ||
left_w_dups = left.append(pd.DataFrame({'a': ['a'], 'c': ['cow']}, | ||
index=[3])) | ||
merge(left_w_dups, right, left_index=True, right_index=True, | ||
validate='many_to_one') | ||
|
||
with pytest.raises(ValueError): | ||
merge(left_w_dups, right, left_index=True, right_index=True, | ||
validate='one_to_one') | ||
|
||
with pytest.raises(ValueError): | ||
merge(left_w_dups, right, on='a', validate='one_to_one') | ||
|
||
# Dups on both | ||
merge(left_w_dups, right_w_dups, on='a', validate='many_to_many') | ||
|
||
with pytest.raises(ValueError): | ||
merge(left_w_dups, right_w_dups, left_index=True, | ||
right_index=True, validate='many_to_one') | ||
|
||
with pytest.raises(ValueError): | ||
merge(left_w_dups, right_w_dups, on='a', | ||
validate='one_to_many') | ||
|
||
# Check invalid arguments | ||
with pytest.raises(ValueError): | ||
merge(left, right, on='a', validate='jibberish') | ||
|
||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can you also add tests for when merging on multiple columns (and maybe also one test where combining eg left_index and right_on, but not needed to that for all of them) |
||
# Two column merge, dups in both, but jointly no dups. | ||
left = DataFrame({'a': ['a', 'a', 'b', 'b'], | ||
'b': [0, 1, 0, 1], | ||
'c': ['cat', 'dog', 'weasel', 'horse']}, | ||
index=range(4)) | ||
|
||
right = DataFrame({'a': ['a', 'a', 'b'], | ||
'b': [0, 1, 0], | ||
'd': ['meow', 'bark', 'um... weasel noise?']}, | ||
index=range(3)) | ||
|
||
expected_multi = DataFrame({'a': ['a', 'a', 'b'], | ||
'b': [0, 1, 0], | ||
'c': ['cat', 'dog', 'weasel'], | ||
'd': ['meow', 'bark', | ||
'um... weasel noise?']}, | ||
index=range(3)) | ||
|
||
with pytest.raises(ValueError): | ||
merge(left, right, on='a', validate='1:1') | ||
|
||
result = merge(left, right, on=['a', 'b'], validate='1:1') | ||
assert_frame_equal(result, expected_multi) | ||
|
||
|
||
def _check_merge(x, y): | ||
for how in ['inner', 'left', 'outer']: | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
not self.orig_left.index.is_unique