-
-
Notifications
You must be signed in to change notification settings - Fork 18.4k
ENH: merge_asof() has left_index/right_index and left_by/right_by (#14253) #14531
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 1 commit
f6f87b6
9c4f718
f3bed99
56fe4c4
e8416b1
243d8ed
2cd3f41
26cca27
acad843
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -259,7 +259,8 @@ def _merger(x, y): | |
|
||
def merge_asof(left, right, on=None, | ||
left_on=None, right_on=None, | ||
by=None, | ||
left_index=False, right_index=False, | ||
by=None, left_by=None, right_by=None, | ||
suffixes=('_x', '_y'), | ||
tolerance=None, | ||
allow_exact_matches=True): | ||
|
@@ -288,9 +289,29 @@ def merge_asof(left, right, on=None, | |
Field name to join on in left DataFrame. | ||
right_on : label | ||
Field name to join on in right DataFrame. | ||
left_index : boolean | ||
Use the index of the left DataFrame as the join key. | ||
|
||
.. versionadded:: 0.19.1 | ||
|
||
right_index : boolean | ||
Use the index of the right DataFrame as the join key. | ||
|
||
.. versionadded:: 0.19.1 | ||
|
||
by : column name | ||
Group both the left and right DataFrames by the group column; perform | ||
the merge operation on these pieces and recombine. | ||
left_by : column name | ||
Field name to group by in the left DataFrame. | ||
|
||
.. versionadded:: 0.19.1 | ||
|
||
right_by : column name | ||
Field name to group by in the right DataFrame. | ||
|
||
.. versionadded:: 0.19.1 | ||
|
||
suffixes : 2-length sequence (tuple, list, ...) | ||
Suffix to apply to overlapping column names in the left and right | ||
side, respectively | ||
|
@@ -418,7 +439,9 @@ def merge_asof(left, right, on=None, | |
""" | ||
op = _AsOfMerge(left, right, | ||
on=on, left_on=left_on, right_on=right_on, | ||
by=by, suffixes=suffixes, | ||
left_index=left_index, right_index=right_index, | ||
by=by, left_by=left_by, right_by=right_by, | ||
suffixes=suffixes, | ||
how='asof', tolerance=tolerance, | ||
allow_exact_matches=allow_exact_matches) | ||
return op.get_result() | ||
|
@@ -641,7 +664,7 @@ def _get_join_info(self): | |
left_ax = self.left._data.axes[self.axis] | ||
right_ax = self.right._data.axes[self.axis] | ||
|
||
if self.left_index and self.right_index: | ||
if self.left_index and self.right_index and self.how != 'asof': | ||
join_index, left_indexer, right_indexer = \ | ||
left_ax.join(right_ax, how=self.how, return_indexers=True) | ||
elif self.right_index and self.how == 'left': | ||
|
@@ -731,21 +754,35 @@ def _get_merge_keys(self): | |
right_keys.append(rk) | ||
join_names.append(None) # what to do? | ||
else: | ||
right_keys.append(right[rk]._values) | ||
join_names.append(rk) | ||
if rk is not None: | ||
right_keys.append(right[rk]._values) | ||
join_names.append(rk) | ||
else: | ||
# kludge for merge_asof(right_index=True) | ||
right_keys.append(right.index.values) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this is not safe as it can cause dtype conversions There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Which part? The |
||
join_names.append(right.index.name) | ||
else: | ||
if not is_rkey(rk): | ||
right_keys.append(right[rk]._values) | ||
if lk == rk: | ||
if rk is not None: | ||
right_keys.append(right[rk]._values) | ||
else: | ||
# kludge for merge_asof(right_index=True) | ||
right_keys.append(right.index.values) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this here, we don't like doing things like |
||
if lk is not None and lk == rk: | ||
# avoid key upcast in corner case (length-0) | ||
if len(left) > 0: | ||
right_drop.append(rk) | ||
else: | ||
left_drop.append(lk) | ||
else: | ||
right_keys.append(rk) | ||
left_keys.append(left[lk]._values) | ||
join_names.append(lk) | ||
if lk is not None: | ||
left_keys.append(left[lk]._values) | ||
join_names.append(lk) | ||
else: | ||
# kludge for merge_asof(left_index=True) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. why is this a kludge? any way to make this less kludgy? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is unique to In a regular I put the kludge in there to handle this case. So perhaps it isn't a kludge at all and I should just put the above explanation at the top of the function. What do you think? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ok, so its simpler to handle all of the cases here (in basic merge) then? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think for now it's simpler. It would take a massive overhaul of how generic merge works to fix this. I'll add my notes as a comment to the function and resubmit. |
||
left_keys.append(left.index.values) | ||
join_names.append(left.index.name) | ||
elif _any(self.left_on): | ||
for k in self.left_on: | ||
if is_lkey(k): | ||
|
@@ -870,13 +907,15 @@ def _get_join_indexers(left_keys, right_keys, sort=False, how='inner', | |
class _OrderedMerge(_MergeOperation): | ||
_merge_type = 'ordered_merge' | ||
|
||
def __init__(self, left, right, on=None, left_on=None, | ||
right_on=None, axis=1, | ||
def __init__(self, left, right, on=None, left_on=None, right_on=None, | ||
left_index=False, right_index=False, axis=1, | ||
suffixes=('_x', '_y'), copy=True, | ||
fill_method=None, how='outer'): | ||
|
||
self.fill_method = fill_method | ||
_MergeOperation.__init__(self, left, right, on=on, left_on=left_on, | ||
left_index=left_index, | ||
right_index=right_index, | ||
right_on=right_on, axis=axis, | ||
how=how, suffixes=suffixes, | ||
sort=True # factorize sorts | ||
|
@@ -949,43 +988,68 @@ def _get_cython_type(dtype): | |
class _AsOfMerge(_OrderedMerge): | ||
_merge_type = 'asof_merge' | ||
|
||
def __init__(self, left, right, on=None, by=None, left_on=None, | ||
right_on=None, axis=1, | ||
suffixes=('_x', '_y'), copy=True, | ||
def __init__(self, left, right, on=None, left_on=None, right_on=None, | ||
left_index=False, right_index=False, | ||
by=None, left_by=None, right_by=None, | ||
axis=1, suffixes=('_x', '_y'), copy=True, | ||
fill_method=None, | ||
how='asof', tolerance=None, | ||
allow_exact_matches=True): | ||
|
||
self.by = by | ||
self.left_by = left_by | ||
self.right_by = right_by | ||
self.tolerance = tolerance | ||
self.allow_exact_matches = allow_exact_matches | ||
|
||
_OrderedMerge.__init__(self, left, right, on=on, left_on=left_on, | ||
right_on=right_on, axis=axis, | ||
right_on=right_on, left_index=left_index, | ||
right_index=right_index, axis=axis, | ||
how=how, suffixes=suffixes, | ||
fill_method=fill_method) | ||
|
||
def _validate_specification(self): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The |
||
super(_AsOfMerge, self)._validate_specification() | ||
|
||
# we only allow on to be a single item for on | ||
if len(self.left_on) != 1: | ||
if len(self.left_on) != 1 and not self.left_index: | ||
raise MergeError("can only asof on a key for left") | ||
|
||
if len(self.right_on) != 1: | ||
if len(self.right_on) != 1 and not self.right_index: | ||
raise MergeError("can only asof on a key for right") | ||
|
||
if self.left_index and isinstance(self.left.index, MultiIndex): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can you tests for these conditions that are errors? |
||
raise MergeError("left can only have one index") | ||
|
||
if self.right_index and isinstance(self.right.index, MultiIndex): | ||
raise MergeError("right can only have one index") | ||
|
||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. need to add tests for each of these error conditions There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. These tests already exist: |
||
# set 'by' columns | ||
if self.by is not None: | ||
if self.left_by is not None or self.right_by is not None: | ||
raise MergeError('Can only pass by OR left_by ' | ||
'and right_by') | ||
self.left_by = self.right_by = self.by | ||
if self.left_by is None and self.right_by is not None: | ||
raise MergeError('missing left_by') | ||
if self.left_by is not None and self.right_by is None: | ||
raise MergeError('missing right_by') | ||
|
||
# add by to our key-list so we can have it in the | ||
# output as a key | ||
if self.by is not None: | ||
if not is_list_like(self.by): | ||
self.by = [self.by] | ||
if self.left_by is not None: | ||
if not is_list_like(self.left_by): | ||
self.left_by = [self.left_by] | ||
if not is_list_like(self.right_by): | ||
self.right_by = [self.right_by] | ||
|
||
if len(self.by) != 1: | ||
if len(self.left_by) != 1: | ||
raise MergeError("can only asof by a single key") | ||
if len(self.right_by) != 1: | ||
raise MergeError("can only asof by a single key") | ||
|
||
self.left_on = self.by + list(self.left_on) | ||
self.right_on = self.by + list(self.right_on) | ||
self.left_on = self.left_by + list(self.left_on) | ||
self.right_on = self.right_by + list(self.right_on) | ||
|
||
@property | ||
def _asof_key(self): | ||
|
@@ -1008,7 +1072,7 @@ def _get_merge_keys(self): | |
# validate tolerance; must be a Timedelta if we have a DTI | ||
if self.tolerance is not None: | ||
|
||
lt = left_join_keys[self.left_on.index(self._asof_key)] | ||
lt = left_join_keys[-1] | ||
msg = "incompatible tolerance, must be compat " \ | ||
"with type {0}".format(type(lt)) | ||
|
||
|
@@ -1038,8 +1102,10 @@ def _get_join_indexers(self): | |
""" return the join indexers """ | ||
|
||
# values to compare | ||
left_values = self.left_join_keys[-1] | ||
right_values = self.right_join_keys[-1] | ||
left_values = (self.left.index.values if self.left_index else | ||
self.left_join_keys[-1]) | ||
right_values = (self.right.index.values if self.right_index else | ||
self.right_join_keys[-1]) | ||
tolerance = self.tolerance | ||
|
||
# we required sortedness in the join keys | ||
|
@@ -1057,7 +1123,7 @@ def _get_join_indexers(self): | |
tolerance = tolerance.value | ||
|
||
# a "by" parameter requires special handling | ||
if self.by is not None: | ||
if self.left_by is not None: | ||
left_by_values = self.left_join_keys[0] | ||
right_by_values = self.right_join_keys[0] | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -117,6 +117,75 @@ def test_basic_categorical(self): | |
by='ticker') | ||
assert_frame_equal(result, expected) | ||
|
||
def test_basic_left_index(self): | ||
|
||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can you add the github issue as a comment |
||
expected = self.asof | ||
trades = self.trades.set_index('time') | ||
quotes = self.quotes | ||
|
||
result = merge_asof(trades, quotes, | ||
left_index=True, | ||
right_on='time', | ||
by='ticker') | ||
# left-only index uses right's index, oddly | ||
expected.index = result.index | ||
# time column appears after left's columns | ||
expected = expected[result.columns] | ||
assert_frame_equal(result, expected) | ||
|
||
def test_basic_right_index(self): | ||
|
||
expected = self.asof | ||
trades = self.trades | ||
quotes = self.quotes.set_index('time') | ||
|
||
result = merge_asof(trades, quotes, | ||
left_on='time', | ||
right_index=True, | ||
by='ticker') | ||
assert_frame_equal(result, expected) | ||
|
||
def test_basic_left_index_right_index(self): | ||
|
||
expected = self.asof.set_index('time') | ||
trades = self.trades.set_index('time') | ||
quotes = self.quotes.set_index('time') | ||
|
||
result = merge_asof(trades, quotes, | ||
left_index=True, | ||
right_index=True, | ||
by='ticker') | ||
assert_frame_equal(result, expected) | ||
|
||
def test_multi_index(self): | ||
|
||
# MultiIndex is prohibited | ||
trades = self.trades.set_index(['time', 'price']) | ||
quotes = self.quotes.set_index('time') | ||
with self.assertRaises(MergeError): | ||
merge_asof(trades, quotes, | ||
left_index=True, | ||
right_index=True) | ||
|
||
trades = self.trades.set_index('time') | ||
quotes = self.quotes.set_index(['time', 'bid']) | ||
with self.assertRaises(MergeError): | ||
merge_asof(trades, quotes, | ||
left_index=True, | ||
right_index=True) | ||
|
||
def test_basic_left_by_right_by(self): | ||
|
||
expected = self.asof | ||
trades = self.trades | ||
quotes = self.quotes | ||
|
||
result = merge_asof(trades, quotes, | ||
on='time', | ||
left_by='ticker', | ||
right_by='ticker') | ||
assert_frame_equal(result, expected) | ||
|
||
def test_missing_right_by(self): | ||
|
||
expected = self.asof | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
can you add some examples of using these parameters?