-
-
Notifications
You must be signed in to change notification settings - Fork 18.4k
ENH: merge_asof() has left_index/right_index and left_by/right_by (#14253) #14531
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
f6f87b6
9c4f718
f3bed99
56fe4c4
e8416b1
243d8ed
2cd3f41
26cca27
acad843
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -259,7 +259,8 @@ def _merger(x, y): | |
|
||
def merge_asof(left, right, on=None, | ||
left_on=None, right_on=None, | ||
by=None, | ||
left_index=False, right_index=False, | ||
by=None, left_by=None, right_by=None, | ||
suffixes=('_x', '_y'), | ||
tolerance=None, | ||
allow_exact_matches=True): | ||
|
@@ -288,9 +289,29 @@ def merge_asof(left, right, on=None, | |
Field name to join on in left DataFrame. | ||
right_on : label | ||
Field name to join on in right DataFrame. | ||
left_index : boolean | ||
Use the index of the left DataFrame as the join key. | ||
|
||
.. versionadded:: 0.19.2 | ||
|
||
right_index : boolean | ||
Use the index of the right DataFrame as the join key. | ||
|
||
.. versionadded:: 0.19.2 | ||
|
||
by : column name | ||
Group both the left and right DataFrames by the group column; perform | ||
the merge operation on these pieces and recombine. | ||
left_by : column name | ||
Field name to group by in the left DataFrame. | ||
|
||
.. versionadded:: 0.19.2 | ||
|
||
right_by : column name | ||
Field name to group by in the right DataFrame. | ||
|
||
.. versionadded:: 0.19.2 | ||
|
||
suffixes : 2-length sequence (tuple, list, ...) | ||
Suffix to apply to overlapping column names in the left and right | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can you add some examples of using these parameters? |
||
side, respectively | ||
|
@@ -348,6 +369,28 @@ def merge_asof(left, right, on=None, | |
3 5 b 3.0 | ||
6 10 c 7.0 | ||
|
||
We can use indexed DataFrames as well. | ||
|
||
>>> left | ||
left_val | ||
1 a | ||
5 b | ||
10 c | ||
|
||
>>> right | ||
right_val | ||
1 1 | ||
2 2 | ||
3 3 | ||
6 6 | ||
7 7 | ||
|
||
>>> pd.merge_asof(left, right, left_index=True, right_index=True) | ||
left_val right_val | ||
1 a 1 | ||
5 b 3 | ||
10 c 7 | ||
|
||
Here is a real-world times-series example | ||
|
||
>>> quotes | ||
|
@@ -418,7 +461,9 @@ def merge_asof(left, right, on=None, | |
""" | ||
op = _AsOfMerge(left, right, | ||
on=on, left_on=left_on, right_on=right_on, | ||
by=by, suffixes=suffixes, | ||
left_index=left_index, right_index=right_index, | ||
by=by, left_by=left_by, right_by=right_by, | ||
suffixes=suffixes, | ||
how='asof', tolerance=tolerance, | ||
allow_exact_matches=allow_exact_matches) | ||
return op.get_result() | ||
|
@@ -650,7 +695,7 @@ def _get_join_info(self): | |
left_ax = self.left._data.axes[self.axis] | ||
right_ax = self.right._data.axes[self.axis] | ||
|
||
if self.left_index and self.right_index: | ||
if self.left_index and self.right_index and self.how != 'asof': | ||
join_index, left_indexer, right_indexer = \ | ||
left_ax.join(right_ax, how=self.how, return_indexers=True) | ||
elif self.right_index and self.how == 'left': | ||
|
@@ -731,6 +776,16 @@ def _get_merge_keys(self): | |
is_rkey = lambda x: isinstance( | ||
x, (np.ndarray, ABCSeries)) and len(x) == len(right) | ||
|
||
# Note that pd.merge_asof() has separate 'on' and 'by' parameters. A | ||
# user could, for example, request 'left_index' and 'left_by'. In a | ||
# regular pd.merge(), users cannot specify both 'left_index' and | ||
# 'left_on'. (Instead, users have a MultiIndex). That means the | ||
# self.left_on in this function is always empty in a pd.merge(), but | ||
# a pd.merge_asof(left_index=True, left_by=...) will result in a | ||
# self.left_on array with a None in the middle of it. This requires | ||
# a work-around as designated in the code below. | ||
# See _validate_specification() for where this happens. | ||
|
||
# ugh, spaghetti re #733 | ||
if _any(self.left_on) and _any(self.right_on): | ||
for lk, rk in zip(self.left_on, self.right_on): | ||
|
@@ -740,21 +795,35 @@ def _get_merge_keys(self): | |
right_keys.append(rk) | ||
join_names.append(None) # what to do? | ||
else: | ||
right_keys.append(right[rk]._values) | ||
join_names.append(rk) | ||
if rk is not None: | ||
right_keys.append(right[rk]._values) | ||
join_names.append(rk) | ||
else: | ||
# work-around for merge_asof(right_index=True) | ||
right_keys.append(right.index) | ||
join_names.append(right.index.name) | ||
else: | ||
if not is_rkey(rk): | ||
right_keys.append(right[rk]._values) | ||
if lk == rk: | ||
if rk is not None: | ||
right_keys.append(right[rk]._values) | ||
else: | ||
# work-around for merge_asof(right_index=True) | ||
right_keys.append(right.index) | ||
if lk is not None and lk == rk: | ||
# avoid key upcast in corner case (length-0) | ||
if len(left) > 0: | ||
right_drop.append(rk) | ||
else: | ||
left_drop.append(lk) | ||
else: | ||
right_keys.append(rk) | ||
left_keys.append(left[lk]._values) | ||
join_names.append(lk) | ||
if lk is not None: | ||
left_keys.append(left[lk]._values) | ||
join_names.append(lk) | ||
else: | ||
# work-around for merge_asof(left_index=True) | ||
left_keys.append(left.index) | ||
join_names.append(left.index.name) | ||
elif _any(self.left_on): | ||
for k in self.left_on: | ||
if is_lkey(k): | ||
|
@@ -879,13 +948,15 @@ def _get_join_indexers(left_keys, right_keys, sort=False, how='inner', | |
class _OrderedMerge(_MergeOperation): | ||
_merge_type = 'ordered_merge' | ||
|
||
def __init__(self, left, right, on=None, left_on=None, | ||
right_on=None, axis=1, | ||
def __init__(self, left, right, on=None, left_on=None, right_on=None, | ||
left_index=False, right_index=False, axis=1, | ||
suffixes=('_x', '_y'), copy=True, | ||
fill_method=None, how='outer'): | ||
|
||
self.fill_method = fill_method | ||
_MergeOperation.__init__(self, left, right, on=on, left_on=left_on, | ||
left_index=left_index, | ||
right_index=right_index, | ||
right_on=right_on, axis=axis, | ||
how=how, suffixes=suffixes, | ||
sort=True # factorize sorts | ||
|
@@ -958,43 +1029,68 @@ def _get_cython_type(dtype): | |
class _AsOfMerge(_OrderedMerge): | ||
_merge_type = 'asof_merge' | ||
|
||
def __init__(self, left, right, on=None, by=None, left_on=None, | ||
right_on=None, axis=1, | ||
suffixes=('_x', '_y'), copy=True, | ||
def __init__(self, left, right, on=None, left_on=None, right_on=None, | ||
left_index=False, right_index=False, | ||
by=None, left_by=None, right_by=None, | ||
axis=1, suffixes=('_x', '_y'), copy=True, | ||
fill_method=None, | ||
how='asof', tolerance=None, | ||
allow_exact_matches=True): | ||
|
||
self.by = by | ||
self.left_by = left_by | ||
self.right_by = right_by | ||
self.tolerance = tolerance | ||
self.allow_exact_matches = allow_exact_matches | ||
|
||
_OrderedMerge.__init__(self, left, right, on=on, left_on=left_on, | ||
right_on=right_on, axis=axis, | ||
right_on=right_on, left_index=left_index, | ||
right_index=right_index, axis=axis, | ||
how=how, suffixes=suffixes, | ||
fill_method=fill_method) | ||
|
||
def _validate_specification(self): | ||
super(_AsOfMerge, self)._validate_specification() | ||
|
||
# we only allow on to be a single item for on | ||
if len(self.left_on) != 1: | ||
if len(self.left_on) != 1 and not self.left_index: | ||
raise MergeError("can only asof on a key for left") | ||
|
||
if len(self.right_on) != 1: | ||
if len(self.right_on) != 1 and not self.right_index: | ||
raise MergeError("can only asof on a key for right") | ||
|
||
if self.left_index and isinstance(self.left.index, MultiIndex): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can you tests for these conditions that are errors? |
||
raise MergeError("left can only have one index") | ||
|
||
if self.right_index and isinstance(self.right.index, MultiIndex): | ||
raise MergeError("right can only have one index") | ||
|
||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. need to add tests for each of these error conditions There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. These tests already exist: |
||
# set 'by' columns | ||
if self.by is not None: | ||
if self.left_by is not None or self.right_by is not None: | ||
raise MergeError('Can only pass by OR left_by ' | ||
'and right_by') | ||
self.left_by = self.right_by = self.by | ||
if self.left_by is None and self.right_by is not None: | ||
raise MergeError('missing left_by') | ||
if self.left_by is not None and self.right_by is None: | ||
raise MergeError('missing right_by') | ||
|
||
# add by to our key-list so we can have it in the | ||
# output as a key | ||
if self.by is not None: | ||
if not is_list_like(self.by): | ||
self.by = [self.by] | ||
if self.left_by is not None: | ||
if not is_list_like(self.left_by): | ||
self.left_by = [self.left_by] | ||
if not is_list_like(self.right_by): | ||
self.right_by = [self.right_by] | ||
|
||
if len(self.by) != 1: | ||
if len(self.left_by) != 1: | ||
raise MergeError("can only asof by a single key") | ||
if len(self.right_by) != 1: | ||
raise MergeError("can only asof by a single key") | ||
|
||
self.left_on = self.by + list(self.left_on) | ||
self.right_on = self.by + list(self.right_on) | ||
self.left_on = self.left_by + list(self.left_on) | ||
self.right_on = self.right_by + list(self.right_on) | ||
|
||
@property | ||
def _asof_key(self): | ||
|
@@ -1017,7 +1113,7 @@ def _get_merge_keys(self): | |
# validate tolerance; must be a Timedelta if we have a DTI | ||
if self.tolerance is not None: | ||
|
||
lt = left_join_keys[self.left_on.index(self._asof_key)] | ||
lt = left_join_keys[-1] | ||
msg = "incompatible tolerance, must be compat " \ | ||
"with type {0}".format(type(lt)) | ||
|
||
|
@@ -1047,8 +1143,10 @@ def _get_join_indexers(self): | |
""" return the join indexers """ | ||
|
||
# values to compare | ||
left_values = self.left_join_keys[-1] | ||
right_values = self.right_join_keys[-1] | ||
left_values = (self.left.index.values if self.left_index else | ||
self.left_join_keys[-1]) | ||
right_values = (self.right.index.values if self.right_index else | ||
self.right_join_keys[-1]) | ||
tolerance = self.tolerance | ||
|
||
# we required sortedness in the join keys | ||
|
@@ -1066,7 +1164,7 @@ def _get_join_indexers(self): | |
tolerance = tolerance.value | ||
|
||
# a "by" parameter requires special handling | ||
if self.by is not None: | ||
if self.left_by is not None: | ||
left_by_values = self.left_join_keys[0] | ||
right_by_values = self.right_join_keys[0] | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
can take -> gained, and say arguments at the end