Skip to content

ENH: merge_asof() has left_index/right_index and left_by/right_by (#14253) #14531

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 9 commits into from
Dec 14, 2016
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v0.19.1.txt
Original file line number Diff line number Diff line change
Expand Up @@ -58,4 +58,4 @@ Bug Fixes
- Bug in ``df.groupby`` causing an ``AttributeError`` when grouping a single index frame by a column and the index level (:issue:`14327`)
- Bug in ``df.groupby`` where ``TypeError`` raised when ``pd.Grouper(key=...)`` is passed in a list (:issue:`14334`)
- Bug in ``pd.pivot_table`` may raise ``TypeError`` or ``ValueError`` when ``index`` or ``columns``
is not scalar and ``values`` is not specified (:issue:`14380`)
is not scalar and ``values`` is not specified (:issue:`14380`)
9 changes: 9 additions & 0 deletions doc/source/whatsnew/v0.19.2.txt
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,15 @@ Performance Improvements
~~~~~~~~~~~~~~~~~~~~~~~~


.. _whatsnew_0192.enhancements.other:

Other enhancements
^^^^^^^^^^^^^^^^^^

- ``pd.merge_asof()`` gained ``left_index``/``right_index`` and ``left_by``/``right_by`` arguments (:issue:`14253`)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can take -> gained, and say arguments at the end



.. _whatsnew_0192.bug_fixes:

Bug Fixes
Expand Down
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v0.20.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ Highlights include:

Check the :ref:`API Changes <whatsnew_0200.api_breaking>` and :ref:`deprecations <whatsnew_0200.deprecations>` before updating.

.. contents:: What's new in v0.19.0
.. contents:: What's new in v0.20.0
:local:
:backlinks: none

Expand Down
152 changes: 125 additions & 27 deletions pandas/tools/merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -259,7 +259,8 @@ def _merger(x, y):

def merge_asof(left, right, on=None,
left_on=None, right_on=None,
by=None,
left_index=False, right_index=False,
by=None, left_by=None, right_by=None,
suffixes=('_x', '_y'),
tolerance=None,
allow_exact_matches=True):
Expand Down Expand Up @@ -288,9 +289,29 @@ def merge_asof(left, right, on=None,
Field name to join on in left DataFrame.
right_on : label
Field name to join on in right DataFrame.
left_index : boolean
Use the index of the left DataFrame as the join key.

.. versionadded:: 0.19.2

right_index : boolean
Use the index of the right DataFrame as the join key.

.. versionadded:: 0.19.2

by : column name
Group both the left and right DataFrames by the group column; perform
the merge operation on these pieces and recombine.
left_by : column name
Field name to group by in the left DataFrame.

.. versionadded:: 0.19.2

right_by : column name
Field name to group by in the right DataFrame.

.. versionadded:: 0.19.2

suffixes : 2-length sequence (tuple, list, ...)
Suffix to apply to overlapping column names in the left and right
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you add some examples of using these parameters?

side, respectively
Expand Down Expand Up @@ -348,6 +369,28 @@ def merge_asof(left, right, on=None,
3 5 b 3.0
6 10 c 7.0

We can use indexed DataFrames as well.

>>> left
left_val
1 a
5 b
10 c

>>> right
right_val
1 1
2 2
3 3
6 6
7 7

>>> pd.merge_asof(left, right, left_index=True, right_index=True)
left_val right_val
1 a 1
5 b 3
10 c 7

Here is a real-world time-series example

>>> quotes
Expand Down Expand Up @@ -418,7 +461,9 @@ def merge_asof(left, right, on=None,
"""
op = _AsOfMerge(left, right,
on=on, left_on=left_on, right_on=right_on,
by=by, suffixes=suffixes,
left_index=left_index, right_index=right_index,
by=by, left_by=left_by, right_by=right_by,
suffixes=suffixes,
how='asof', tolerance=tolerance,
allow_exact_matches=allow_exact_matches)
return op.get_result()
Expand Down Expand Up @@ -650,7 +695,7 @@ def _get_join_info(self):
left_ax = self.left._data.axes[self.axis]
right_ax = self.right._data.axes[self.axis]

if self.left_index and self.right_index:
if self.left_index and self.right_index and self.how != 'asof':
join_index, left_indexer, right_indexer = \
left_ax.join(right_ax, how=self.how, return_indexers=True)
elif self.right_index and self.how == 'left':
Expand Down Expand Up @@ -731,6 +776,16 @@ def _get_merge_keys(self):
is_rkey = lambda x: isinstance(
x, (np.ndarray, ABCSeries)) and len(x) == len(right)

# Note that pd.merge_asof() has separate 'on' and 'by' parameters. A
# user could, for example, request 'left_index' and 'left_by'. In a
# regular pd.merge(), users cannot specify both 'left_index' and
# 'left_on'. (Instead, users have a MultiIndex). That means the
# self.left_on in this function is always empty in a pd.merge(), but
# a pd.merge_asof(left_index=True, left_by=...) will result in a
# self.left_on array with a None in the middle of it. This requires
# a work-around as designated in the code below.
# See _validate_specification() for where this happens.

# ugh, spaghetti re #733
if _any(self.left_on) and _any(self.right_on):
for lk, rk in zip(self.left_on, self.right_on):
Expand All @@ -740,21 +795,35 @@ def _get_merge_keys(self):
right_keys.append(rk)
join_names.append(None) # what to do?
else:
right_keys.append(right[rk]._values)
join_names.append(rk)
if rk is not None:
right_keys.append(right[rk]._values)
join_names.append(rk)
else:
# work-around for merge_asof(right_index=True)
right_keys.append(right.index)
join_names.append(right.index.name)
else:
if not is_rkey(rk):
right_keys.append(right[rk]._values)
if lk == rk:
if rk is not None:
right_keys.append(right[rk]._values)
else:
# work-around for merge_asof(right_index=True)
right_keys.append(right.index)
if lk is not None and lk == rk:
# avoid key upcast in corner case (length-0)
if len(left) > 0:
right_drop.append(rk)
else:
left_drop.append(lk)
else:
right_keys.append(rk)
left_keys.append(left[lk]._values)
join_names.append(lk)
if lk is not None:
left_keys.append(left[lk]._values)
join_names.append(lk)
else:
# work-around for merge_asof(left_index=True)
left_keys.append(left.index)
join_names.append(left.index.name)
elif _any(self.left_on):
for k in self.left_on:
if is_lkey(k):
Expand Down Expand Up @@ -879,13 +948,15 @@ def _get_join_indexers(left_keys, right_keys, sort=False, how='inner',
class _OrderedMerge(_MergeOperation):
_merge_type = 'ordered_merge'

def __init__(self, left, right, on=None, left_on=None,
right_on=None, axis=1,
def __init__(self, left, right, on=None, left_on=None, right_on=None,
left_index=False, right_index=False, axis=1,
suffixes=('_x', '_y'), copy=True,
fill_method=None, how='outer'):

self.fill_method = fill_method
_MergeOperation.__init__(self, left, right, on=on, left_on=left_on,
left_index=left_index,
right_index=right_index,
right_on=right_on, axis=axis,
how=how, suffixes=suffixes,
sort=True # factorize sorts
Expand Down Expand Up @@ -958,43 +1029,68 @@ def _get_cython_type(dtype):
class _AsOfMerge(_OrderedMerge):
_merge_type = 'asof_merge'

def __init__(self, left, right, on=None, by=None, left_on=None,
right_on=None, axis=1,
suffixes=('_x', '_y'), copy=True,
def __init__(self, left, right, on=None, left_on=None, right_on=None,
left_index=False, right_index=False,
by=None, left_by=None, right_by=None,
axis=1, suffixes=('_x', '_y'), copy=True,
fill_method=None,
how='asof', tolerance=None,
allow_exact_matches=True):

self.by = by
self.left_by = left_by
self.right_by = right_by
self.tolerance = tolerance
self.allow_exact_matches = allow_exact_matches

_OrderedMerge.__init__(self, left, right, on=on, left_on=left_on,
right_on=right_on, axis=axis,
right_on=right_on, left_index=left_index,
right_index=right_index, axis=axis,
how=how, suffixes=suffixes,
fill_method=fill_method)

def _validate_specification(self):
super(_AsOfMerge, self)._validate_specification()

# we only allow on to be a single item for on
if len(self.left_on) != 1:
if len(self.left_on) != 1 and not self.left_index:
raise MergeError("can only asof on a key for left")

if len(self.right_on) != 1:
if len(self.right_on) != 1 and not self.right_index:
raise MergeError("can only asof on a key for right")

if self.left_index and isinstance(self.left.index, MultiIndex):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you add tests for these conditions that are errors?

raise MergeError("left can only have one index")

if self.right_index and isinstance(self.right.index, MultiIndex):
raise MergeError("right can only have one index")

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

need to add tests for each of these error conditions

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

# set 'by' columns
if self.by is not None:
if self.left_by is not None or self.right_by is not None:
raise MergeError('Can only pass by OR left_by '
'and right_by')
self.left_by = self.right_by = self.by
if self.left_by is None and self.right_by is not None:
raise MergeError('missing left_by')
if self.left_by is not None and self.right_by is None:
raise MergeError('missing right_by')

# add by to our key-list so we can have it in the
# output as a key
if self.by is not None:
if not is_list_like(self.by):
self.by = [self.by]
if self.left_by is not None:
if not is_list_like(self.left_by):
self.left_by = [self.left_by]
if not is_list_like(self.right_by):
self.right_by = [self.right_by]

if len(self.by) != 1:
if len(self.left_by) != 1:
raise MergeError("can only asof by a single key")
if len(self.right_by) != 1:
raise MergeError("can only asof by a single key")

self.left_on = self.by + list(self.left_on)
self.right_on = self.by + list(self.right_on)
self.left_on = self.left_by + list(self.left_on)
self.right_on = self.right_by + list(self.right_on)

@property
def _asof_key(self):
Expand All @@ -1017,7 +1113,7 @@ def _get_merge_keys(self):
# validate tolerance; must be a Timedelta if we have a DTI
if self.tolerance is not None:

lt = left_join_keys[self.left_on.index(self._asof_key)]
lt = left_join_keys[-1]
msg = "incompatible tolerance, must be compat " \
"with type {0}".format(type(lt))

Expand Down Expand Up @@ -1047,8 +1143,10 @@ def _get_join_indexers(self):
""" return the join indexers """

# values to compare
left_values = self.left_join_keys[-1]
right_values = self.right_join_keys[-1]
left_values = (self.left.index.values if self.left_index else
self.left_join_keys[-1])
right_values = (self.right.index.values if self.right_index else
self.right_join_keys[-1])
tolerance = self.tolerance

# we required sortedness in the join keys
Expand All @@ -1066,7 +1164,7 @@ def _get_join_indexers(self):
tolerance = tolerance.value

# a "by" parameter requires special handling
if self.by is not None:
if self.left_by is not None:
left_by_values = self.left_join_keys[0]
right_by_values = self.right_join_keys[0]

Expand Down
Loading