diff --git a/doc/source/whatsnew/v0.19.1.txt b/doc/source/whatsnew/v0.19.1.txt index db5bd22393e64..545b4380d9b75 100644 --- a/doc/source/whatsnew/v0.19.1.txt +++ b/doc/source/whatsnew/v0.19.1.txt @@ -58,4 +58,4 @@ Bug Fixes - Bug in ``df.groupby`` causing an ``AttributeError`` when grouping a single index frame by a column and the index level (:issue:`14327`) - Bug in ``df.groupby`` where ``TypeError`` raised when ``pd.Grouper(key=...)`` is passed in a list (:issue:`14334`) - Bug in ``pd.pivot_table`` may raise ``TypeError`` or ``ValueError`` when ``index`` or ``columns`` - is not scalar and ``values`` is not specified (:issue:`14380`) \ No newline at end of file + is not scalar and ``values`` is not specified (:issue:`14380`) diff --git a/doc/source/whatsnew/v0.19.2.txt b/doc/source/whatsnew/v0.19.2.txt index dc11dd17bfdd7..dc1b9e607dec6 100644 --- a/doc/source/whatsnew/v0.19.2.txt +++ b/doc/source/whatsnew/v0.19.2.txt @@ -18,6 +18,15 @@ Performance Improvements ~~~~~~~~~~~~~~~~~~~~~~~~ +.. _whatsnew_0192.enhancements.other: + +Other enhancements +^^^^^^^^^^^^^^^^^^ + +- ``pd.merge_asof()`` gained ``left_index``/``right_index`` and ``left_by``/``right_by`` arguments (:issue:`14253`) + + + .. _whatsnew_0192.bug_fixes: Bug Fixes diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 660300e1814e8..d0e6781fd6e42 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -12,7 +12,7 @@ Highlights include: Check the :ref:`API Changes <whatsnew_0200.api_breaking>` and :ref:`deprecations <whatsnew_0200.deprecations>` before updating. -.. contents:: What's new in v0.19.0 +.. 
contents:: What's new in v0.20.0 :local: :backlinks: none diff --git a/pandas/tools/merge.py b/pandas/tools/merge.py index d2060185c3246..07c3ae7005783 100644 --- a/pandas/tools/merge.py +++ b/pandas/tools/merge.py @@ -259,7 +259,8 @@ def _merger(x, y): def merge_asof(left, right, on=None, left_on=None, right_on=None, - by=None, + left_index=False, right_index=False, + by=None, left_by=None, right_by=None, suffixes=('_x', '_y'), tolerance=None, allow_exact_matches=True): @@ -288,9 +289,29 @@ def merge_asof(left, right, on=None, Field name to join on in left DataFrame. right_on : label Field name to join on in right DataFrame. + left_index : boolean + Use the index of the left DataFrame as the join key. + + .. versionadded:: 0.19.2 + + right_index : boolean + Use the index of the right DataFrame as the join key. + + .. versionadded:: 0.19.2 + by : column name Group both the left and right DataFrames by the group column; perform the merge operation on these pieces and recombine. + left_by : column name + Field name to group by in the left DataFrame. + + .. versionadded:: 0.19.2 + + right_by : column name + Field name to group by in the right DataFrame. + + .. versionadded:: 0.19.2 + suffixes : 2-length sequence (tuple, list, ...) Suffix to apply to overlapping column names in the left and right side, respectively @@ -348,6 +369,28 @@ def merge_asof(left, right, on=None, 3 5 b 3.0 6 10 c 7.0 + We can use indexed DataFrames as well. 
+ + >>> left + left_val + 1 a + 5 b + 10 c + + >>> right + right_val + 1 1 + 2 2 + 3 3 + 6 6 + 7 7 + + >>> pd.merge_asof(left, right, left_index=True, right_index=True) + left_val right_val + 1 a 1 + 5 b 3 + 10 c 7 + Here is a real-world time-series example >>> quotes @@ -418,7 +461,9 @@ def merge_asof(left, right, on=None, """ op = _AsOfMerge(left, right, on=on, left_on=left_on, right_on=right_on, - by=by, suffixes=suffixes, + left_index=left_index, right_index=right_index, + by=by, left_by=left_by, right_by=right_by, + suffixes=suffixes, how='asof', tolerance=tolerance, allow_exact_matches=allow_exact_matches) return op.get_result() @@ -650,7 +695,7 @@ def _get_join_info(self): left_ax = self.left._data.axes[self.axis] right_ax = self.right._data.axes[self.axis] - if self.left_index and self.right_index: + if self.left_index and self.right_index and self.how != 'asof': join_index, left_indexer, right_indexer = \ left_ax.join(right_ax, how=self.how, return_indexers=True) elif self.right_index and self.how == 'left': @@ -731,6 +776,16 @@ def _get_merge_keys(self): is_rkey = lambda x: isinstance( x, (np.ndarray, ABCSeries)) and len(x) == len(right) + # Note that pd.merge_asof() has separate 'on' and 'by' parameters. A + # user could, for example, request 'left_index' and 'left_by'. In a + # regular pd.merge(), users cannot specify both 'left_index' and + # 'left_on'. (Instead, users have a MultiIndex). That means the + # self.left_on in this function is always empty in a pd.merge(), but + # a pd.merge_asof(left_index=True, left_by=...) will result in a + # self.left_on array with a None in the middle of it. This requires + # a work-around as designated in the code below. + # See _validate_specification() for where this happens. + # ugh, spaghetti re #733 if _any(self.left_on) and _any(self.right_on): for lk, rk in zip(self.left_on, self.right_on): @@ -740,12 +795,21 @@ def _get_merge_keys(self): right_keys.append(rk) join_names.append(None) # what to do? 
else: - right_keys.append(right[rk]._values) - join_names.append(rk) + if rk is not None: + right_keys.append(right[rk]._values) + join_names.append(rk) + else: + # work-around for merge_asof(right_index=True) + right_keys.append(right.index) + join_names.append(right.index.name) else: if not is_rkey(rk): - right_keys.append(right[rk]._values) - if lk == rk: + if rk is not None: + right_keys.append(right[rk]._values) + else: + # work-around for merge_asof(right_index=True) + right_keys.append(right.index) + if lk is not None and lk == rk: # avoid key upcast in corner case (length-0) if len(left) > 0: right_drop.append(rk) @@ -753,8 +817,13 @@ def _get_merge_keys(self): left_drop.append(lk) else: right_keys.append(rk) - left_keys.append(left[lk]._values) - join_names.append(lk) + if lk is not None: + left_keys.append(left[lk]._values) + join_names.append(lk) + else: + # work-around for merge_asof(left_index=True) + left_keys.append(left.index) + join_names.append(left.index.name) elif _any(self.left_on): for k in self.left_on: if is_lkey(k): @@ -879,13 +948,15 @@ def _get_join_indexers(left_keys, right_keys, sort=False, how='inner', class _OrderedMerge(_MergeOperation): _merge_type = 'ordered_merge' - def __init__(self, left, right, on=None, left_on=None, - right_on=None, axis=1, + def __init__(self, left, right, on=None, left_on=None, right_on=None, + left_index=False, right_index=False, axis=1, suffixes=('_x', '_y'), copy=True, fill_method=None, how='outer'): self.fill_method = fill_method _MergeOperation.__init__(self, left, right, on=on, left_on=left_on, + left_index=left_index, + right_index=right_index, right_on=right_on, axis=axis, how=how, suffixes=suffixes, sort=True # factorize sorts @@ -958,19 +1029,23 @@ def _get_cython_type(dtype): class _AsOfMerge(_OrderedMerge): _merge_type = 'asof_merge' - def __init__(self, left, right, on=None, by=None, left_on=None, - right_on=None, axis=1, - suffixes=('_x', '_y'), copy=True, + def __init__(self, left, right, 
on=None, left_on=None, right_on=None, + left_index=False, right_index=False, + by=None, left_by=None, right_by=None, + axis=1, suffixes=('_x', '_y'), copy=True, fill_method=None, how='asof', tolerance=None, allow_exact_matches=True): self.by = by + self.left_by = left_by + self.right_by = right_by self.tolerance = tolerance self.allow_exact_matches = allow_exact_matches _OrderedMerge.__init__(self, left, right, on=on, left_on=left_on, - right_on=right_on, axis=axis, + right_on=right_on, left_index=left_index, + right_index=right_index, axis=axis, how=how, suffixes=suffixes, fill_method=fill_method) @@ -978,23 +1053,44 @@ def _validate_specification(self): super(_AsOfMerge, self)._validate_specification() # we only allow on to be a single item for on - if len(self.left_on) != 1: + if len(self.left_on) != 1 and not self.left_index: raise MergeError("can only asof on a key for left") - if len(self.right_on) != 1: + if len(self.right_on) != 1 and not self.right_index: raise MergeError("can only asof on a key for right") + if self.left_index and isinstance(self.left.index, MultiIndex): + raise MergeError("left can only have one index") + + if self.right_index and isinstance(self.right.index, MultiIndex): + raise MergeError("right can only have one index") + + # set 'by' columns + if self.by is not None: + if self.left_by is not None or self.right_by is not None: + raise MergeError('Can only pass by OR left_by ' + 'and right_by') + self.left_by = self.right_by = self.by + if self.left_by is None and self.right_by is not None: + raise MergeError('missing left_by') + if self.left_by is not None and self.right_by is None: + raise MergeError('missing right_by') + # add by to our key-list so we can have it in the # output as a key - if self.by is not None: - if not is_list_like(self.by): - self.by = [self.by] + if self.left_by is not None: + if not is_list_like(self.left_by): + self.left_by = [self.left_by] + if not is_list_like(self.right_by): + self.right_by = 
[self.right_by] - if len(self.by) != 1: + if len(self.left_by) != 1: + raise MergeError("can only asof by a single key") + if len(self.right_by) != 1: raise MergeError("can only asof by a single key") - self.left_on = self.by + list(self.left_on) - self.right_on = self.by + list(self.right_on) + self.left_on = self.left_by + list(self.left_on) + self.right_on = self.right_by + list(self.right_on) @property def _asof_key(self): @@ -1017,7 +1113,7 @@ def _get_merge_keys(self): # validate tolerance; must be a Timedelta if we have a DTI if self.tolerance is not None: - lt = left_join_keys[self.left_on.index(self._asof_key)] + lt = left_join_keys[-1] msg = "incompatible tolerance, must be compat " \ "with type {0}".format(type(lt)) @@ -1047,8 +1143,10 @@ def _get_join_indexers(self): """ return the join indexers """ # values to compare - left_values = self.left_join_keys[-1] - right_values = self.right_join_keys[-1] + left_values = (self.left.index.values if self.left_index else + self.left_join_keys[-1]) + right_values = (self.right.index.values if self.right_index else + self.right_join_keys[-1]) tolerance = self.tolerance # we required sortedness in the join keys @@ -1066,7 +1164,7 @@ def _get_join_indexers(self): tolerance = tolerance.value # a "by" parameter requires special handling - if self.by is not None: + if self.left_by is not None: left_by_values = self.left_join_keys[0] right_by_values = self.right_join_keys[0] diff --git a/pandas/tools/tests/test_merge_asof.py b/pandas/tools/tests/test_merge_asof.py index f413618624592..c0993917ea395 100644 --- a/pandas/tools/tests/test_merge_asof.py +++ b/pandas/tools/tests/test_merge_asof.py @@ -117,6 +117,96 @@ def test_basic_categorical(self): by='ticker') assert_frame_equal(result, expected) + def test_basic_left_index(self): + + # GH14253 + expected = self.asof + trades = self.trades.set_index('time') + quotes = self.quotes + + result = merge_asof(trades, quotes, + left_index=True, + right_on='time', + by='ticker') 
+ # left-only index uses right's index, oddly + expected.index = result.index + # time column appears after left's columns + expected = expected[result.columns] + assert_frame_equal(result, expected) + + def test_basic_right_index(self): + + expected = self.asof + trades = self.trades + quotes = self.quotes.set_index('time') + + result = merge_asof(trades, quotes, + left_on='time', + right_index=True, + by='ticker') + assert_frame_equal(result, expected) + + def test_basic_left_index_right_index(self): + + expected = self.asof.set_index('time') + trades = self.trades.set_index('time') + quotes = self.quotes.set_index('time') + + result = merge_asof(trades, quotes, + left_index=True, + right_index=True, + by='ticker') + assert_frame_equal(result, expected) + + def test_multi_index(self): + + # MultiIndex is prohibited + trades = self.trades.set_index(['time', 'price']) + quotes = self.quotes.set_index('time') + with self.assertRaises(MergeError): + merge_asof(trades, quotes, + left_index=True, + right_index=True) + + trades = self.trades.set_index('time') + quotes = self.quotes.set_index(['time', 'bid']) + with self.assertRaises(MergeError): + merge_asof(trades, quotes, + left_index=True, + right_index=True) + + def test_on_and_index(self): + + # 'on' parameter and index together is prohibited + trades = self.trades.set_index('time') + quotes = self.quotes.set_index('time') + with self.assertRaises(MergeError): + merge_asof(trades, quotes, + left_on='price', + left_index=True, + right_index=True) + + trades = self.trades.set_index('time') + quotes = self.quotes.set_index('time') + with self.assertRaises(MergeError): + merge_asof(trades, quotes, + right_on='bid', + left_index=True, + right_index=True) + + def test_basic_left_by_right_by(self): + + # GH14253 + expected = self.asof + trades = self.trades + quotes = self.quotes + + result = merge_asof(trades, quotes, + on='time', + left_by='ticker', + right_by='ticker') + assert_frame_equal(result, expected) + def 
test_missing_right_by(self): expected = self.asof