Skip to content

Commit 9a6a78f

Browse files
chrisaycock authored and jorisvandenbossche committed
ENH: merge_asof() has left_index/right_index and left_by/right_by (pandas-dev#14253) (pandas-dev#14531)
(cherry picked from commit 84cad61)
1 parent f1d43a4 commit 9a6a78f

File tree

3 files changed

+215
-29
lines changed

3 files changed

+215
-29
lines changed

doc/source/whatsnew/v0.19.2.txt

-2
Original file line numberDiff line numberDiff line change
@@ -32,8 +32,6 @@ Other Enhancements
3232
- ``pd.merge_asof()`` gained ``left_index``/``right_index`` and ``left_by``/``right_by`` arguments (:issue:`14253`)
3333

3434

35-
36-
>>>>>>> 49e3137... DOC: whatsnew 0.19.2
3735
.. _whatsnew_0192.bug_fixes:
3836

3937
Bug Fixes

pandas/tools/merge.py

+125-27
Original file line numberDiff line numberDiff line change
@@ -259,7 +259,8 @@ def _merger(x, y):
259259

260260
def merge_asof(left, right, on=None,
261261
left_on=None, right_on=None,
262-
by=None,
262+
left_index=False, right_index=False,
263+
by=None, left_by=None, right_by=None,
263264
suffixes=('_x', '_y'),
264265
tolerance=None,
265266
allow_exact_matches=True):
@@ -288,9 +289,29 @@ def merge_asof(left, right, on=None,
288289
Field name to join on in left DataFrame.
289290
right_on : label
290291
Field name to join on in right DataFrame.
292+
left_index : boolean
293+
Use the index of the left DataFrame as the join key.
294+
295+
.. versionadded:: 0.19.2
296+
297+
right_index : boolean
298+
Use the index of the right DataFrame as the join key.
299+
300+
.. versionadded:: 0.19.2
301+
291302
by : column name
292303
Group both the left and right DataFrames by the group column; perform
293304
the merge operation on these pieces and recombine.
305+
left_by : column name
306+
Field name to group by in the left DataFrame.
307+
308+
.. versionadded:: 0.19.2
309+
310+
right_by : column name
311+
Field name to group by in the right DataFrame.
312+
313+
.. versionadded:: 0.19.2
314+
294315
suffixes : 2-length sequence (tuple, list, ...)
295316
Suffix to apply to overlapping column names in the left and right
296317
side, respectively
@@ -348,6 +369,28 @@ def merge_asof(left, right, on=None,
348369
3 5 b 3.0
349370
6 10 c 7.0
350371
372+
We can use indexed DataFrames as well.
373+
374+
>>> left
375+
left_val
376+
1 a
377+
5 b
378+
10 c
379+
380+
>>> right
381+
right_val
382+
1 1
383+
2 2
384+
3 3
385+
6 6
386+
7 7
387+
388+
>>> pd.merge_asof(left, right, left_index=True, right_index=True)
389+
left_val right_val
390+
1 a 1
391+
5 b 3
392+
10 c 7
393+
351394
Here is a real-world time-series example
352395
353396
>>> quotes
@@ -418,7 +461,9 @@ def merge_asof(left, right, on=None,
418461
"""
419462
op = _AsOfMerge(left, right,
420463
on=on, left_on=left_on, right_on=right_on,
421-
by=by, suffixes=suffixes,
464+
left_index=left_index, right_index=right_index,
465+
by=by, left_by=left_by, right_by=right_by,
466+
suffixes=suffixes,
422467
how='asof', tolerance=tolerance,
423468
allow_exact_matches=allow_exact_matches)
424469
return op.get_result()
@@ -650,7 +695,7 @@ def _get_join_info(self):
650695
left_ax = self.left._data.axes[self.axis]
651696
right_ax = self.right._data.axes[self.axis]
652697

653-
if self.left_index and self.right_index:
698+
if self.left_index and self.right_index and self.how != 'asof':
654699
join_index, left_indexer, right_indexer = \
655700
left_ax.join(right_ax, how=self.how, return_indexers=True)
656701
elif self.right_index and self.how == 'left':
@@ -731,6 +776,16 @@ def _get_merge_keys(self):
731776
is_rkey = lambda x: isinstance(
732777
x, (np.ndarray, ABCSeries)) and len(x) == len(right)
733778

779+
# Note that pd.merge_asof() has separate 'on' and 'by' parameters. A
780+
# user could, for example, request 'left_index' and 'left_by'. In a
781+
# regular pd.merge(), users cannot specify both 'left_index' and
782+
# 'left_on'. (Instead, users have a MultiIndex). That means the
783+
# self.left_on in this function is always empty in a pd.merge(), but
784+
# a pd.merge_asof(left_index=True, left_by=...) will result in a
785+
# self.left_on array with a None in the middle of it. This requires
786+
# a work-around as designated in the code below.
787+
# See _validate_specification() for where this happens.
788+
734789
# ugh, spaghetti re #733
735790
if _any(self.left_on) and _any(self.right_on):
736791
for lk, rk in zip(self.left_on, self.right_on):
@@ -740,21 +795,35 @@ def _get_merge_keys(self):
740795
right_keys.append(rk)
741796
join_names.append(None) # what to do?
742797
else:
743-
right_keys.append(right[rk]._values)
744-
join_names.append(rk)
798+
if rk is not None:
799+
right_keys.append(right[rk]._values)
800+
join_names.append(rk)
801+
else:
802+
# work-around for merge_asof(right_index=True)
803+
right_keys.append(right.index)
804+
join_names.append(right.index.name)
745805
else:
746806
if not is_rkey(rk):
747-
right_keys.append(right[rk]._values)
748-
if lk == rk:
807+
if rk is not None:
808+
right_keys.append(right[rk]._values)
809+
else:
810+
# work-around for merge_asof(right_index=True)
811+
right_keys.append(right.index)
812+
if lk is not None and lk == rk:
749813
# avoid key upcast in corner case (length-0)
750814
if len(left) > 0:
751815
right_drop.append(rk)
752816
else:
753817
left_drop.append(lk)
754818
else:
755819
right_keys.append(rk)
756-
left_keys.append(left[lk]._values)
757-
join_names.append(lk)
820+
if lk is not None:
821+
left_keys.append(left[lk]._values)
822+
join_names.append(lk)
823+
else:
824+
# work-around for merge_asof(left_index=True)
825+
left_keys.append(left.index)
826+
join_names.append(left.index.name)
758827
elif _any(self.left_on):
759828
for k in self.left_on:
760829
if is_lkey(k):
@@ -879,13 +948,15 @@ def _get_join_indexers(left_keys, right_keys, sort=False, how='inner',
879948
class _OrderedMerge(_MergeOperation):
880949
_merge_type = 'ordered_merge'
881950

882-
def __init__(self, left, right, on=None, left_on=None,
883-
right_on=None, axis=1,
951+
def __init__(self, left, right, on=None, left_on=None, right_on=None,
952+
left_index=False, right_index=False, axis=1,
884953
suffixes=('_x', '_y'), copy=True,
885954
fill_method=None, how='outer'):
886955

887956
self.fill_method = fill_method
888957
_MergeOperation.__init__(self, left, right, on=on, left_on=left_on,
958+
left_index=left_index,
959+
right_index=right_index,
889960
right_on=right_on, axis=axis,
890961
how=how, suffixes=suffixes,
891962
sort=True # factorize sorts
@@ -958,43 +1029,68 @@ def _get_cython_type(dtype):
9581029
class _AsOfMerge(_OrderedMerge):
9591030
_merge_type = 'asof_merge'
9601031

961-
def __init__(self, left, right, on=None, by=None, left_on=None,
962-
right_on=None, axis=1,
963-
suffixes=('_x', '_y'), copy=True,
1032+
def __init__(self, left, right, on=None, left_on=None, right_on=None,
1033+
left_index=False, right_index=False,
1034+
by=None, left_by=None, right_by=None,
1035+
axis=1, suffixes=('_x', '_y'), copy=True,
9641036
fill_method=None,
9651037
how='asof', tolerance=None,
9661038
allow_exact_matches=True):
9671039

9681040
self.by = by
1041+
self.left_by = left_by
1042+
self.right_by = right_by
9691043
self.tolerance = tolerance
9701044
self.allow_exact_matches = allow_exact_matches
9711045

9721046
_OrderedMerge.__init__(self, left, right, on=on, left_on=left_on,
973-
right_on=right_on, axis=axis,
1047+
right_on=right_on, left_index=left_index,
1048+
right_index=right_index, axis=axis,
9741049
how=how, suffixes=suffixes,
9751050
fill_method=fill_method)
9761051

9771052
def _validate_specification(self):
9781053
super(_AsOfMerge, self)._validate_specification()
9791054

9801055
# we only allow a single key for on
981-
if len(self.left_on) != 1:
1056+
if len(self.left_on) != 1 and not self.left_index:
9821057
raise MergeError("can only asof on a key for left")
9831058

984-
if len(self.right_on) != 1:
1059+
if len(self.right_on) != 1 and not self.right_index:
9851060
raise MergeError("can only asof on a key for right")
9861061

1062+
if self.left_index and isinstance(self.left.index, MultiIndex):
1063+
raise MergeError("left can only have one index")
1064+
1065+
if self.right_index and isinstance(self.right.index, MultiIndex):
1066+
raise MergeError("right can only have one index")
1067+
1068+
# set 'by' columns
1069+
if self.by is not None:
1070+
if self.left_by is not None or self.right_by is not None:
1071+
raise MergeError('Can only pass by OR left_by '
1072+
'and right_by')
1073+
self.left_by = self.right_by = self.by
1074+
if self.left_by is None and self.right_by is not None:
1075+
raise MergeError('missing left_by')
1076+
if self.left_by is not None and self.right_by is None:
1077+
raise MergeError('missing right_by')
1078+
9871079
# add by to our key-list so we can have it in the
9881080
# output as a key
989-
if self.by is not None:
990-
if not is_list_like(self.by):
991-
self.by = [self.by]
1081+
if self.left_by is not None:
1082+
if not is_list_like(self.left_by):
1083+
self.left_by = [self.left_by]
1084+
if not is_list_like(self.right_by):
1085+
self.right_by = [self.right_by]
9921086

993-
if len(self.by) != 1:
1087+
if len(self.left_by) != 1:
1088+
raise MergeError("can only asof by a single key")
1089+
if len(self.right_by) != 1:
9941090
raise MergeError("can only asof by a single key")
9951091

996-
self.left_on = self.by + list(self.left_on)
997-
self.right_on = self.by + list(self.right_on)
1092+
self.left_on = self.left_by + list(self.left_on)
1093+
self.right_on = self.right_by + list(self.right_on)
9981094

9991095
@property
10001096
def _asof_key(self):
@@ -1017,7 +1113,7 @@ def _get_merge_keys(self):
10171113
# validate tolerance; must be a Timedelta if we have a DTI
10181114
if self.tolerance is not None:
10191115

1020-
lt = left_join_keys[self.left_on.index(self._asof_key)]
1116+
lt = left_join_keys[-1]
10211117
msg = "incompatible tolerance, must be compat " \
10221118
"with type {0}".format(type(lt))
10231119

@@ -1047,8 +1143,10 @@ def _get_join_indexers(self):
10471143
""" return the join indexers """
10481144

10491145
# values to compare
1050-
left_values = self.left_join_keys[-1]
1051-
right_values = self.right_join_keys[-1]
1146+
left_values = (self.left.index.values if self.left_index else
1147+
self.left_join_keys[-1])
1148+
right_values = (self.right.index.values if self.right_index else
1149+
self.right_join_keys[-1])
10521150
tolerance = self.tolerance
10531151

10541152
# we required sortedness in the join keys
@@ -1066,7 +1164,7 @@ def _get_join_indexers(self):
10661164
tolerance = tolerance.value
10671165

10681166
# a "by" parameter requires special handling
1069-
if self.by is not None:
1167+
if self.left_by is not None:
10701168
left_by_values = self.left_join_keys[0]
10711169
right_by_values = self.right_join_keys[0]
10721170

pandas/tools/tests/test_merge_asof.py

+90
Original file line numberDiff line numberDiff line change
@@ -118,6 +118,96 @@ def test_basic_categorical(self):
118118
by='ticker')
119119
assert_frame_equal(result, expected)
120120

121+
def test_basic_left_index(self):
122+
123+
# GH14253
124+
expected = self.asof
125+
trades = self.trades.set_index('time')
126+
quotes = self.quotes
127+
128+
result = merge_asof(trades, quotes,
129+
left_index=True,
130+
right_on='time',
131+
by='ticker')
132+
# left-only index uses right's index, oddly
133+
expected.index = result.index
134+
# time column appears after left's columns
135+
expected = expected[result.columns]
136+
assert_frame_equal(result, expected)
137+
138+
def test_basic_right_index(self):
139+
140+
expected = self.asof
141+
trades = self.trades
142+
quotes = self.quotes.set_index('time')
143+
144+
result = merge_asof(trades, quotes,
145+
left_on='time',
146+
right_index=True,
147+
by='ticker')
148+
assert_frame_equal(result, expected)
149+
150+
def test_basic_left_index_right_index(self):
151+
152+
expected = self.asof.set_index('time')
153+
trades = self.trades.set_index('time')
154+
quotes = self.quotes.set_index('time')
155+
156+
result = merge_asof(trades, quotes,
157+
left_index=True,
158+
right_index=True,
159+
by='ticker')
160+
assert_frame_equal(result, expected)
161+
162+
def test_multi_index(self):
163+
164+
# MultiIndex is prohibited
165+
trades = self.trades.set_index(['time', 'price'])
166+
quotes = self.quotes.set_index('time')
167+
with self.assertRaises(MergeError):
168+
merge_asof(trades, quotes,
169+
left_index=True,
170+
right_index=True)
171+
172+
trades = self.trades.set_index('time')
173+
quotes = self.quotes.set_index(['time', 'bid'])
174+
with self.assertRaises(MergeError):
175+
merge_asof(trades, quotes,
176+
left_index=True,
177+
right_index=True)
178+
179+
def test_on_and_index(self):
180+
181+
# 'on' parameter and index together is prohibited
182+
trades = self.trades.set_index('time')
183+
quotes = self.quotes.set_index('time')
184+
with self.assertRaises(MergeError):
185+
merge_asof(trades, quotes,
186+
left_on='price',
187+
left_index=True,
188+
right_index=True)
189+
190+
trades = self.trades.set_index('time')
191+
quotes = self.quotes.set_index('time')
192+
with self.assertRaises(MergeError):
193+
merge_asof(trades, quotes,
194+
right_on='bid',
195+
left_index=True,
196+
right_index=True)
197+
198+
def test_basic_left_by_right_by(self):
199+
200+
# GH14253
201+
expected = self.asof
202+
trades = self.trades
203+
quotes = self.quotes
204+
205+
result = merge_asof(trades, quotes,
206+
on='time',
207+
left_by='ticker',
208+
right_by='ticker')
209+
assert_frame_equal(result, expected)
210+
121211
def test_missing_right_by(self):
122212

123213
expected = self.asof

0 commit comments

Comments
 (0)