Skip to content

Commit 9b2797d

Browse files
chrisaycock authored and jreback committed
BUG: Fix edge cases in merge_asof() by comparing factorized keys (pandas-dev#13709) (pandas-dev#13836)
Also removes unnecessary check_duplicates. Added asv benchmarks for merge_asof()
1 parent 49243d6 commit 9b2797d

File tree

5 files changed

+100
-189
lines changed

5 files changed

+100
-189
lines changed

asv_bench/benchmarks/join_merge.py

+37
Original file line numberDiff line numberDiff line change
@@ -293,6 +293,43 @@ def time_join_dataframe_integer_key(self):
293293
merge(self.df, self.df2, on='key1')
294294

295295

296+
class merge_asof_noby(object):
297+
298+
def setup(self):
299+
np.random.seed(0)
300+
one_count = 200000
301+
two_count = 1000000
302+
self.df1 = pd.DataFrame({'time': np.random.randint(0, one_count/20, one_count),
303+
'value1': np.random.randn(one_count)})
304+
self.df2 = pd.DataFrame({'time': np.random.randint(0, two_count/20, two_count),
305+
'value2': np.random.randn(two_count)})
306+
self.df1 = self.df1.sort_values('time')
307+
self.df2 = self.df2.sort_values('time')
308+
309+
def time_merge_asof_noby(self):
310+
merge_asof(self.df1, self.df2, on='time')
311+
312+
313+
class merge_asof_by(object):
314+
315+
def setup(self):
316+
import string
317+
np.random.seed(0)
318+
one_count = 200000
319+
two_count = 1000000
320+
self.df1 = pd.DataFrame({'time': np.random.randint(0, one_count/20, one_count),
321+
'key': np.random.choice(list(string.uppercase), one_count),
322+
'value1': np.random.randn(one_count)})
323+
self.df2 = pd.DataFrame({'time': np.random.randint(0, two_count/20, two_count),
324+
'key': np.random.choice(list(string.uppercase), two_count),
325+
'value2': np.random.randn(two_count)})
326+
self.df1 = self.df1.sort_values('time')
327+
self.df2 = self.df2.sort_values('time')
328+
329+
def time_merge_asof_by(self):
330+
merge_asof(self.df1, self.df2, on='time', by='key')
331+
332+
296333
class join_non_unique_equal(object):
297334
goal_time = 0.2
298335

doc/source/whatsnew/v0.19.0.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@ The following are now part of this API:
4747
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
4848

4949
A long-time requested feature has been added through the :func:`merge_asof` function, to
50-
support asof style joining of time-series. (:issue:`1870`, :issue:`13695`). Full documentation is
50+
support asof style joining of time-series. (:issue:`1870`, :issue:`13695`, :issue:`13709`). Full documentation is
5151
:ref:`here <merging.merge_asof>`
5252

5353
The :func:`merge_asof` performs an asof merge, which is similar to a left-join

pandas/src/join.pyx

+38-132
Original file line numberDiff line numberDiff line change
@@ -126,150 +126,56 @@ def left_outer_join(ndarray[int64_t] left, ndarray[int64_t] right,
126126

127127

128128
def left_outer_asof_join(ndarray[int64_t] left, ndarray[int64_t] right,
129-
Py_ssize_t max_groups, sort=True,
129+
Py_ssize_t max_groups, # ignored
130130
bint allow_exact_matches=1,
131-
left_distance=None,
132-
right_distance=None,
131+
left_values=None,
132+
right_values=None,
133133
tolerance=None):
134134

135135
cdef:
136-
Py_ssize_t i, j, k, count = 0
137-
Py_ssize_t loc, left_pos, right_pos, position
138-
Py_ssize_t offset
139-
ndarray[int64_t] left_count, right_count
140-
ndarray left_sorter, right_sorter, rev
136+
Py_ssize_t left_pos, right_pos, left_size, right_size
141137
ndarray[int64_t] left_indexer, right_indexer
142-
int64_t lc, rc, tol, left_val, right_val, diff, indexer
143-
ndarray[int64_t] ld, rd
144-
bint has_tol = 0
138+
bint has_tolerance = 0
139+
ndarray[int64_t] left_values_, right_values_
140+
int64_t tolerance_
145141

146142
# if we are using tolerance, set our objects
147-
if left_distance is not None and right_distance is not None and tolerance is not None:
148-
has_tol = 1
149-
ld = left_distance
150-
rd = right_distance
151-
tol = tolerance
143+
if left_values is not None and right_values is not None and tolerance is not None:
144+
has_tolerance = 1
145+
left_values_ = left_values
146+
right_values_ = right_values
147+
tolerance_ = tolerance
152148

153-
# NA group in location 0
154-
left_sorter, left_count = groupsort_indexer(left, max_groups)
155-
right_sorter, right_count = groupsort_indexer(right, max_groups)
149+
left_size = len(left)
150+
right_size = len(right)
156151

157-
# First pass, determine size of result set, do not use the NA group
158-
for i in range(1, max_groups + 1):
159-
if right_count[i] > 0:
160-
count += left_count[i] * right_count[i]
161-
else:
162-
count += left_count[i]
152+
left_indexer = np.empty(left_size, dtype=np.int64)
153+
right_indexer = np.empty(left_size, dtype=np.int64)
163154

164-
# group 0 is the NA group
165-
left_pos = 0
166155
right_pos = 0
167-
position = 0
168-
169-
# exclude the NA group
170-
left_pos = left_count[0]
171-
right_pos = right_count[0]
172-
173-
left_indexer = np.empty(count, dtype=np.int64)
174-
right_indexer = np.empty(count, dtype=np.int64)
175-
176-
for i in range(1, max_groups + 1):
177-
lc = left_count[i]
178-
rc = right_count[i]
179-
180-
if rc == 0:
181-
for j in range(lc):
182-
indexer = position + j
183-
left_indexer[indexer] = left_pos + j
184-
185-
# take the most recent value
186-
# if we are not the first
187-
if right_pos:
188-
189-
if has_tol:
190-
191-
left_val = ld[left_pos + j]
192-
right_val = rd[right_pos - 1]
193-
diff = left_val - right_val
194-
195-
# do we allow exact matches
196-
if allow_exact_matches:
197-
if diff > tol:
198-
right_indexer[indexer] = -1
199-
continue
200-
elif not allow_exact_matches:
201-
if diff >= tol or lc == rc:
202-
right_indexer[indexer] = -1
203-
continue
204-
205-
right_indexer[indexer] = right_pos - 1
206-
else:
207-
right_indexer[indexer] = -1
208-
position += lc
156+
for left_pos in range(left_size):
157+
# restart right_pos if it went negative in a previous iteration
158+
if right_pos < 0:
159+
right_pos = 0
160+
161+
# find last position in right whose value is less than left's value
162+
if allow_exact_matches:
163+
while right_pos < right_size and right[right_pos] <= left[left_pos]:
164+
right_pos += 1
209165
else:
210-
for j in range(lc):
211-
offset = position + j * rc
212-
for k in range(rc):
213-
214-
indexer = offset + k
215-
left_indexer[indexer] = left_pos + j
216-
217-
if has_tol:
218-
219-
left_val = ld[left_pos + j]
220-
right_val = rd[right_pos + k]
221-
diff = left_val - right_val
222-
223-
# do we allow exact matches
224-
if allow_exact_matches:
225-
if diff > tol:
226-
right_indexer[indexer] = -1
227-
continue
228-
229-
# we don't allow exact matches
230-
elif not allow_exact_matches:
231-
if diff >= tol or lc == rc:
232-
right_indexer[indexer] = -1
233-
else:
234-
right_indexer[indexer] = right_pos - 1
235-
continue
236-
237-
else:
238-
239-
# do we allow exact matches
240-
if not allow_exact_matches:
241-
242-
if right_pos:
243-
right_indexer[indexer] = right_pos - 1
244-
else:
245-
right_indexer[indexer] = -1
246-
continue
247-
248-
right_indexer[indexer] = right_pos + k
249-
position += lc * rc
250-
left_pos += lc
251-
right_pos += rc
252-
253-
left_indexer = _get_result_indexer(left_sorter, left_indexer)
254-
right_indexer = _get_result_indexer(right_sorter, right_indexer)
255-
256-
if not sort: # if not asked to sort, revert to original order
257-
if len(left) == len(left_indexer):
258-
# no multiple matches for any row on the left
259-
# this is a short-cut to avoid groupsort_indexer
260-
# otherwise, the `else` path also works in this case
261-
if left_sorter.dtype != np.int_:
262-
left_sorter = left_sorter.astype(np.int_)
263-
264-
rev = np.empty(len(left), dtype=np.int_)
265-
rev.put(left_sorter, np.arange(len(left)))
266-
else:
267-
rev, _ = groupsort_indexer(left_indexer, len(left))
268-
269-
if rev.dtype != np.int_:
270-
rev = rev.astype(np.int_)
271-
right_indexer = right_indexer.take(rev)
272-
left_indexer = left_indexer.take(rev)
166+
while right_pos < right_size and right[right_pos] < left[left_pos]:
167+
right_pos += 1
168+
right_pos -= 1
169+
170+
# save positions as the desired index
171+
left_indexer[left_pos] = left_pos
172+
right_indexer[left_pos] = right_pos
173+
174+
# if needed, verify that tolerance is met
175+
if has_tolerance and right_pos != -1:
176+
diff = left_values[left_pos] - right_values[right_pos]
177+
if diff > tolerance_:
178+
right_indexer[left_pos] = -1
273179

274180
return left_indexer, right_indexer
275181

pandas/tools/merge.py

+4-21
Original file line numberDiff line numberDiff line change
@@ -258,8 +258,7 @@ def merge_asof(left, right, on=None,
258258
by=None,
259259
suffixes=('_x', '_y'),
260260
tolerance=None,
261-
allow_exact_matches=True,
262-
check_duplicates=True):
261+
allow_exact_matches=True):
263262
"""Perform an asof merge. This is similar to a left-join except that we
264263
match on nearest key rather than equal keys.
265264
@@ -304,14 +303,6 @@ def merge_asof(left, right, on=None,
304303
- If False, don't match the same 'on' value
305304
(i.e., stricly less-than)
306305
307-
check_duplicates : boolean, default True
308-
309-
- If True, check and remove duplicates for the right
310-
DataFrame, on the [by, on] combination, keeping the last value.
311-
- If False, no check for duplicates. If you *know* that
312-
you don't have duplicates, then turning off the check for duplicates
313-
can be more performant.
314-
315306
Returns
316307
-------
317308
merged : DataFrame
@@ -436,7 +427,7 @@ def _merger(x, y):
436427
if by is not None:
437428
result, groupby = _groupby_and_merge(by, on, left, right,
438429
lambda x, y: _merger(x, y),
439-
check_duplicates=check_duplicates)
430+
check_duplicates=False)
440431

441432
# we want to preserve the original order
442433
# we had grouped, so need to reverse this
@@ -446,20 +437,12 @@ def _merger(x, y):
446437
sorter = _ensure_platform_int(
447438
np.concatenate([groupby.indices[g] for g, _ in groupby]))
448439
if len(result) != len(sorter):
449-
if check_duplicates:
450-
raise AssertionError("invalid reverse grouping")
451440
return result
452441

453442
rev = np.empty(len(sorter), dtype=np.int_)
454443
rev.put(sorter, np.arange(len(sorter)))
455444
return result.take(rev).reset_index(drop=True)
456445

457-
if check_duplicates:
458-
if on is None:
459-
on = []
460-
elif not isinstance(on, (list, tuple)):
461-
on = [on]
462-
463446
if right.duplicated(on).any():
464447
right = right.drop_duplicates(on, keep='last')
465448

@@ -1067,8 +1050,8 @@ def _get_join_indexers(self):
10671050
lt = lt.view('i8')
10681051
t = t.value
10691052
rt = rt.view('i8')
1070-
kwargs['left_distance'] = lt
1071-
kwargs['right_distance'] = rt
1053+
kwargs['left_values'] = lt
1054+
kwargs['right_values'] = rt
10721055
kwargs['tolerance'] = t
10731056

10741057
return _get_join_indexers(self.left_join_keys,

pandas/tools/tests/test_merge_asof.py

+20-35
Original file line numberDiff line numberDiff line change
@@ -184,43 +184,8 @@ def test_with_duplicates(self):
184184
expected = self.read_data('asof.csv')
185185
assert_frame_equal(result, expected)
186186

187-
result = merge_asof(self.trades, q,
188-
on='time',
189-
by='ticker',
190-
check_duplicates=False)
191-
expected = self.read_data('asof.csv')
192-
expected = pd.concat([expected, expected]).sort_values(
193-
['time', 'ticker']).reset_index(drop=True)
194-
195-
# the results are not ordered in a meaningful way
196-
# nor are the exact matches duplicated, so comparisons
197-
# are pretty tricky here, however the uniques are the same
198-
199-
def aligner(x, ticker):
200-
return (x[x.ticker == ticker]
201-
.sort_values(['time', 'ticker', 'quantity', 'price',
202-
'marketCenter', 'bid', 'ask'])
203-
.drop_duplicates(keep='last')
204-
.reset_index(drop=True)
205-
)
206-
207-
for ticker in expected.ticker.unique():
208-
r = aligner(result, ticker)
209-
e = aligner(expected, ticker)
210-
assert_frame_equal(r, e)
211-
212187
def test_with_duplicates_no_on(self):
213188

214-
df1 = pd.DataFrame({'key': [1, 1, 3],
215-
'left_val': [1, 2, 3]})
216-
df2 = pd.DataFrame({'key': [1, 3, 3],
217-
'right_val': [1, 2, 3]})
218-
result = merge_asof(df1, df2, on='key', check_duplicates=False)
219-
expected = pd.DataFrame({'key': [1, 1, 3, 3],
220-
'left_val': [1, 2, 3, 3],
221-
'right_val': [1, 1, 2, 3]})
222-
assert_frame_equal(result, expected)
223-
224189
df1 = pd.DataFrame({'key': [1, 1, 3],
225190
'left_val': [1, 2, 3]})
226191
df2 = pd.DataFrame({'key': [1, 2, 2],
@@ -379,6 +344,26 @@ def test_allow_exact_matches_and_tolerance2(self):
379344
'version': [np.nan]})
380345
assert_frame_equal(result, expected)
381346

347+
def test_allow_exact_matches_and_tolerance3(self):
348+
# GH 13709
349+
df1 = pd.DataFrame({
350+
'time': pd.to_datetime(['2016-07-15 13:30:00.030',
351+
'2016-07-15 13:30:00.030']),
352+
'username': ['bob', 'charlie']})
353+
df2 = pd.DataFrame({
354+
'time': pd.to_datetime(['2016-07-15 13:30:00.000',
355+
'2016-07-15 13:30:00.030']),
356+
'version': [1, 2]})
357+
358+
result = pd.merge_asof(df1, df2, on='time', allow_exact_matches=False,
359+
tolerance=pd.Timedelta('10ms'))
360+
expected = pd.DataFrame({
361+
'time': pd.to_datetime(['2016-07-15 13:30:00.030',
362+
'2016-07-15 13:30:00.030']),
363+
'username': ['bob', 'charlie'],
364+
'version': [np.nan, np.nan]})
365+
assert_frame_equal(result, expected)
366+
382367

383368
if __name__ == '__main__':
384369
nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],

0 commit comments

Comments (0)