Skip to content

ENH: Added 'direction' parameter to merge_asof() (#14887) #15129

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 4 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.20.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,7 @@ Other enhancements
- ``pandas.io.json.json_normalize()`` gained the option ``errors='ignore'|'raise'``; the default is ``errors='raise'`` which is backward compatible. (:issue:`14583`)

- ``.select_dtypes()`` now allows the string 'datetimetz' to generically select datetimes with tz (:issue:`14910`)
- ``pd.merge_asof()`` gained the option ``direction='backward'|'forward'|'nearest'`` (:issue:`14887`)


.. _whatsnew_0200.api_breaking:
Expand Down
216 changes: 212 additions & 4 deletions pandas/src/joins_func_helper.pxi.in
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,8 @@ from hashtable cimport *
{{for on_dtype in on_dtypes}}


def asof_join_{{on_dtype}}_by_{{by_dtype}}(ndarray[{{on_dtype}}] left_values,
def asof_join_backward_{{on_dtype}}_by_{{by_dtype}}(
ndarray[{{on_dtype}}] left_values,
ndarray[{{on_dtype}}] right_values,
ndarray[{{by_dtype}}] left_by_values,
ndarray[{{by_dtype}}] right_by_values,
Expand All @@ -40,6 +41,7 @@ def asof_join_{{on_dtype}}_by_{{by_dtype}}(ndarray[{{on_dtype}}] left_values,
ndarray[int64_t] left_indexer, right_indexer
bint has_tolerance = 0
{{on_dtype}} tolerance_
{{on_dtype}} diff
{{table_type}} hash_table
{{by_dtype}} by_value

Expand All @@ -62,7 +64,7 @@ def asof_join_{{on_dtype}}_by_{{by_dtype}}(ndarray[{{on_dtype}}] left_values,
if right_pos < 0:
right_pos = 0

# find last position in right whose value is less than left's value
# find last position in right whose value is less than left's
if allow_exact_matches:
while right_pos < right_size and\
right_values[right_pos] <= left_values[left_pos]:
Expand Down Expand Up @@ -90,6 +92,119 @@ def asof_join_{{on_dtype}}_by_{{by_dtype}}(ndarray[{{on_dtype}}] left_values,

return left_indexer, right_indexer


def asof_join_forward_{{on_dtype}}_by_{{by_dtype}}(
        ndarray[{{on_dtype}}] left_values,
        ndarray[{{on_dtype}}] right_values,
        ndarray[{{by_dtype}}] left_by_values,
        ndarray[{{by_dtype}}] right_by_values,
        bint allow_exact_matches=1,
        tolerance=None):
    """
    Forward as-of join on one key, restricted to rows sharing a 'by' value.

    The left array is walked from its last element to its first while a
    cursor moves leftwards through the right array. Every right row the
    cursor passes is recorded in a hash table under its 'by' value, and the
    left row is then matched to the position stored for its own 'by' value.

    NOTE(review): the scan presumably relies on both value arrays being
    sorted ascending; nothing in this function checks that.

    Parameters
    ----------
    left_values, right_values : ndarray
        Join-key values of the left and right sides.
    left_by_values, right_by_values : ndarray
        Grouping values aligned with ``left_values`` / ``right_values``.
    allow_exact_matches : bint, default 1
        If true a right value equal to the left value is accepted
        (comparison ``>=``), otherwise it must be strictly greater (``>``).
    tolerance : optional
        When not None, a match is dropped if the right value minus the
        left value is greater than the tolerance.

    Returns
    -------
    (left_indexer, right_indexer) : tuple of int64 ndarrays
        Both have ``len(left_values)`` entries. ``left_indexer[i]`` is ``i``;
        ``right_indexer[i]`` is the matched position in the right arrays,
        or -1 when there is no match.
    """
    cdef:
        Py_ssize_t lpos, rpos, n_left, n_right, match_pos
        ndarray[int64_t] left_indexer, right_indexer
        bint has_tolerance = 0
        {{on_dtype}} tolerance_
        {{on_dtype}} diff
        {{table_type}} hash_table
        {{by_dtype}} by_value

    # copy the optional tolerance into a typed local
    if tolerance is not None:
        has_tolerance = 1
        tolerance_ = tolerance

    n_left = len(left_values)
    n_right = len(right_values)

    left_indexer = np.empty(n_left, dtype=np.int64)
    right_indexer = np.empty(n_left, dtype=np.int64)

    hash_table = {{table_type}}(n_right)

    rpos = n_right - 1
    for lpos in range(n_left - 1, -1, -1):
        # the previous iteration may have left the cursor one past the end
        if rpos == n_right:
            rpos = n_right - 1

        # move the cursor left past every right value that qualifies for
        # this left value, recording each one under its 'by' value
        if allow_exact_matches:
            while (rpos >= 0 and
                   right_values[rpos] >= left_values[lpos]):
                hash_table.set_item(right_by_values[rpos], rpos)
                rpos -= 1
        else:
            while (rpos >= 0 and
                   right_values[rpos] > left_values[lpos]):
                hash_table.set_item(right_by_values[rpos], rpos)
                rpos -= 1
        # step back onto the last position the scan accepted
        rpos += 1

        # look up the position recorded for this row's 'by' value
        by_value = left_by_values[lpos]
        if by_value in hash_table:
            match_pos = hash_table.get_item(by_value)
        else:
            match_pos = -1
        left_indexer[lpos] = lpos
        right_indexer[lpos] = match_pos

        # discard the match when it lies further away than the tolerance
        if has_tolerance and match_pos != -1:
            diff = right_values[match_pos] - left_values[lpos]
            if diff > tolerance_:
                right_indexer[lpos] = -1

    return left_indexer, right_indexer


def asof_join_nearest_{{on_dtype}}_by_{{by_dtype}}(
        ndarray[{{on_dtype}}] left_values,
        ndarray[{{on_dtype}}] right_values,
        ndarray[{{by_dtype}}] left_by_values,
        ndarray[{{by_dtype}}] right_by_values,
        bint allow_exact_matches=1,
        tolerance=None):
    """
    Nearest as-of join on one key, restricted to rows sharing a 'by' value.

    Runs the backward and the forward search with the same arguments and,
    for every left row, keeps whichever of the two matches is closer to the
    left value. When both distances are equal the backward match is kept.
    When only one direction produced a match that one is used; when neither
    did, the result is -1.

    Parameters
    ----------
    left_values, right_values : ndarray
        Join-key values of the left and right sides.
    left_by_values, right_by_values : ndarray
        Grouping values aligned with ``left_values`` / ``right_values``.
    allow_exact_matches : bint, default 1
        Passed through unchanged to both directional searches.
    tolerance : optional
        Passed through unchanged to both directional searches.

    Returns
    -------
    (left_indexer, right_indexer) : tuple of int64 ndarrays
        ``left_indexer`` is copied from the backward search's left indexer;
        ``right_indexer[i]`` is the chosen position in the right arrays,
        or -1 when neither search found a match.
    """
    cdef:
        Py_ssize_t left_size, i
        ndarray[int64_t] left_indexer, right_indexer, bli, bri, fli, fri
        {{on_dtype}} bdiff, fdiff

    left_size = len(left_values)

    left_indexer = np.empty(left_size, dtype=np.int64)
    right_indexer = np.empty(left_size, dtype=np.int64)

    # search both forward and backward
    bli, bri =\
        asof_join_backward_{{on_dtype}}_by_{{by_dtype}}(left_values,
                                                        right_values,
                                                        left_by_values,
                                                        right_by_values,
                                                        allow_exact_matches,
                                                        tolerance)
    fli, fri =\
        asof_join_forward_{{on_dtype}}_by_{{by_dtype}}(left_values,
                                                       right_values,
                                                       left_by_values,
                                                       right_by_values,
                                                       allow_exact_matches,
                                                       tolerance)

    for i in range(len(bri)):
        if bri[i] != -1 and fri[i] != -1:
            # both directions matched: keep the closer one, ties go backward
            bdiff = left_values[bli[i]] - right_values[bri[i]]
            fdiff = right_values[fri[i]] - left_values[fli[i]]
            right_indexer[i] = bri[i] if bdiff <= fdiff else fri[i]
        else:
            # at most one direction matched; fri[i] is -1 if neither did
            right_indexer[i] = bri[i] if bri[i] != -1 else fri[i]
        left_indexer[i] = bli[i]

    return left_indexer, right_indexer

{{endfor}}
{{endfor}}

Expand All @@ -110,7 +225,8 @@ dtypes = ['uint8_t', 'uint16_t', 'uint32_t', 'uint64_t',
{{for on_dtype in dtypes}}


def asof_join_{{on_dtype}}(ndarray[{{on_dtype}}] left_values,
def asof_join_backward_{{on_dtype}}(
ndarray[{{on_dtype}}] left_values,
ndarray[{{on_dtype}}] right_values,
bint allow_exact_matches=1,
tolerance=None):
Expand All @@ -120,6 +236,7 @@ def asof_join_{{on_dtype}}(ndarray[{{on_dtype}}] left_values,
ndarray[int64_t] left_indexer, right_indexer
bint has_tolerance = 0
{{on_dtype}} tolerance_
{{on_dtype}} diff

# if we are using tolerance, set our objects
if tolerance is not None:
Expand All @@ -138,7 +255,7 @@ def asof_join_{{on_dtype}}(ndarray[{{on_dtype}}] left_values,
if right_pos < 0:
right_pos = 0

# find last position in right whose value is less than left's value
# find last position in right whose value is less than left's
if allow_exact_matches:
while right_pos < right_size and\
right_values[right_pos] <= left_values[left_pos]:
Expand All @@ -161,5 +278,96 @@ def asof_join_{{on_dtype}}(ndarray[{{on_dtype}}] left_values,

return left_indexer, right_indexer


def asof_join_forward_{{on_dtype}}(
        ndarray[{{on_dtype}}] left_values,
        ndarray[{{on_dtype}}] right_values,
        bint allow_exact_matches=1,
        tolerance=None):
    """
    Forward as-of join on a single key.

    The left array is walked from its last element to its first while a
    cursor moves leftwards through the right array, so that after each step
    the cursor rests on the lowest right position whose value is at or
    after (or strictly after) the current left value.

    NOTE(review): the scan presumably relies on both arrays being sorted
    ascending; nothing in this function checks that.

    Parameters
    ----------
    left_values, right_values : ndarray
        Join-key values of the left and right sides.
    allow_exact_matches : bint, default 1
        If true a right value equal to the left value is accepted
        (comparison ``>=``), otherwise it must be strictly greater (``>``).
    tolerance : optional
        When not None, a match is dropped if the right value minus the
        left value is greater than the tolerance.

    Returns
    -------
    (left_indexer, right_indexer) : tuple of int64 ndarrays
        Both have ``len(left_values)`` entries. ``left_indexer[i]`` is ``i``;
        ``right_indexer[i]`` is the matched position in ``right_values``,
        or -1 when there is no match.
    """
    cdef:
        Py_ssize_t lpos, rpos, n_left, n_right
        ndarray[int64_t] left_indexer, right_indexer
        bint has_tolerance = 0
        {{on_dtype}} tolerance_
        {{on_dtype}} diff

    # copy the optional tolerance into a typed local
    if tolerance is not None:
        has_tolerance = 1
        tolerance_ = tolerance

    n_left = len(left_values)
    n_right = len(right_values)

    left_indexer = np.empty(n_left, dtype=np.int64)
    right_indexer = np.empty(n_left, dtype=np.int64)

    rpos = n_right - 1
    for lpos in range(n_left - 1, -1, -1):
        # the previous iteration may have left the cursor one past the end
        if rpos == n_right:
            rpos = n_right - 1

        # move the cursor left past every right value that qualifies
        if allow_exact_matches:
            while (rpos >= 0 and
                   right_values[rpos] >= left_values[lpos]):
                rpos -= 1
        else:
            while (rpos >= 0 and
                   right_values[rpos] > left_values[lpos]):
                rpos -= 1
        # step back onto the last position the scan accepted
        rpos += 1

        left_indexer[lpos] = lpos

        # cursor one past the end: no right value qualified for this row
        if rpos == n_right:
            right_indexer[lpos] = -1
            continue

        right_indexer[lpos] = rpos

        # discard the match when it lies further away than the tolerance
        if has_tolerance:
            diff = right_values[rpos] - left_values[lpos]
            if diff > tolerance_:
                right_indexer[lpos] = -1

    return left_indexer, right_indexer


def asof_join_nearest_{{on_dtype}}(
        ndarray[{{on_dtype}}] left_values,
        ndarray[{{on_dtype}}] right_values,
        bint allow_exact_matches=1,
        tolerance=None):
    """
    Nearest as-of join on a single key.

    Runs the backward and the forward search with the same arguments and,
    for every left row, keeps whichever of the two matches is closer to the
    left value. When both distances are equal the backward match is kept.
    When only one direction produced a match that one is used; when neither
    did, the result is -1.

    Parameters
    ----------
    left_values, right_values : ndarray
        Join-key values of the left and right sides.
    allow_exact_matches : bint, default 1
        Passed through unchanged to both directional searches.
    tolerance : optional
        Passed through unchanged to both directional searches.

    Returns
    -------
    (left_indexer, right_indexer) : tuple of int64 ndarrays
        ``left_indexer`` is copied from the backward search's left indexer;
        ``right_indexer[i]`` is the chosen position in ``right_values``,
        or -1 when neither search found a match.
    """
    cdef:
        Py_ssize_t left_size, i
        ndarray[int64_t] left_indexer, right_indexer, bli, bri, fli, fri
        {{on_dtype}} bdiff, fdiff

    left_size = len(left_values)

    left_indexer = np.empty(left_size, dtype=np.int64)
    right_indexer = np.empty(left_size, dtype=np.int64)

    # search both forward and backward
    bli, bri = asof_join_backward_{{on_dtype}}(left_values, right_values,
                                               allow_exact_matches, tolerance)
    fli, fri = asof_join_forward_{{on_dtype}}(left_values, right_values,
                                              allow_exact_matches, tolerance)

    for i in range(len(bri)):
        if bri[i] != -1 and fri[i] != -1:
            # both directions matched: keep the closer one, ties go backward
            bdiff = left_values[bli[i]] - right_values[bri[i]]
            fdiff = right_values[fri[i]] - left_values[fli[i]]
            right_indexer[i] = bri[i] if bdiff <= fdiff else fri[i]
        else:
            # at most one direction matched; fri[i] is -1 if neither did
            right_indexer[i] = bri[i] if bri[i] != -1 else fri[i]
        left_indexer[i] = bli[i]

    return left_indexer, right_indexer

{{endfor}}

Loading