Skip to content

Commit c520b25

Browse files
Christopher C. Aycock, jorisvandenbossche
Christopher C. Aycock
authored and jorisvandenbossche committed
ENH: merge_asof() has type specializations and can take multiple 'by' parameters (pandas-dev#13936)
closes pandas-dev#13936 Author: Christopher C. Aycock <[email protected]> Closes pandas-dev#14783 from chrisaycock/GH13936 and squashes the following commits: ffcf0c2 [Christopher C. Aycock] Added test to reject float16; fixed typos 1f208a8 [Christopher C. Aycock] Use tuple representation instead of strings 77eb47b [Christopher C. Aycock] Merge master branch into GH13936 89256f0 [Christopher C. Aycock] Test 8-bit integers and raise error on 16-bit floats; add comments 0ad1687 [Christopher C. Aycock] Fixed whatsnew 2bce3cc [Christopher C. Aycock] Revert dict back to PyObjectHashTable in response to code review fafbb02 [Christopher C. Aycock] Updated benchmarks to reflect new ASV setup 5eeb7d9 [Christopher C. Aycock] Merge master into GH13936 c33c4cb [Christopher C. Aycock] Merge branch 'master' into GH13936 46cc309 [Christopher C. Aycock] Update documentation f01142c [Christopher C. Aycock] Merge master branch 75157fc [Christopher C. Aycock] merge_asof() has type specializations and can take multiple 'by' parameters (pandas-dev#13936) (cherry picked from commit e7df751)
1 parent a509172 commit c520b25

File tree

4 files changed

+251
-43
lines changed

4 files changed

+251
-43
lines changed

doc/source/whatsnew/v0.19.2.txt

+1
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ Other Enhancements
3030
~~~~~~~~~~~~~~~~~~
3131

3232
- ``pd.merge_asof()`` gained ``left_index``/``right_index`` and ``left_by``/``right_by`` arguments (:issue:`14253`)
33+
- ``pd.merge_asof()`` can take multiple columns in ``by`` parameter and has specialized dtypes for better performance (:issue:`13936`)
3334

3435

3536
.. _whatsnew_0192.bug_fixes:

pandas/src/joins_func_helper.pxi.in

+7-2
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
# cython: boundscheck=False, wraparound=False
12
"""
23
Template for each `dtype` helper function for hashtable
34

@@ -14,7 +15,9 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
1415
by_dtypes = [('PyObjectHashTable', 'object'), ('Int64HashTable', 'int64_t')]
1516

1617
# on_dtype
17-
on_dtypes = ['int64_t', 'double']
18+
on_dtypes = ['uint8_t', 'uint16_t', 'uint32_t', 'uint64_t',
19+
'int8_t', 'int16_t', 'int32_t', 'int64_t',
20+
'float', 'double']
1821

1922
}}
2023

@@ -98,7 +101,9 @@ def asof_join_{{on_dtype}}_by_{{by_dtype}}(ndarray[{{on_dtype}}] left_values,
98101
{{py:
99102

100103
# on_dtype
101-
dtypes = ['int64_t', 'double']
104+
dtypes = ['uint8_t', 'uint16_t', 'uint32_t', 'uint64_t',
105+
'int8_t', 'int16_t', 'int32_t', 'int64_t',
106+
'float', 'double']
102107

103108
}}
104109

pandas/tools/merge.py

+60-41
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@
55
import copy
66
import warnings
77

8+
import string
9+
810
import numpy as np
911
from pandas.compat import range, lrange, lzip, zip, map, filter
1012
import pandas.compat as compat
@@ -28,7 +30,8 @@
2830
is_list_like,
2931
_ensure_int64,
3032
_ensure_float64,
31-
_ensure_object)
33+
_ensure_object,
34+
_get_dtype)
3235
from pandas.types.missing import na_value_for_dtype
3336

3437
from pandas.core.generic import NDFrame
@@ -271,8 +274,8 @@ def merge_asof(left, right, on=None,
271274
DataFrame whose 'on' key is less than or equal to the left's key. Both
272275
DataFrames must be sorted by the key.
273276
274-
Optionally perform group-wise merge. This searches for the nearest match
275-
on the 'on' key within the same group according to 'by'.
277+
Optionally match on equivalent keys with 'by' before searching for nearest
278+
match with 'on'.
276279
277280
.. versionadded:: 0.19.0
278281
@@ -299,16 +302,15 @@ def merge_asof(left, right, on=None,
299302
300303
.. versionadded:: 0.19.2
301304
302-
by : column name
303-
Group both the left and right DataFrames by the group column; perform
304-
the merge operation on these pieces and recombine.
305+
by : column name or list of column names
306+
Match on these columns before performing merge operation.
305307
left_by : column name
306-
Field name to group by in the left DataFrame.
308+
Field names to match on in the left DataFrame.
307309
308310
.. versionadded:: 0.19.2
309311
310312
right_by : column name
311-
Field name to group by in the right DataFrame.
313+
Field names to match on in the right DataFrame.
312314
313315
.. versionadded:: 0.19.2
314316
@@ -997,27 +999,46 @@ def get_result(self):
997999
return result
9981000

9991001

1000-
_asof_functions = {
1001-
'int64_t': _join.asof_join_int64_t,
1002-
'double': _join.asof_join_double,
1003-
}
1002+
def _asof_function(on_type):
1003+
return getattr(_join, 'asof_join_%s' % on_type, None)
1004+
1005+
1006+
def _asof_by_function(on_type, by_type):
1007+
return getattr(_join, 'asof_join_%s_by_%s' % (on_type, by_type), None)
10041008

1005-
_asof_by_functions = {
1006-
('int64_t', 'int64_t'): _join.asof_join_int64_t_by_int64_t,
1007-
('double', 'int64_t'): _join.asof_join_double_by_int64_t,
1008-
('int64_t', 'object'): _join.asof_join_int64_t_by_object,
1009-
('double', 'object'): _join.asof_join_double_by_object,
1010-
}
10111009

10121010
_type_casters = {
10131011
'int64_t': _ensure_int64,
10141012
'double': _ensure_float64,
10151013
'object': _ensure_object,
10161014
}
10171015

1016+
_cython_types = {
1017+
'uint8': 'uint8_t',
1018+
'uint32': 'uint32_t',
1019+
'uint16': 'uint16_t',
1020+
'uint64': 'uint64_t',
1021+
'int8': 'int8_t',
1022+
'int32': 'int32_t',
1023+
'int16': 'int16_t',
1024+
'int64': 'int64_t',
1025+
'float16': 'error',
1026+
'float32': 'float',
1027+
'float64': 'double',
1028+
}
1029+
10181030

10191031
def _get_cython_type(dtype):
1020-
""" Given a dtype, return 'int64_t', 'double', or 'object' """
1032+
""" Given a dtype, return a C name like 'int64_t' or 'double' """
1033+
type_name = _get_dtype(dtype).name
1034+
ctype = _cython_types.get(type_name, 'object')
1035+
if ctype == 'error':
1036+
raise MergeError('unsupported type: ' + type_name)
1037+
return ctype
1038+
1039+
1040+
def _get_cython_type_upcast(dtype):
1041+
""" Upcast a dtype to 'int64_t', 'double', or 'object' """
10211042
if is_integer_dtype(dtype):
10221043
return 'int64_t'
10231044
elif is_float_dtype(dtype):
@@ -1084,11 +1105,6 @@ def _validate_specification(self):
10841105
if not is_list_like(self.right_by):
10851106
self.right_by = [self.right_by]
10861107

1087-
if len(self.left_by) != 1:
1088-
raise MergeError("can only asof by a single key")
1089-
if len(self.right_by) != 1:
1090-
raise MergeError("can only asof by a single key")
1091-
10921108
self.left_on = self.left_by + list(self.left_on)
10931109
self.right_on = self.right_by + list(self.right_on)
10941110

@@ -1142,6 +1158,13 @@ def _get_merge_keys(self):
11421158
def _get_join_indexers(self):
11431159
""" return the join indexers """
11441160

1161+
def flip(xs):
1162+
""" unlike np.transpose, this returns an array of tuples """
1163+
labels = list(string.ascii_lowercase[:len(xs)])
1164+
dtypes = [x.dtype for x in xs]
1165+
labeled_dtypes = list(zip(labels, dtypes))
1166+
return np.array(lzip(*xs), labeled_dtypes)
1167+
11451168
# values to compare
11461169
left_values = (self.left.index.values if self.left_index else
11471170
self.left_join_keys[-1])
@@ -1165,22 +1188,23 @@ def _get_join_indexers(self):
11651188

11661189
# a "by" parameter requires special handling
11671190
if self.left_by is not None:
1168-
left_by_values = self.left_join_keys[0]
1169-
right_by_values = self.right_join_keys[0]
1170-
1171-
# choose appropriate function by type
1172-
on_type = _get_cython_type(left_values.dtype)
1173-
by_type = _get_cython_type(left_by_values.dtype)
1191+
if len(self.left_join_keys) > 2:
1192+
# get tuple representation of values if more than one
1193+
left_by_values = flip(self.left_join_keys[0:-1])
1194+
right_by_values = flip(self.right_join_keys[0:-1])
1195+
else:
1196+
left_by_values = self.left_join_keys[0]
1197+
right_by_values = self.right_join_keys[0]
11741198

1175-
on_type_caster = _type_casters[on_type]
1199+
# upcast 'by' parameter because HashTable is limited
1200+
by_type = _get_cython_type_upcast(left_by_values.dtype)
11761201
by_type_caster = _type_casters[by_type]
1177-
func = _asof_by_functions[(on_type, by_type)]
1178-
1179-
left_values = on_type_caster(left_values)
1180-
right_values = on_type_caster(right_values)
11811202
left_by_values = by_type_caster(left_by_values)
11821203
right_by_values = by_type_caster(right_by_values)
11831204

1205+
# choose appropriate function by type
1206+
on_type = _get_cython_type(left_values.dtype)
1207+
func = _asof_by_function(on_type, by_type)
11841208
return func(left_values,
11851209
right_values,
11861210
left_by_values,
@@ -1190,12 +1214,7 @@ def _get_join_indexers(self):
11901214
else:
11911215
# choose appropriate function by type
11921216
on_type = _get_cython_type(left_values.dtype)
1193-
type_caster = _type_casters[on_type]
1194-
func = _asof_functions[on_type]
1195-
1196-
left_values = type_caster(left_values)
1197-
right_values = type_caster(right_values)
1198-
1217+
func = _asof_function(on_type)
11991218
return func(left_values,
12001219
right_values,
12011220
self.allow_exact_matches,

0 commit comments

Comments
 (0)