Skip to content

Commit ad3f3f7

Browse files
authored
BUG/PERF: merge_asof with multiple "by" keys (#55580)
* improve perf in merge_asof with multiple "by" keys
* whatsnew
* add test for EA dtypes
* fix test
* use how=left in factorize_keys
* add test
1 parent b3a4b97 commit ad3f3f7

File tree

3 files changed

+65
-37
lines changed

3 files changed

+65
-37
lines changed

doc/source/whatsnew/v2.2.0.rst

+1
Original file line number | Diff line number | Diff line change
@@ -277,6 +277,7 @@ Other Deprecations
277277
Performance improvements
278278
~~~~~~~~~~~~~~~~~~~~~~~~
279279
- Performance improvement in :func:`concat` with ``axis=1`` and objects with unaligned indexes (:issue:`55084`)
280+
- Performance improvement in :func:`merge_asof` when ``by`` contains more than one key (:issue:`55580`)
280281
- Performance improvement in :func:`read_stata` for files with many variables (:issue:`55515`)
281282
- Performance improvement in :func:`to_dict` on converting DataFrame to dictionary (:issue:`50990`)
282283
- Performance improvement in :meth:`DataFrame.groupby` when aggregating pyarrow timestamp and duration dtypes (:issue:`55031`)

pandas/core/reshape/merge.py

+21-36
Original file line number | Diff line number | Diff line change
@@ -9,7 +9,6 @@
99
)
1010
import datetime
1111
from functools import partial
12-
import string
1312
from typing import (
1413
TYPE_CHECKING,
1514
Literal,
@@ -90,7 +89,6 @@
9089
BaseMaskedArray,
9190
ExtensionArray,
9291
)
93-
from pandas.core.arrays._mixins import NDArrayBackedExtensionArray
9492
from pandas.core.arrays.string_ import StringDtype
9593
import pandas.core.common as com
9694
from pandas.core.construction import (
@@ -99,7 +97,10 @@
9997
)
10098
from pandas.core.frame import _merge_doc
10199
from pandas.core.indexes.api import default_index
102-
from pandas.core.sorting import is_int64_overflow_possible
100+
from pandas.core.sorting import (
101+
get_group_index,
102+
is_int64_overflow_possible,
103+
)
103104

104105
if TYPE_CHECKING:
105106
from pandas import DataFrame
@@ -2117,34 +2118,6 @@ def _convert_values_for_libjoin(
21172118
def _get_join_indexers(self) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]:
21182119
"""return the join indexers"""
21192120

2120-
def flip(xs: list[ArrayLike]) -> np.ndarray:
2121-
"""unlike np.transpose, this returns an array of tuples"""
2122-
2123-
def injection(obj: ArrayLike):
2124-
if not isinstance(obj.dtype, ExtensionDtype):
2125-
# ndarray
2126-
return obj
2127-
obj = extract_array(obj)
2128-
if isinstance(obj, NDArrayBackedExtensionArray):
2129-
# fastpath for e.g. dt64tz, categorical
2130-
return obj._ndarray
2131-
# FIXME: returning obj._values_for_argsort() here doesn't
2132-
# break in any existing test cases, but i (@jbrockmendel)
2133-
# am pretty sure it should!
2134-
# e.g.
2135-
# arr = pd.array([0, pd.NA, 255], dtype="UInt8")
2136-
# will have values_for_argsort (before GH#45434)
2137-
# np.array([0, 255, 255], dtype=np.uint8)
2138-
# and the non-injectivity should make a difference somehow
2139-
# shouldn't it?
2140-
return np.asarray(obj)
2141-
2142-
xs = [injection(x) for x in xs]
2143-
labels = list(string.ascii_lowercase[: len(xs)])
2144-
dtypes = [x.dtype for x in xs]
2145-
labeled_dtypes = list(zip(labels, dtypes))
2146-
return np.array(list(zip(*xs)), labeled_dtypes)
2147-
21482121
# values to compare
21492122
left_values = (
21502123
self.left.index._values if self.left_index else self.left_join_keys[-1]
@@ -2197,11 +2170,23 @@ def injection(obj: ArrayLike):
21972170
else:
21982171
# We get here with non-ndarrays in test_merge_by_col_tz_aware
21992172
# and test_merge_groupby_multiple_column_with_categorical_column
2200-
lbv = flip(left_by_values)
2201-
rbv = flip(right_by_values)
2202-
lbv = ensure_object(lbv)
2203-
rbv = ensure_object(rbv)
2204-
2173+
mapped = [
2174+
_factorize_keys(
2175+
left_by_values[n],
2176+
right_by_values[n],
2177+
sort=False,
2178+
how="left",
2179+
)
2180+
for n in range(len(left_by_values))
2181+
]
2182+
arrs = [np.concatenate(m[:2]) for m in mapped]
2183+
shape = tuple(m[2] for m in mapped)
2184+
group_index = get_group_index(
2185+
arrs, shape=shape, sort=False, xnull=False
2186+
)
2187+
left_len = len(left_by_values[0])
2188+
lbv = group_index[:left_len]
2189+
rbv = group_index[left_len:]
22052190
# error: Incompatible types in assignment (expression has type
22062191
# "Union[ndarray[Any, dtype[Any]], ndarray[Any, dtype[object_]]]",
22072192
# variable has type "List[Union[Union[ExtensionArray,

pandas/tests/reshape/merge/test_merge_asof.py

+43-1
Original file line number | Diff line number | Diff line change
@@ -353,7 +353,8 @@ def test_multiby(self):
353353
result = merge_asof(trades, quotes, on="time", by=["ticker", "exch"])
354354
tm.assert_frame_equal(result, expected)
355355

356-
def test_multiby_heterogeneous_types(self):
356+
@pytest.mark.parametrize("dtype", ["object", "string"])
357+
def test_multiby_heterogeneous_types(self, dtype):
357358
# GH13936
358359
trades = pd.DataFrame(
359360
{
@@ -373,6 +374,7 @@ def test_multiby_heterogeneous_types(self):
373374
},
374375
columns=["time", "ticker", "exch", "price", "quantity"],
375376
)
377+
trades = trades.astype({"ticker": dtype, "exch": dtype})
376378

377379
quotes = pd.DataFrame(
378380
{
@@ -393,6 +395,7 @@ def test_multiby_heterogeneous_types(self):
393395
},
394396
columns=["time", "ticker", "exch", "bid", "ask"],
395397
)
398+
quotes = quotes.astype({"ticker": dtype, "exch": dtype})
396399

397400
expected = pd.DataFrame(
398401
{
@@ -414,6 +417,7 @@ def test_multiby_heterogeneous_types(self):
414417
},
415418
columns=["time", "ticker", "exch", "price", "quantity", "bid", "ask"],
416419
)
420+
expected = expected.astype({"ticker": dtype, "exch": dtype})
417421

418422
result = merge_asof(trades, quotes, on="time", by=["ticker", "exch"])
419423
tm.assert_frame_equal(result, expected)
@@ -1666,3 +1670,41 @@ def test_merge_asof_read_only_ndarray():
16661670
result = merge_asof(left, right, left_index=True, right_index=True)
16671671
expected = pd.DataFrame({"left": [2], "right": [1]}, index=[2])
16681672
tm.assert_frame_equal(result, expected)
1673+
1674+
1675+
def test_merge_asof_multiby_with_categorical():
1676+
# GH 43541
1677+
left = pd.DataFrame(
1678+
{
1679+
"c1": pd.Categorical(["a", "a", "b", "b"], categories=["a", "b"]),
1680+
"c2": ["x"] * 4,
1681+
"t": [1] * 4,
1682+
"v": range(4),
1683+
}
1684+
)
1685+
right = pd.DataFrame(
1686+
{
1687+
"c1": pd.Categorical(["b", "b"], categories=["b", "a"]),
1688+
"c2": ["x"] * 2,
1689+
"t": [1, 2],
1690+
"v": range(2),
1691+
}
1692+
)
1693+
result = merge_asof(
1694+
left,
1695+
right,
1696+
by=["c1", "c2"],
1697+
on="t",
1698+
direction="forward",
1699+
suffixes=["_left", "_right"],
1700+
)
1701+
expected = pd.DataFrame(
1702+
{
1703+
"c1": pd.Categorical(["a", "a", "b", "b"], categories=["a", "b"]),
1704+
"c2": ["x"] * 4,
1705+
"t": [1] * 4,
1706+
"v_left": range(4),
1707+
"v_right": [np.nan, np.nan, 0.0, 0.0],
1708+
}
1709+
)
1710+
tm.assert_frame_equal(result, expected)

0 commit comments

Comments
 (0)