diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index c8c27f2f2e178..f21cbdfa0c3a4 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -277,6 +277,7 @@ Other Deprecations Performance improvements ~~~~~~~~~~~~~~~~~~~~~~~~ - Performance improvement in :func:`concat` with ``axis=1`` and objects with unaligned indexes (:issue:`55084`) +- Performance improvement in :func:`merge_asof` when ``by`` contains more than one key (:issue:`55580`) - Performance improvement in :func:`read_stata` for files with many variables (:issue:`55515`) - Performance improvement in :func:`to_dict` on converting DataFrame to dictionary (:issue:`50990`) - Performance improvement in :meth:`DataFrame.groupby` when aggregating pyarrow timestamp and duration dtypes (:issue:`55031`) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index ba6579a739f54..9cdf513ad7da3 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -9,7 +9,6 @@ ) import datetime from functools import partial -import string from typing import ( TYPE_CHECKING, Literal, @@ -90,7 +89,6 @@ BaseMaskedArray, ExtensionArray, ) -from pandas.core.arrays._mixins import NDArrayBackedExtensionArray from pandas.core.arrays.string_ import StringDtype import pandas.core.common as com from pandas.core.construction import ( @@ -99,7 +97,10 @@ ) from pandas.core.frame import _merge_doc from pandas.core.indexes.api import default_index -from pandas.core.sorting import is_int64_overflow_possible +from pandas.core.sorting import ( + get_group_index, + is_int64_overflow_possible, +) if TYPE_CHECKING: from pandas import DataFrame @@ -2117,34 +2118,6 @@ def _convert_values_for_libjoin( def _get_join_indexers(self) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: """return the join indexers""" - def flip(xs: list[ArrayLike]) -> np.ndarray: - """unlike np.transpose, this returns an array of tuples""" - - def injection(obj: ArrayLike): - if not isinstance(obj.dtype, ExtensionDtype): - # ndarray - return obj - obj = extract_array(obj) - if isinstance(obj, NDArrayBackedExtensionArray): - # fastpath for e.g. dt64tz, categorical - return obj._ndarray - # FIXME: returning obj._values_for_argsort() here doesn't - # break in any existing test cases, but i (@jbrockmendel) - # am pretty sure it should! - # e.g. - # arr = pd.array([0, pd.NA, 255], dtype="UInt8") - # will have values_for_argsort (before GH#45434) - # np.array([0, 255, 255], dtype=np.uint8) - # and the non-injectivity should make a difference somehow - # shouldn't it? - return np.asarray(obj) - - xs = [injection(x) for x in xs] - labels = list(string.ascii_lowercase[: len(xs)]) - dtypes = [x.dtype for x in xs] - labeled_dtypes = list(zip(labels, dtypes)) - return np.array(list(zip(*xs)), labeled_dtypes) - # values to compare left_values = ( self.left.index._values if self.left_index else self.left_join_keys[-1] @@ -2197,11 +2170,23 @@ def injection(obj: ArrayLike): else: # We get here with non-ndarrays in test_merge_by_col_tz_aware # and test_merge_groupby_multiple_column_with_categorical_column - lbv = flip(left_by_values) - rbv = flip(right_by_values) - lbv = ensure_object(lbv) - rbv = ensure_object(rbv) - + mapped = [ + _factorize_keys( + left_by_values[n], + right_by_values[n], + sort=False, + how="left", + ) + for n in range(len(left_by_values)) + ] + arrs = [np.concatenate(m[:2]) for m in mapped] + shape = tuple(m[2] for m in mapped) + group_index = get_group_index( + arrs, shape=shape, sort=False, xnull=False + ) + left_len = len(left_by_values[0]) + lbv = group_index[:left_len] + rbv = group_index[left_len:] # error: Incompatible types in assignment (expression has type # "Union[ndarray[Any, dtype[Any]], ndarray[Any, dtype[object_]]]", # variable has type "List[Union[Union[ExtensionArray, diff --git a/pandas/tests/reshape/merge/test_merge_asof.py b/pandas/tests/reshape/merge/test_merge_asof.py index d4cb89dadde9b..8ada42898f947 100644 --- a/pandas/tests/reshape/merge/test_merge_asof.py +++ b/pandas/tests/reshape/merge/test_merge_asof.py @@ -353,7 +353,8 @@ def test_multiby(self): result = merge_asof(trades, quotes, on="time", by=["ticker", "exch"]) tm.assert_frame_equal(result, expected) - def test_multiby_heterogeneous_types(self): + @pytest.mark.parametrize("dtype", ["object", "string"]) + def test_multiby_heterogeneous_types(self, dtype): # GH13936 trades = pd.DataFrame( { @@ -373,6 +374,7 @@ def test_multiby_heterogeneous_types(self): }, columns=["time", "ticker", "exch", "price", "quantity"], ) + trades = trades.astype({"ticker": dtype, "exch": dtype}) quotes = pd.DataFrame( { @@ -393,6 +395,7 @@ def test_multiby_heterogeneous_types(self): }, columns=["time", "ticker", "exch", "bid", "ask"], ) + quotes = quotes.astype({"ticker": dtype, "exch": dtype}) expected = pd.DataFrame( { @@ -414,6 +417,7 @@ def test_multiby_heterogeneous_types(self): }, columns=["time", "ticker", "exch", "price", "quantity", "bid", "ask"], ) + expected = expected.astype({"ticker": dtype, "exch": dtype}) result = merge_asof(trades, quotes, on="time", by=["ticker", "exch"]) tm.assert_frame_equal(result, expected) @@ -1666,3 +1670,41 @@ def test_merge_asof_read_only_ndarray(): result = merge_asof(left, right, left_index=True, right_index=True) expected = pd.DataFrame({"left": [2], "right": [1]}, index=[2]) tm.assert_frame_equal(result, expected) + + +def test_merge_asof_multiby_with_categorical(): + # GH 43541 + left = pd.DataFrame( + { + "c1": pd.Categorical(["a", "a", "b", "b"], categories=["a", "b"]), + "c2": ["x"] * 4, + "t": [1] * 4, + "v": range(4), + } + ) + right = pd.DataFrame( + { + "c1": pd.Categorical(["b", "b"], categories=["b", "a"]), + "c2": ["x"] * 2, + "t": [1, 2], + "v": range(2), + } + ) + result = merge_asof( + left, + right, + by=["c1", "c2"], + on="t", + direction="forward", + suffixes=["_left", "_right"], + ) + expected = pd.DataFrame( + { + "c1": pd.Categorical(["a", "a", "b", "b"], categories=["a", "b"]), + "c2": ["x"] * 4, + "t": [1] * 4, + "v_left": range(4), + "v_right": [np.nan, np.nan, 0.0, 0.0], + } + ) + tm.assert_frame_equal(result, expected)