BUG/PERF: merge_asof with multiple "by" keys (#55580)

lukemanley · web-flow · commit ad3f3f78b02d · 2023-10-22T12:47:31.000-07:00
* improve perf in merge_asof with multiple "by" keys

* whatsnew

* add test for EA dtypes

* fix test

* use how=left in factorize_keys

* add test
diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst
@@ -277,6 +277,7 @@ Other Deprecations
 Performance improvements
 ~~~~~~~~~~~~~~~~~~~~~~~~
 - Performance improvement in :func:`concat` with ``axis=1`` and objects with unaligned indexes (:issue:`55084`)
+- Performance improvement in :func:`merge_asof` when ``by`` contains more than one key (:issue:`55580`)
 - Performance improvement in :func:`read_stata` for files with many variables (:issue:`55515`)
 - Performance improvement in :func:`to_dict` on converting DataFrame to dictionary (:issue:`50990`)
 - Performance improvement in :meth:`DataFrame.groupby` when aggregating pyarrow timestamp and duration dtypes (:issue:`55031`)
diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py
@@ -9,7 +9,6 @@
 )
 import datetime
 from functools import partial
-import string
 from typing import (
     TYPE_CHECKING,
     Literal,
@@ -90,7 +89,6 @@
     BaseMaskedArray,
     ExtensionArray,
 )
-from pandas.core.arrays._mixins import NDArrayBackedExtensionArray
 from pandas.core.arrays.string_ import StringDtype
 import pandas.core.common as com
 from pandas.core.construction import (
@@ -99,7 +97,10 @@
 )
 from pandas.core.frame import _merge_doc
 from pandas.core.indexes.api import default_index
-from pandas.core.sorting import is_int64_overflow_possible
+from pandas.core.sorting import (
+    get_group_index,
+    is_int64_overflow_possible,
+)
 
 if TYPE_CHECKING:
     from pandas import DataFrame
@@ -2117,34 +2118,6 @@ def _convert_values_for_libjoin(
     def _get_join_indexers(self) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]:
         """return the join indexers"""
 
-        def flip(xs: list[ArrayLike]) -> np.ndarray:
-            """unlike np.transpose, this returns an array of tuples"""
-
-            def injection(obj: ArrayLike):
-                if not isinstance(obj.dtype, ExtensionDtype):
-                    # ndarray
-                    return obj
-                obj = extract_array(obj)
-                if isinstance(obj, NDArrayBackedExtensionArray):
-                    # fastpath for e.g. dt64tz, categorical
-                    return obj._ndarray
-                # FIXME: returning obj._values_for_argsort() here doesn't
-                #  break in any existing test cases, but i (@jbrockmendel)
-                #  am pretty sure it should!
-                #  e.g.
-                #  arr = pd.array([0, pd.NA, 255], dtype="UInt8")
-                #  will have values_for_argsort (before GH#45434)
-                #  np.array([0, 255, 255], dtype=np.uint8)
-                #  and the non-injectivity should make a difference somehow
-                #  shouldn't it?
-                return np.asarray(obj)
-
-            xs = [injection(x) for x in xs]
-            labels = list(string.ascii_lowercase[: len(xs)])
-            dtypes = [x.dtype for x in xs]
-            labeled_dtypes = list(zip(labels, dtypes))
-            return np.array(list(zip(*xs)), labeled_dtypes)
-
         # values to compare
         left_values = (
             self.left.index._values if self.left_index else self.left_join_keys[-1]
@@ -2197,11 +2170,23 @@ def injection(obj: ArrayLike):
             else:
                 # We get here with non-ndarrays in test_merge_by_col_tz_aware
                 #  and test_merge_groupby_multiple_column_with_categorical_column
-                lbv = flip(left_by_values)
-                rbv = flip(right_by_values)
-                lbv = ensure_object(lbv)
-                rbv = ensure_object(rbv)
-
+                mapped = [
+                    _factorize_keys(
+                        left_by_values[n],
+                        right_by_values[n],
+                        sort=False,
+                        how="left",
+                    )
+                    for n in range(len(left_by_values))
+                ]
+                arrs = [np.concatenate(m[:2]) for m in mapped]
+                shape = tuple(m[2] for m in mapped)
+                group_index = get_group_index(
+                    arrs, shape=shape, sort=False, xnull=False
+                )
+                left_len = len(left_by_values[0])
+                lbv = group_index[:left_len]
+                rbv = group_index[left_len:]
             # error: Incompatible types in assignment (expression has type
             # "Union[ndarray[Any, dtype[Any]], ndarray[Any, dtype[object_]]]",
             # variable has type "List[Union[Union[ExtensionArray,
diff --git a/pandas/tests/reshape/merge/test_merge_asof.py b/pandas/tests/reshape/merge/test_merge_asof.py
@@ -353,7 +353,8 @@ def test_multiby(self):
         result = merge_asof(trades, quotes, on="time", by=["ticker", "exch"])
         tm.assert_frame_equal(result, expected)
 
-    def test_multiby_heterogeneous_types(self):
+    @pytest.mark.parametrize("dtype", ["object", "string"])
+    def test_multiby_heterogeneous_types(self, dtype):
         # GH13936
         trades = pd.DataFrame(
             {
@@ -373,6 +374,7 @@ def test_multiby_heterogeneous_types(self):
             },
             columns=["time", "ticker", "exch", "price", "quantity"],
         )
+        trades = trades.astype({"ticker": dtype, "exch": dtype})
 
         quotes = pd.DataFrame(
             {
@@ -393,6 +395,7 @@ def test_multiby_heterogeneous_types(self):
             },
             columns=["time", "ticker", "exch", "bid", "ask"],
         )
+        quotes = quotes.astype({"ticker": dtype, "exch": dtype})
 
         expected = pd.DataFrame(
             {
@@ -414,6 +417,7 @@ def test_multiby_heterogeneous_types(self):
             },
             columns=["time", "ticker", "exch", "price", "quantity", "bid", "ask"],
         )
+        expected = expected.astype({"ticker": dtype, "exch": dtype})
 
         result = merge_asof(trades, quotes, on="time", by=["ticker", "exch"])
         tm.assert_frame_equal(result, expected)
@@ -1666,3 +1670,41 @@ def test_merge_asof_read_only_ndarray():
     result = merge_asof(left, right, left_index=True, right_index=True)
     expected = pd.DataFrame({"left": [2], "right": [1]}, index=[2])
     tm.assert_frame_equal(result, expected)
+
+
+def test_merge_asof_multiby_with_categorical():
+    # GH 43541
+    left = pd.DataFrame(
+        {
+            "c1": pd.Categorical(["a", "a", "b", "b"], categories=["a", "b"]),
+            "c2": ["x"] * 4,
+            "t": [1] * 4,
+            "v": range(4),
+        }
+    )
+    right = pd.DataFrame(
+        {
+            "c1": pd.Categorical(["b", "b"], categories=["b", "a"]),
+            "c2": ["x"] * 2,
+            "t": [1, 2],
+            "v": range(2),
+        }
+    )
+    result = merge_asof(
+        left,
+        right,
+        by=["c1", "c2"],
+        on="t",
+        direction="forward",
+        suffixes=["_left", "_right"],
+    )
+    expected = pd.DataFrame(
+        {
+            "c1": pd.Categorical(["a", "a", "b", "b"], categories=["a", "b"]),
+            "c2": ["x"] * 4,
+            "t": [1] * 4,
+            "v_left": range(4),
+            "v_right": [np.nan, np.nan, 0.0, 0.0],
+        }
+    )
+    tm.assert_frame_equal(result, expected)