Skip to content

BUG/PERF: merge_asof with multiple "by" keys #55580

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 9 commits into from
Oct 22, 2023
1 change: 1 addition & 0 deletions doc/source/whatsnew/v2.2.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -277,6 +277,7 @@ Other Deprecations
Performance improvements
~~~~~~~~~~~~~~~~~~~~~~~~
- Performance improvement in :func:`concat` with ``axis=1`` and objects with unaligned indexes (:issue:`55084`)
- Performance improvement in :func:`merge_asof` when ``by`` contains more than one key (:issue:`55580`)
- Performance improvement in :func:`read_stata` for files with many variables (:issue:`55515`)
- Performance improvement in :func:`to_dict` on converting DataFrame to dictionary (:issue:`50990`)
- Performance improvement in :meth:`DataFrame.groupby` when aggregating pyarrow timestamp and duration dtypes (:issue:`55031`)
Expand Down
57 changes: 21 additions & 36 deletions pandas/core/reshape/merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
)
import datetime
from functools import partial
import string
from typing import (
TYPE_CHECKING,
Literal,
Expand Down Expand Up @@ -90,7 +89,6 @@
BaseMaskedArray,
ExtensionArray,
)
from pandas.core.arrays._mixins import NDArrayBackedExtensionArray
from pandas.core.arrays.string_ import StringDtype
import pandas.core.common as com
from pandas.core.construction import (
Expand All @@ -99,7 +97,10 @@
)
from pandas.core.frame import _merge_doc
from pandas.core.indexes.api import default_index
from pandas.core.sorting import is_int64_overflow_possible
from pandas.core.sorting import (
get_group_index,
is_int64_overflow_possible,
)

if TYPE_CHECKING:
from pandas import DataFrame
Expand Down Expand Up @@ -2117,34 +2118,6 @@ def _convert_values_for_libjoin(
def _get_join_indexers(self) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]:
"""return the join indexers"""

def flip(xs: list[ArrayLike]) -> np.ndarray:
"""unlike np.transpose, this returns an array of tuples"""

def injection(obj: ArrayLike):
if not isinstance(obj.dtype, ExtensionDtype):
# ndarray
return obj
obj = extract_array(obj)
if isinstance(obj, NDArrayBackedExtensionArray):
# fastpath for e.g. dt64tz, categorical
return obj._ndarray
# FIXME: returning obj._values_for_argsort() here doesn't
# break in any existing test cases, but i (@jbrockmendel)
# am pretty sure it should!
# e.g.
# arr = pd.array([0, pd.NA, 255], dtype="UInt8")
# will have values_for_argsort (before GH#45434)
# np.array([0, 255, 255], dtype=np.uint8)
# and the non-injectivity should make a difference somehow
# shouldn't it?
return np.asarray(obj)

xs = [injection(x) for x in xs]
labels = list(string.ascii_lowercase[: len(xs)])
dtypes = [x.dtype for x in xs]
labeled_dtypes = list(zip(labels, dtypes))
return np.array(list(zip(*xs)), labeled_dtypes)

# values to compare
left_values = (
self.left.index._values if self.left_index else self.left_join_keys[-1]
Expand Down Expand Up @@ -2197,11 +2170,23 @@ def injection(obj: ArrayLike):
else:
# We get here with non-ndarrays in test_merge_by_col_tz_aware
# and test_merge_groupby_multiple_column_with_categorical_column
lbv = flip(left_by_values)
rbv = flip(right_by_values)
lbv = ensure_object(lbv)
rbv = ensure_object(rbv)

mapped = [
_factorize_keys(
Copy link
Member

@mroeschke mroeschke Oct 19, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm assuming this work alright with ExtensionArrays? Do we have tests for merging on by that is a extension column?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, this will work with EAs. I've parameterized an existing test to include EA dtypes.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added an additional test as this seems to fix #43541

left_by_values[n],
right_by_values[n],
sort=False,
how="left",
)
for n in range(len(left_by_values))
]
arrs = [np.concatenate(m[:2]) for m in mapped]
shape = tuple(m[2] for m in mapped)
group_index = get_group_index(
arrs, shape=shape, sort=False, xnull=False
)
left_len = len(left_by_values[0])
lbv = group_index[:left_len]
rbv = group_index[left_len:]
# error: Incompatible types in assignment (expression has type
# "Union[ndarray[Any, dtype[Any]], ndarray[Any, dtype[object_]]]",
# variable has type "List[Union[Union[ExtensionArray,
Expand Down
44 changes: 43 additions & 1 deletion pandas/tests/reshape/merge/test_merge_asof.py
Original file line number Diff line number Diff line change
Expand Up @@ -353,7 +353,8 @@ def test_multiby(self):
result = merge_asof(trades, quotes, on="time", by=["ticker", "exch"])
tm.assert_frame_equal(result, expected)

def test_multiby_heterogeneous_types(self):
@pytest.mark.parametrize("dtype", ["object", "string"])
def test_multiby_heterogeneous_types(self, dtype):
# GH13936
trades = pd.DataFrame(
{
Expand All @@ -373,6 +374,7 @@ def test_multiby_heterogeneous_types(self):
},
columns=["time", "ticker", "exch", "price", "quantity"],
)
trades = trades.astype({"ticker": dtype, "exch": dtype})

quotes = pd.DataFrame(
{
Expand All @@ -393,6 +395,7 @@ def test_multiby_heterogeneous_types(self):
},
columns=["time", "ticker", "exch", "bid", "ask"],
)
quotes = quotes.astype({"ticker": dtype, "exch": dtype})

expected = pd.DataFrame(
{
Expand All @@ -414,6 +417,7 @@ def test_multiby_heterogeneous_types(self):
},
columns=["time", "ticker", "exch", "price", "quantity", "bid", "ask"],
)
expected = expected.astype({"ticker": dtype, "exch": dtype})

result = merge_asof(trades, quotes, on="time", by=["ticker", "exch"])
tm.assert_frame_equal(result, expected)
Expand Down Expand Up @@ -1666,3 +1670,41 @@ def test_merge_asof_read_only_ndarray():
result = merge_asof(left, right, left_index=True, right_index=True)
expected = pd.DataFrame({"left": [2], "right": [1]}, index=[2])
tm.assert_frame_equal(result, expected)


def test_merge_asof_multiby_with_categorical():
# GH 43541
left = pd.DataFrame(
{
"c1": pd.Categorical(["a", "a", "b", "b"], categories=["a", "b"]),
"c2": ["x"] * 4,
"t": [1] * 4,
"v": range(4),
}
)
right = pd.DataFrame(
{
"c1": pd.Categorical(["b", "b"], categories=["b", "a"]),
"c2": ["x"] * 2,
"t": [1, 2],
"v": range(2),
}
)
result = merge_asof(
left,
right,
by=["c1", "c2"],
on="t",
direction="forward",
suffixes=["_left", "_right"],
)
expected = pd.DataFrame(
{
"c1": pd.Categorical(["a", "a", "b", "b"], categories=["a", "b"]),
"c2": ["x"] * 4,
"t": [1] * 4,
"v_left": range(4),
"v_right": [np.nan, np.nan, 0.0, 0.0],
}
)
tm.assert_frame_equal(result, expected)