Skip to content

PERF: skip libjoin fastpath for MultiIndex #54765

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Aug 28, 2023
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 15 additions & 27 deletions pandas/core/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,7 @@
from pandas.core.dtypes.generic import (
ABCDataFrame,
ABCDatetimeIndex,
ABCIntervalIndex,
ABCMultiIndex,
ABCPeriodIndex,
ABCSeries,
Expand Down Expand Up @@ -3492,8 +3493,6 @@ def _intersection(self, other: Index, sort: bool = False):
and other.is_monotonic_increasing
and self._can_use_libjoin
and other._can_use_libjoin
and not isinstance(self, ABCMultiIndex)
and not isinstance(other, ABCMultiIndex)
):
try:
res_indexer, indexer, _ = self._inner_indexer(other)
Expand Down Expand Up @@ -4632,28 +4631,13 @@ def join(

_validate_join_method(how)

if not self.is_unique and not other.is_unique:
return self._join_non_unique(other, how=how, sort=sort)
elif not self.is_unique or not other.is_unique:
if self.is_monotonic_increasing and other.is_monotonic_increasing:
# Note: 2023-08-15 we *do* have tests that get here with
# Categorical, string[python] (can use libjoin)
# and Interval (cannot)
if self._can_use_libjoin and other._can_use_libjoin:
# otherwise we will fall through to _join_via_get_indexer
# GH#39133
# go through object dtype for ea till engine is supported properly
return self._join_monotonic(other, how=how)
else:
return self._join_non_unique(other, how=how, sort=sort)
elif (
# GH48504: exclude MultiIndex to avoid going through MultiIndex._values
self.is_monotonic_increasing
if (
not isinstance(self.dtype, CategoricalDtype)
and self.is_monotonic_increasing
and other.is_monotonic_increasing
and self._can_use_libjoin
and other._can_use_libjoin
and not isinstance(self, ABCMultiIndex)
and not isinstance(self.dtype, CategoricalDtype)
and (self.is_unique or other.is_unique)
):
# Categorical is monotonic if data are ordered as categories, but join can
# not handle this in case of not lexicographically monotonic GH#38502
Expand All @@ -4662,6 +4646,8 @@ def join(
except TypeError:
# object dtype; non-comparable objects
pass
elif not self.is_unique or not other.is_unique:
return self._join_non_unique(other, how=how, sort=sort)

return self._join_via_get_indexer(other, how, sort)

Expand Down Expand Up @@ -4797,6 +4783,9 @@ def _join_non_unique(
join_idx = self.take(left_idx)
right = other.take(right_idx)
join_index = join_idx.putmask(mask, right)
if isinstance(join_index, ABCMultiIndex) and how == "outer":
# test_join_index_levels
join_index = join_index._sort_levels_monotonic()
return join_index, left_idx, right_idx

@final
Expand Down Expand Up @@ -5042,10 +5031,10 @@ def _can_use_libjoin(self) -> bool:
or isinstance(self._values, (ArrowExtensionArray, BaseMaskedArray))
or self.dtype == "string[python]"
)
# For IntervalIndex, the conversion to numpy converts
# to object dtype, which negates the performance benefit of libjoin
# TODO: exclude RangeIndex and MultiIndex as these also make copies?
return not isinstance(self.dtype, IntervalDtype)
# Exclude index types where the conversion to numpy converts to object dtype,
# which negates the performance benefit of libjoin
# TODO: exclude RangeIndex? Seems to break test_concat_datetime_timezone
return not isinstance(self, (ABCIntervalIndex, ABCMultiIndex))

# --------------------------------------------------------------------
# Uncategorized Methods
Expand Down Expand Up @@ -5180,8 +5169,7 @@ def _get_join_target(self) -> np.ndarray:
# present
return self._values.to_numpy()

# TODO: exclude ABCRangeIndex, ABCMultiIndex cases here as those create
# copies.
# TODO: exclude ABCRangeIndex case here as it copies
target = self._get_engine_target()
if not isinstance(target, np.ndarray):
raise ValueError("_can_use_libjoin should return False.")
Expand Down