diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index a49db84450bb3..1fbf4381cb64e 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -124,6 +124,7 @@ from pandas.core.dtypes.generic import ( ABCDataFrame, ABCDatetimeIndex, + ABCIntervalIndex, ABCMultiIndex, ABCPeriodIndex, ABCSeries, @@ -3492,8 +3493,6 @@ def _intersection(self, other: Index, sort: bool = False): and other.is_monotonic_increasing and self._can_use_libjoin and other._can_use_libjoin - and not isinstance(self, ABCMultiIndex) - and not isinstance(other, ABCMultiIndex) ): try: res_indexer, indexer, _ = self._inner_indexer(other) @@ -4632,28 +4631,13 @@ def join( _validate_join_method(how) - if not self.is_unique and not other.is_unique: - return self._join_non_unique(other, how=how, sort=sort) - elif not self.is_unique or not other.is_unique: - if self.is_monotonic_increasing and other.is_monotonic_increasing: - # Note: 2023-08-15 we *do* have tests that get here with - # Categorical, string[python] (can use libjoin) - # and Interval (cannot) - if self._can_use_libjoin and other._can_use_libjoin: - # otherwise we will fall through to _join_via_get_indexer - # GH#39133 - # go through object dtype for ea till engine is supported properly - return self._join_monotonic(other, how=how) - else: - return self._join_non_unique(other, how=how, sort=sort) - elif ( - # GH48504: exclude MultiIndex to avoid going through MultiIndex._values - self.is_monotonic_increasing + if ( + not isinstance(self.dtype, CategoricalDtype) + and self.is_monotonic_increasing and other.is_monotonic_increasing and self._can_use_libjoin and other._can_use_libjoin - and not isinstance(self, ABCMultiIndex) - and not isinstance(self.dtype, CategoricalDtype) + and (self.is_unique or other.is_unique) ): # Categorical is monotonic if data are ordered as categories, but join can # not handle this in case of not lexicographically monotonic GH#38502 @@ -4662,6 +4646,8 @@ def join( except TypeError: # object dtype; non-comparable objects pass + elif not self.is_unique or not other.is_unique: + return self._join_non_unique(other, how=how, sort=sort) return self._join_via_get_indexer(other, how, sort) @@ -4797,6 +4783,9 @@ def _join_non_unique( join_idx = self.take(left_idx) right = other.take(right_idx) join_index = join_idx.putmask(mask, right) + if isinstance(join_index, ABCMultiIndex) and how == "outer": + # test_join_index_levels + join_index = join_index._sort_levels_monotonic() return join_index, left_idx, right_idx @final @@ -5042,10 +5031,10 @@ def _can_use_libjoin(self) -> bool: or isinstance(self._values, (ArrowExtensionArray, BaseMaskedArray)) or self.dtype == "string[python]" ) - # For IntervalIndex, the conversion to numpy converts - # to object dtype, which negates the performance benefit of libjoin - # TODO: exclude RangeIndex and MultiIndex as these also make copies? - return not isinstance(self.dtype, IntervalDtype) + # Exclude index types where the conversion to numpy converts to object dtype, + # which negates the performance benefit of libjoin + # TODO: exclude RangeIndex? Seems to break test_concat_datetime_timezone + return not isinstance(self, (ABCIntervalIndex, ABCMultiIndex)) # -------------------------------------------------------------------- # Uncategorized Methods @@ -5180,8 +5169,7 @@ def _get_join_target(self) -> np.ndarray: # present return self._values.to_numpy() - # TODO: exclude ABCRangeIndex, ABCMultiIndex cases here as those create - # copies. + # TODO: exclude ABCRangeIndex case here as it copies target = self._get_engine_target() if not isinstance(target, np.ndarray): raise ValueError("_can_use_libjoin should return False.")