REF: check monotonicity inside _can_use_libjoin (#55342)

jbrockmendel · lukemanley · web-flow · commit ee8f3358007b · 2023-12-27T11:09:17.000-08:00
* REF: fix can_use_libjoin check

* DOC: docstring for can_use_libjoin

* Make can_use_libjoin checks more-correct

* avoid allocating mapping in monotonic cases

* fix categorical memory usage tests

* catch decimal.InvalidOperation

---------

Co-authored-by: Luke Manley &lt;lukemanley@gmail.com&gt;
diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx
@@ -43,6 +43,8 @@ from pandas._libs.missing cimport (
     is_matching_na,
 )
 
+from decimal import InvalidOperation
+
 # Defines shift of MultiIndex codes to avoid negative codes (missing values)
 multiindex_nulls_shift = 2
 
@@ -248,6 +250,10 @@ cdef class IndexEngine:
 
     @property
     def is_unique(self) -> bool:
+        # for why we check is_monotonic_increasing here, see
+        # https://github.com/pandas-dev/pandas/pull/55342#discussion_r1361405781
+        if self.need_monotonic_check:
+            self.is_monotonic_increasing
         if self.need_unique_check:
             self._do_unique_check()
 
@@ -281,7 +287,7 @@ cdef class IndexEngine:
                 values = self.values
                 self.monotonic_inc, self.monotonic_dec, is_strict_monotonic = \
                     self._call_monotonic(values)
-            except TypeError:
+            except (TypeError, InvalidOperation):
                 self.monotonic_inc = 0
                 self.monotonic_dec = 0
                 is_strict_monotonic = 0
@@ -843,6 +849,10 @@ cdef class SharedEngine:
 
     @property
     def is_unique(self) -> bool:
+        # for why we check is_monotonic_increasing here, see
+        # https://github.com/pandas-dev/pandas/pull/55342#discussion_r1361405781
+        if self.need_monotonic_check:
+            self.is_monotonic_increasing
         if self.need_unique_check:
             arr = self.values.unique()
             self.unique = len(arr) == len(self.values)
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -3711,7 +3711,7 @@ def memory_usage(self, index: bool = True, deep: bool = False) -> Series:
         many repeated values.
 
         >>> df['object'].astype('category').memory_usage(deep=True)
-        5244
+        5136
         """
         result = self._constructor_sliced(
             [c.memory_usage(index=False, deep=deep) for col, c in self.items()],
diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
@@ -3382,9 +3382,7 @@ def _union(self, other: Index, sort: bool | None):
 
         if (
             sort in (None, True)
-            and self.is_monotonic_increasing
-            and other.is_monotonic_increasing
-            and not (self.has_duplicates and other.has_duplicates)
+            and (self.is_unique or other.is_unique)
             and self._can_use_libjoin
             and other._can_use_libjoin
         ):
@@ -3536,12 +3534,7 @@ def _intersection(self, other: Index, sort: bool = False):
         """
         intersection specialized to the case with matching dtypes.
         """
-        if (
-            self.is_monotonic_increasing
-            and other.is_monotonic_increasing
-            and self._can_use_libjoin
-            and other._can_use_libjoin
-        ):
+        if self._can_use_libjoin and other._can_use_libjoin:
             try:
                 res_indexer, indexer, _ = self._inner_indexer(other)
             except TypeError:
@@ -4980,7 +4973,10 @@ def _get_leaf_sorter(labels: list[np.ndarray]) -> npt.NDArray[np.intp]:
     def _join_monotonic(
         self, other: Index, how: JoinHow = "left"
     ) -> tuple[Index, npt.NDArray[np.intp] | None, npt.NDArray[np.intp] | None]:
-        # We only get here with matching dtypes and both monotonic increasing
+        # We only get here with (caller is responsible for ensuring):
+        #  1) matching dtypes
+        #  2) both monotonic increasing
+        #  3) other.is_unique or self.is_unique
         assert other.dtype == self.dtype
         assert self._can_use_libjoin and other._can_use_libjoin
 
@@ -5062,6 +5058,10 @@ def _can_use_libjoin(self) -> bool:
         making a copy. If we cannot, this negates the performance benefit
         of using libjoin.
         """
+        if not self.is_monotonic_increasing:
+            # The libjoin functions all assume monotonicity.
+            return False
+
         if type(self) is Index:
             # excludes EAs, but include masks, we get here with monotonic
             # values only, meaning no NA
diff --git a/pandas/tests/extension/test_categorical.py b/pandas/tests/extension/test_categorical.py
@@ -75,11 +75,6 @@ def data_for_grouping():
 
 
 class TestCategorical(base.ExtensionTests):
-    @pytest.mark.xfail(reason="Memory usage doesn't match")
-    def test_memory_usage(self, data):
-        # TODO: Is this deliberate?
-        super().test_memory_usage(data)
-
     def test_contains(self, data, data_missing):
         # GH-37867
         # na value handling in Categorical.__contains__ is deprecated.