Skip to content

Commit ee8f335

Browse files
REF: check monotonicity inside _can_use_libjoin (#55342)
* REF: fix can_use_libjoin check
* DOC: docstring for can_use_libjoin
* Make can_use_libjoin checks more-correct
* avoid allocating mapping in monotonic cases
* fix categorical memory usage tests
* catch decimal.InvalidOperation

---------

Co-authored-by: Luke Manley <[email protected]>
1 parent 44330e8 commit ee8f335

File tree

4 files changed

+22
-17
lines changed

4 files changed

+22
-17
lines changed

pandas/_libs/index.pyx

+11-1
Original file line number | Diff line number | Diff line change
@@ -43,6 +43,8 @@ from pandas._libs.missing cimport (
4343
is_matching_na,
4444
)
4545

46+
from decimal import InvalidOperation
47+
4648
# Defines shift of MultiIndex codes to avoid negative codes (missing values)
4749
multiindex_nulls_shift = 2
4850

@@ -248,6 +250,10 @@ cdef class IndexEngine:
248250

249251
@property
250252
def is_unique(self) -> bool:
253+
# for why we check is_monotonic_increasing here, see
254+
# https://github.com/pandas-dev/pandas/pull/55342#discussion_r1361405781
255+
if self.need_monotonic_check:
256+
self.is_monotonic_increasing
251257
if self.need_unique_check:
252258
self._do_unique_check()
253259

@@ -281,7 +287,7 @@ cdef class IndexEngine:
281287
values = self.values
282288
self.monotonic_inc, self.monotonic_dec, is_strict_monotonic = \
283289
self._call_monotonic(values)
284-
except TypeError:
290+
except (TypeError, InvalidOperation):
285291
self.monotonic_inc = 0
286292
self.monotonic_dec = 0
287293
is_strict_monotonic = 0
@@ -843,6 +849,10 @@ cdef class SharedEngine:
843849

844850
@property
845851
def is_unique(self) -> bool:
852+
# for why we check is_monotonic_increasing here, see
853+
# https://github.com/pandas-dev/pandas/pull/55342#discussion_r1361405781
854+
if self.need_monotonic_check:
855+
self.is_monotonic_increasing
846856
if self.need_unique_check:
847857
arr = self.values.unique()
848858
self.unique = len(arr) == len(self.values)

pandas/core/frame.py

+1-1
Original file line number | Diff line number | Diff line change
@@ -3711,7 +3711,7 @@ def memory_usage(self, index: bool = True, deep: bool = False) -> Series:
37113711
many repeated values.
37123712
37133713
>>> df['object'].astype('category').memory_usage(deep=True)
3714-
5244
3714+
5136
37153715
"""
37163716
result = self._constructor_sliced(
37173717
[c.memory_usage(index=False, deep=deep) for col, c in self.items()],

pandas/core/indexes/base.py

+10-10
Original file line number | Diff line number | Diff line change
@@ -3382,9 +3382,7 @@ def _union(self, other: Index, sort: bool | None):
33823382

33833383
if (
33843384
sort in (None, True)
3385-
and self.is_monotonic_increasing
3386-
and other.is_monotonic_increasing
3387-
and not (self.has_duplicates and other.has_duplicates)
3385+
and (self.is_unique or other.is_unique)
33883386
and self._can_use_libjoin
33893387
and other._can_use_libjoin
33903388
):
@@ -3536,12 +3534,7 @@ def _intersection(self, other: Index, sort: bool = False):
35363534
"""
35373535
intersection specialized to the case with matching dtypes.
35383536
"""
3539-
if (
3540-
self.is_monotonic_increasing
3541-
and other.is_monotonic_increasing
3542-
and self._can_use_libjoin
3543-
and other._can_use_libjoin
3544-
):
3537+
if self._can_use_libjoin and other._can_use_libjoin:
35453538
try:
35463539
res_indexer, indexer, _ = self._inner_indexer(other)
35473540
except TypeError:
@@ -4980,7 +4973,10 @@ def _get_leaf_sorter(labels: list[np.ndarray]) -> npt.NDArray[np.intp]:
49804973
def _join_monotonic(
49814974
self, other: Index, how: JoinHow = "left"
49824975
) -> tuple[Index, npt.NDArray[np.intp] | None, npt.NDArray[np.intp] | None]:
4983-
# We only get here with matching dtypes and both monotonic increasing
4976+
# The caller is responsible for ensuring that we only get here with:
4977+
# 1) matching dtypes
4978+
# 2) both monotonic increasing
4979+
# 3) other.is_unique or self.is_unique
49844980
assert other.dtype == self.dtype
49854981
assert self._can_use_libjoin and other._can_use_libjoin
49864982

@@ -5062,6 +5058,10 @@ def _can_use_libjoin(self) -> bool:
50625058
making a copy. If we cannot, this negates the performance benefit
50635059
of using libjoin.
50645060
"""
5061+
if not self.is_monotonic_increasing:
5062+
# The libjoin functions all assume monotonicity.
5063+
return False
5064+
50655065
if type(self) is Index:
50665066
# excludes EAs, but includes masks; we get here with monotonic
50675067
# values only, meaning no NA

pandas/tests/extension/test_categorical.py

-5
Original file line number | Diff line number | Diff line change
@@ -75,11 +75,6 @@ def data_for_grouping():
7575

7676

7777
class TestCategorical(base.ExtensionTests):
78-
@pytest.mark.xfail(reason="Memory usage doesn't match")
79-
def test_memory_usage(self, data):
80-
# TODO: Is this deliberate?
81-
super().test_memory_usage(data)
82-
8378
def test_contains(self, data, data_missing):
8479
# GH-37867
8580
# na value handling in Categorical.__contains__ is deprecated.

0 commit comments

Comments
 (0)