Skip to content

Commit 2adb306

Browse files
committed
perf: Cache hashing of categories
1 parent 602ab16 commit 2adb306

File tree

2 files changed

+6
-1
lines changed

2 files changed

+6
-1
lines changed

doc/source/whatsnew/v1.3.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -388,6 +388,7 @@ Performance improvements
388388
- Performance improvement in :class:`Styler` where render times are more than 50% reduced (:issue:`39972` :issue:`39952`)
389389
- Performance improvement in :meth:`core.window.ewm.ExponentialMovingWindow.mean` with ``times`` (:issue:`39784`)
390390
- Performance improvement in :meth:`.GroupBy.apply` when requiring the python fallback implementation (:issue:`40176`)
391+
- Performance improvement for concatenation of data with type :class:`CategoricalDtype` (:issue:`40193`)
391392

392393
.. ---------------------------------------------------------------------------
393394

pandas/core/dtypes/dtypes.py

+5-1
Original file line numberDiff line numberDiff line change
@@ -343,6 +343,7 @@ def _finalize(self, categories, ordered: Ordered, fastpath: bool = False) -> Non
343343

344344
self._categories = categories
345345
self._ordered = ordered
346+
self._hash_cache = -10 # Sentinel. -1 and -2 are used below with some semantics
346347

347348
def __setstate__(self, state: MutableMapping[str_type, Any]) -> None:
348349
# for pickle compat. __get_state__ is defined in the
@@ -352,6 +353,8 @@ def __setstate__(self, state: MutableMapping[str_type, Any]) -> None:
352353
self._ordered = state.pop("ordered", False)
353354

354355
def __hash__(self) -> int:
356+
if self._hash_cache >= 0:
357+
return self._hash_cache
355358
# _hash_categories returns a uint64, so use the negative
356359
# space for when we have unknown categories to avoid a conflict
357360
if self.categories is None:
@@ -360,7 +363,8 @@ def __hash__(self) -> int:
360363
else:
361364
return -2
362365
# We *do* want to include the real self.ordered here
363-
return int(self._hash_categories(self.categories, self.ordered))
366+
self._hash_cache = int(self._hash_categories(self.categories, self.ordered))
367+
return self._hash_cache
364368

365369
def __eq__(self, other: Any) -> bool:
366370
"""

0 commit comments

Comments
 (0)