Skip to content

Commit ab64538

Browse files
committed
perf: Cache hashing of categories
1 parent b2f09d9 commit ab64538

File tree

2 files changed

+20
-14
lines changed

2 files changed

+20
-14
lines changed

doc/source/whatsnew/v1.3.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -388,6 +388,7 @@ Performance improvements
388388
- Performance improvement in :class:`Styler` where render times are more than 50% reduced (:issue:`39972` :issue:`39952`)
389389
- Performance improvement in :meth:`core.window.ewm.ExponentialMovingWindow.mean` with ``times`` (:issue:`39784`)
390390
- Performance improvement in :meth:`.GroupBy.apply` when requiring the python fallback implementation (:issue:`40176`)
391+
- Performance improvement for concatenation of data with type :class:`CategoricalDtype` (:issue:`40193`)
391392

392393
.. ---------------------------------------------------------------------------
393394

pandas/core/dtypes/dtypes.py

+19-14
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
import pytz
2222

2323
from pandas._libs.interval import Interval
24+
from pandas._libs.properties import cache_readonly
2425
from pandas._libs.tslibs import (
2526
BaseOffset,
2627
NaT,
@@ -351,17 +352,6 @@ def __setstate__(self, state: MutableMapping[str_type, Any]) -> None:
351352
self._categories = state.pop("categories", None)
352353
self._ordered = state.pop("ordered", False)
353354

354-
def __hash__(self) -> int:
355-
# _hash_categories returns a uint64, so use the negative
356-
# space for when we have unknown categories to avoid a conflict
357-
if self.categories is None:
358-
if self.ordered:
359-
return -1
360-
else:
361-
return -2
362-
# We *do* want to include the real self.ordered here
363-
return int(self._hash_categories(self.categories, self.ordered))
364-
365355
def __eq__(self, other: Any) -> bool:
366356
"""
367357
Rules for CDT equality:
@@ -434,14 +424,29 @@ def __repr__(self) -> str_type:
434424
data = data.rstrip(", ")
435425
return f"CategoricalDtype(categories={data}, ordered={self.ordered})"
436426

437-
@staticmethod
438-
def _hash_categories(categories, ordered: Ordered = True) -> int:
427+
def __hash__(self) -> int:
428+
# _hash_categories returns a uint64, so use the negative
429+
# space for when we have unknown categories to avoid a conflict
430+
if self.categories is None:
431+
if self.ordered:
432+
return -1
433+
else:
434+
return -2
435+
# Indirection via property to allow for easier caching
436+
return self._hash_categories
437+
438+
@cache_readonly
439+
def _hash_categories(self) -> int:
439440
from pandas.core.util.hashing import (
440441
combine_hash_arrays,
441442
hash_array,
442443
hash_tuples,
443444
)
444445

446+
# We *do* want to include the real self.ordered here
447+
categories = self.categories
448+
ordered = self.ordered
449+
445450
if len(categories) and isinstance(categories[0], tuple):
446451
# assumes if any individual category is a tuple, then all are. ATM
447452
# I don't really want to support just some of the categories being
@@ -469,7 +474,7 @@ def _hash_categories(categories, ordered: Ordered = True) -> int:
469474
else:
470475
cat_array = [cat_array]
471476
hashed = combine_hash_arrays(iter(cat_array), num_items=len(cat_array))
472-
return np.bitwise_xor.reduce(hashed)
477+
return int(np.bitwise_xor.reduce(hashed))
473478

474479
@classmethod
475480
def construct_array_type(cls) -> Type[Categorical]:

0 commit comments

Comments
 (0)