Skip to content

Commit 314ac9a

Browse files
PERF: Fixed cut regression, improve Categorical (#34952)
1 parent 3f4f564 commit 314ac9a

File tree

4 files changed

+53
-0
lines changed

4 files changed

+53
-0
lines changed

asv_bench/benchmarks/categoricals.py

+4
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ def setup(self):
3434
self.values_all_int8 = np.ones(N, "int8")
3535
self.categorical = pd.Categorical(self.values, self.categories)
3636
self.series = pd.Series(self.categorical)
37+
self.intervals = pd.interval_range(0, 1, periods=N // 10)
3738

3839
def time_regular(self):
3940
pd.Categorical(self.values, self.categories)
@@ -44,6 +45,9 @@ def time_fastpath(self):
4445
def time_datetimes(self):
4546
pd.Categorical(self.datetimes)
4647

48+
def time_interval(self):
49+
pd.Categorical(self.intervals, categories=self.intervals)
50+
4751
def time_datetimes_with_nat(self):
4852
pd.Categorical(self.datetimes_with_nat)
4953

doc/source/whatsnew/v1.1.0.rst

+2
Original file line numberDiff line numberDiff line change
@@ -828,6 +828,8 @@ Performance improvements
828828
- Performance improvement for groupby methods :meth:`~pandas.core.groupby.groupby.Groupby.first`
829829
and :meth:`~pandas.core.groupby.groupby.Groupby.last` (:issue:`34178`)
830830
- Performance improvement in :func:`factorize` for nullable (integer and boolean) dtypes (:issue:`33064`).
831+
- Performance improvement when constructing :class:`Categorical` objects (:issue:`33921`)
832+
- Fixed performance regression in :func:`pandas.qcut` and :func:`pandas.cut` (:issue:`33921`)
831833
- Performance improvement in reductions (sum, prod, min, max) for nullable (integer and boolean) dtypes (:issue:`30982`, :issue:`33261`, :issue:`33442`).
832834
- Performance improvement in arithmetic operations between two :class:`DataFrame` objects (:issue:`32779`)
833835
- Performance improvement in :class:`pandas.core.groupby.RollingGroupby` (:issue:`34052`)

pandas/core/arrays/categorical.py

+5
Original file line numberDiff line numberDiff line change
@@ -2611,6 +2611,11 @@ def _get_codes_for_values(values, categories):
26112611
values = ensure_object(values)
26122612
categories = ensure_object(categories)
26132613

2614+
if isinstance(categories, ABCIndexClass):
2615+
return coerce_indexer_dtype(categories.get_indexer_for(values), categories)
2616+
2617+
# Only hit here when we've already coerced to object dtype.
2618+
26142619
hash_klass, vals = _get_data_algo(values)
26152620
_, cats = _get_data_algo(categories)
26162621
t = hash_klass(len(cats))

pandas/tests/arrays/categorical/test_constructors.py

+42
Original file line numberDiff line numberDiff line change
@@ -643,3 +643,45 @@ def test_constructor_string_and_tuples(self):
643643
c = pd.Categorical(np.array(["c", ("a", "b"), ("b", "a"), "c"], dtype=object))
644644
expected_index = pd.Index([("a", "b"), ("b", "a"), "c"])
645645
assert c.categories.equals(expected_index)
646+
647+
def test_interval(self):
648+
idx = pd.interval_range(0, 10, periods=10)
649+
cat = pd.Categorical(idx, categories=idx)
650+
expected_codes = np.arange(10, dtype="int8")
651+
tm.assert_numpy_array_equal(cat.codes, expected_codes)
652+
tm.assert_index_equal(cat.categories, idx)
653+
654+
# infer categories
655+
cat = pd.Categorical(idx)
656+
tm.assert_numpy_array_equal(cat.codes, expected_codes)
657+
tm.assert_index_equal(cat.categories, idx)
658+
659+
# list values
660+
cat = pd.Categorical(list(idx))
661+
tm.assert_numpy_array_equal(cat.codes, expected_codes)
662+
tm.assert_index_equal(cat.categories, idx)
663+
664+
# list values, categories
665+
cat = pd.Categorical(list(idx), categories=list(idx))
666+
tm.assert_numpy_array_equal(cat.codes, expected_codes)
667+
tm.assert_index_equal(cat.categories, idx)
668+
669+
# shuffled
670+
values = idx.take([1, 2, 0])
671+
cat = pd.Categorical(values, categories=idx)
672+
tm.assert_numpy_array_equal(cat.codes, np.array([1, 2, 0], dtype="int8"))
673+
tm.assert_index_equal(cat.categories, idx)
674+
675+
# extra
676+
values = pd.interval_range(8, 11, periods=3)
677+
cat = pd.Categorical(values, categories=idx)
678+
expected_codes = np.array([8, 9, -1], dtype="int8")
679+
tm.assert_numpy_array_equal(cat.codes, expected_codes)
680+
tm.assert_index_equal(cat.categories, idx)
681+
682+
# overlapping
683+
idx = pd.IntervalIndex([pd.Interval(0, 2), pd.Interval(0, 1)])
684+
cat = pd.Categorical(idx, categories=idx)
685+
expected_codes = np.array([0, 1], dtype="int8")
686+
tm.assert_numpy_array_equal(cat.codes, expected_codes)
687+
tm.assert_index_equal(cat.categories, idx)

0 commit comments

Comments
 (0)