diff --git a/asv_bench/benchmarks/categoricals.py b/asv_bench/benchmarks/categoricals.py index 107b9b9edcd5d..a0b24342091ec 100644 --- a/asv_bench/benchmarks/categoricals.py +++ b/asv_bench/benchmarks/categoricals.py @@ -34,6 +34,7 @@ def setup(self): self.values_all_int8 = np.ones(N, "int8") self.categorical = pd.Categorical(self.values, self.categories) self.series = pd.Series(self.categorical) + self.intervals = pd.interval_range(0, 1, periods=N // 10) def time_regular(self): pd.Categorical(self.values, self.categories) @@ -44,6 +45,9 @@ def time_fastpath(self): def time_datetimes(self): pd.Categorical(self.datetimes) + def time_interval(self): + pd.Categorical(self.intervals, categories=self.intervals) + def time_datetimes_with_nat(self): pd.Categorical(self.datetimes_with_nat) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 22b83425b58c2..a190f01101ac3 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -805,6 +805,8 @@ Performance improvements - Performance improvement for groupby methods :meth:`~pandas.core.groupby.groupby.Groupby.first` and :meth:`~pandas.core.groupby.groupby.Groupby.last` (:issue:`34178`) - Performance improvement in :func:`factorize` for nullable (integer and boolean) dtypes (:issue:`33064`). +- Performance improvement when constructing :class:`Categorical` objects (:issue:`33921`) +- Fixed performance regression in :func:`pandas.qcut` and :func:`pandas.cut` (:issue:`33921`) - Performance improvement in reductions (sum, prod, min, max) for nullable (integer and boolean) dtypes (:issue:`30982`, :issue:`33261`, :issue:`33442`). 
- Performance improvement in arithmetic operations between two :class:`DataFrame` objects (:issue:`32779`) - Performance improvement in :class:`pandas.core.groupby.RollingGroupby` (:issue:`34052`) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 80fe1ac7ce619..3d469ec28b9c4 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -2611,6 +2611,11 @@ def _get_codes_for_values(values, categories): values = ensure_object(values) categories = ensure_object(categories) + if isinstance(categories, ABCIndexClass): + return coerce_indexer_dtype(categories.get_indexer_for(values), categories) + + # Only hit here when we've already coerced to object dtype. + hash_klass, vals = _get_data_algo(values) _, cats = _get_data_algo(categories) t = hash_klass(len(cats)) diff --git a/pandas/tests/arrays/categorical/test_constructors.py b/pandas/tests/arrays/categorical/test_constructors.py index 9be741274c15a..ca942c9288898 100644 --- a/pandas/tests/arrays/categorical/test_constructors.py +++ b/pandas/tests/arrays/categorical/test_constructors.py @@ -643,3 +643,45 @@ def test_constructor_string_and_tuples(self): c = pd.Categorical(np.array(["c", ("a", "b"), ("b", "a"), "c"], dtype=object)) expected_index = pd.Index([("a", "b"), ("b", "a"), "c"]) assert c.categories.equals(expected_index) + + def test_interval(self): + idx = pd.interval_range(0, 10, periods=10) + cat = pd.Categorical(idx, categories=idx) + expected_codes = np.arange(10, dtype="int8") + tm.assert_numpy_array_equal(cat.codes, expected_codes) + tm.assert_index_equal(cat.categories, idx) + + # infer categories + cat = pd.Categorical(idx) + tm.assert_numpy_array_equal(cat.codes, expected_codes) + tm.assert_index_equal(cat.categories, idx) + + # list values + cat = pd.Categorical(list(idx)) + tm.assert_numpy_array_equal(cat.codes, expected_codes) + tm.assert_index_equal(cat.categories, idx) + + # list values, categories + cat = 
pd.Categorical(list(idx), categories=list(idx)) + tm.assert_numpy_array_equal(cat.codes, expected_codes) + tm.assert_index_equal(cat.categories, idx) + + # shuffled + values = idx.take([1, 2, 0]) + cat = pd.Categorical(values, categories=idx) + tm.assert_numpy_array_equal(cat.codes, np.array([1, 2, 0], dtype="int8")) + tm.assert_index_equal(cat.categories, idx) + + # extra + values = pd.interval_range(8, 11, periods=3) + cat = pd.Categorical(values, categories=idx) + expected_codes = np.array([8, 9, -1], dtype="int8") + tm.assert_numpy_array_equal(cat.codes, expected_codes) + tm.assert_index_equal(cat.categories, idx) + + # overlapping + idx = pd.IntervalIndex([pd.Interval(0, 2), pd.Interval(0, 1)]) + cat = pd.Categorical(idx, categories=idx) + expected_codes = np.array([0, 1], dtype="int8") + tm.assert_numpy_array_equal(cat.codes, expected_codes) + tm.assert_index_equal(cat.categories, idx)