pandas-dev · jreback · Jun 24, 2020 · Jun 22, 2020 · Jun 23, 2020 · Jun 24, 2020
diff --git a/asv_bench/benchmarks/categoricals.py b/asv_bench/benchmarks/categoricals.py
@@ -34,6 +34,7 @@ def setup(self):
         self.values_all_int8 = np.ones(N, "int8")
         self.categorical = pd.Categorical(self.values, self.categories)
         self.series = pd.Series(self.categorical)
+        self.intervals = pd.interval_range(0, 1, periods=N // 10)
 
     def time_regular(self):
         pd.Categorical(self.values, self.categories)
@@ -44,6 +45,9 @@ def time_fastpath(self):
     def time_datetimes(self):
         pd.Categorical(self.datetimes)
 
+    def time_interval(self):
+        pd.Categorical(self.datetimes, categories=self.datetimes)
+
     def time_datetimes_with_nat(self):
         pd.Categorical(self.datetimes_with_nat)
 

diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst
@@ -805,6 +805,8 @@ Performance improvements
 - Performance improvement for groupby methods :meth:`~pandas.core.groupby.groupby.Groupby.first`
   and :meth:`~pandas.core.groupby.groupby.Groupby.last` (:issue:`34178`)
 - Performance improvement in :func:`factorize` for nullable (integer and boolean) dtypes (:issue:`33064`).
+- Performance improvement when constructing :class:`Categorical` objects (:issue:`33921`)
+- Fixed performance regression in :func:`pandas.qcut` and :func:`pandas.cut` (:issue:`33921`)
 - Performance improvement in reductions (sum, prod, min, max) for nullable (integer and boolean) dtypes (:issue:`30982`, :issue:`33261`, :issue:`33442`).
 - Performance improvement in arithmetic operations between two :class:`DataFrame` objects (:issue:`32779`)
 - Performance improvement in :class:`pandas.core.groupby.RollingGroupby` (:issue:`34052`)

diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py
@@ -2611,6 +2611,11 @@ def _get_codes_for_values(values, categories):
         values = ensure_object(values)
         categories = ensure_object(categories)
 
+    if isinstance(categories, ABCIndexClass):
+        return coerce_indexer_dtype(categories.get_indexer(values), categories)
+
+    # Only hit here when we've already coerced to object dtype.
+
     hash_klass, vals = _get_data_algo(values)
     _, cats = _get_data_algo(categories)
     t = hash_klass(len(cats))

diff --git a/pandas/core/reshape/tile.py b/pandas/core/reshape/tile.py
@@ -413,6 +413,7 @@ def _bins_to_cuts(
 
     na_mask = isna(x) | (ids == len(bins)) | (ids == 0)
     has_nas = na_mask.any()
+    labels_are_unique = None
 
     if labels is not False:
         if not (labels is None or is_list_like(labels)):
@@ -425,6 +426,7 @@ def _bins_to_cuts(
             labels = _format_labels(
                 bins, precision, right=right, include_lowest=include_lowest, dtype=dtype
             )
+            labels_are_unique = True
         elif ordered and len(set(labels)) != len(labels):
             raise ValueError(
                 "labels must be unique if ordered=True; pass ordered=False for duplicate labels"  # noqa
@@ -435,11 +437,14 @@ def _bins_to_cuts(
                     "Bin labels must be one fewer than the number of bin edges"
                 )
         if not is_categorical_dtype(labels):
-            labels = Categorical(
-                labels,
-                categories=labels if len(set(labels)) == len(labels) else None,
-                ordered=ordered,
-            )
+            if labels_are_unique:
+                categories = labels
+            elif labels_are_unique is None:
+                categories = labels if len(set(labels)) == len(labels) else None
+            else:
+                categories = None
+
+            labels = Categorical(labels, categories=categories, ordered=ordered)
         # TODO: handle mismatch between categorical label order and pandas.cut order.
         np.putmask(ids, na_mask, 0)
         result = algos.take_nd(labels, ids - 1)

diff --git a/pandas/tests/arrays/categorical/test_constructors.py b/pandas/tests/arrays/categorical/test_constructors.py
@@ -643,3 +643,38 @@ def test_constructor_string_and_tuples(self):
         c = pd.Categorical(np.array(["c", ("a", "b"), ("b", "a"), "c"], dtype=object))
         expected_index = pd.Index([("a", "b"), ("b", "a"), "c"])
         assert c.categories.equals(expected_index)
+
+    def test_interval(self):
+        idx = pd.interval_range(0, 10, periods=10)
+        cat = pd.Categorical(idx, categories=idx)
+        expected_codes = np.arange(10, dtype="int8")
+        tm.assert_numpy_array_equal(cat.codes, expected_codes)
+        tm.assert_index_equal(cat.categories, idx)
+
+        # categories omitted: they are inferred from the values
+        cat = pd.Categorical(idx)
+        tm.assert_numpy_array_equal(cat.codes, expected_codes)
+        tm.assert_index_equal(cat.categories, idx)
+
+        # values passed as a plain list instead of the index itself
+        cat = pd.Categorical(list(idx))
+        tm.assert_numpy_array_equal(cat.codes, expected_codes)
+        tm.assert_index_equal(cat.categories, idx)
+
+        # both values and categories passed as plain lists
+        cat = pd.Categorical(list(idx), categories=list(idx))
+        tm.assert_numpy_array_equal(cat.codes, expected_codes)
+        tm.assert_index_equal(cat.categories, idx)
+
+        # values in a different order than the categories
+        values = idx.take([1, 2, 0])
+        cat = pd.Categorical(values, categories=idx)
+        tm.assert_numpy_array_equal(cat.codes, np.array([1, 2, 0], dtype="int8"))
+        tm.assert_index_equal(cat.categories, idx)
+
+        # a value absent from the categories is expected to get code -1
+        values = pd.interval_range(8, 11, periods=3)
+        cat = pd.Categorical(values, categories=idx)
+        expected_codes = np.array([8, 9, -1], dtype="int8")
+        tm.assert_numpy_array_equal(cat.codes, expected_codes)
+        tm.assert_index_equal(cat.categories, idx)