pandas-dev · jreback · Nov 18, 2020 · Oct 22, 2020 · Oct 23, 2020 · Oct 23, 2020
diff --git a/asv_bench/benchmarks/categoricals.py b/asv_bench/benchmarks/categoricals.py
@@ -1,3 +1,5 @@
+import string
+import sys
 import warnings
 
 import numpy as np
@@ -67,6 +69,56 @@ def time_existing_series(self):
         pd.Categorical(self.series)
 
 
+class AsType:
+    """Time ``astype`` on categorical columns backed by different dtypes."""
+
+    def setup(self):
+        N = 10 ** 5
+
+        random_pick = np.random.default_rng().choice
+
+        # Pools of distinct values the categorical columns are drawn from.
+        categories = {
+            "str": list(string.ascii_letters),
+            "int": np.random.randint(2 ** 16, size=154),
+            "float": sys.maxsize * np.random.random((38,)),
+            "timestamp": [
+                pd.Timestamp(x, unit="s") for x in np.random.randint(2 ** 18, size=578)
+            ],
+        }
+
+        self.df = pd.DataFrame(
+            {col: random_pick(cats, N) for col, cats in categories.items()}
+        )
+
+        # String-valued twins of the non-string columns, so that casts which
+        # have to parse strings are exercised too.
+        for col in ("int", "float", "timestamp"):
+            self.df[col + "_as_str"] = self.df[col].astype(str)
+
+        for col in self.df.columns:
+            self.df[col] = self.df[col].astype("category")
+
+    # asv only collects methods named with a benchmark prefix (``time_`` for
+    # timing benchmarks), hence the ``time_`` names below.
+    def time_astype_str(self):
+        for col in ("int", "float", "timestamp"):
+            self.df[col].astype("str")
+
+    def time_astype_int(self):
+        for col in ("int_as_str", "timestamp"):
+            self.df[col].astype("int")
+
+    def time_astype_float(self):
+        # NOTE(review): confirm each column here (notably "timestamp") can be
+        # cast to float without raising - TODO verify by running the benchmark.
+        for col in ("float_as_str", "int", "int_as_str", "timestamp"):
+            self.df[col].astype("float")
+
+    def time_astype_datetime(self):
+        self.df["float"].astype(pd.DatetimeTZDtype(tz="US/Pacific"))
+
+
 class Concat:
     def setup(self):
         N = 10 ** 5

diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst
@@ -498,6 +498,7 @@ Performance improvements
 - Reduced peak memory usage in :meth:`DataFrame.to_pickle` when using ``protocol=5`` in python 3.8+ (:issue:`34244`)
 - faster ``dir`` calls when many index labels, e.g. ``dir(ser)`` (:issue:`37450`)
 - Performance improvement in :class:`ExpandingGroupby` (:issue:`37064`)
+- Performance improvement in :meth:`Series.astype` and :meth:`DataFrame.astype` for :class:`Categorical` (:issue:`8628`)
 - Performance improvement in :meth:`pd.DataFrame.groupby` for ``float`` ``dtype`` (:issue:`28303`), changes of the underlying hash-function can lead to changes in float based indexes sort ordering for ties (e.g. :meth:`pd.Index.value_counts`)
 - Performance improvement in :meth:`pd.isin` for inputs with more than 1e6 elements
 

diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py
@@ -402,20 +402,40 @@ def astype(self, dtype: Dtype, copy: bool = True) -> ArrayLike:
             If copy is set to False and dtype is categorical, the original
             object is returned.
         """
-        if is_categorical_dtype(dtype):
+        if self.dtype is dtype:
+            result = self.copy() if copy else self
+
+        elif is_categorical_dtype(dtype):
             dtype = cast(Union[str, CategoricalDtype], dtype)
 
             # GH 10696/18593/18630
             dtype = self.dtype.update_dtype(dtype)
-            result = self.copy() if copy else self
-            if dtype == self.dtype:
-                return result
-            return result._set_dtype(dtype)
-        if is_extension_array_dtype(dtype):
-            return array(self, dtype=dtype, copy=copy)
-        if is_integer_dtype(dtype) and self.isna().any():
+            self = self.copy() if copy else self
+            result = self._set_dtype(dtype)
+
+        # TODO: consolidate with ndarray case?
+        elif is_extension_array_dtype(dtype):
+            result = array(self, dtype=dtype, copy=copy)
+
+        elif is_integer_dtype(dtype) and self.isna().any():
             raise ValueError("Cannot convert float NaN to integer")
-        return np.array(self, dtype=dtype, copy=copy)
+
+        elif len(self.codes) == 0 or len(self.categories) == 0:
+            result = np.array(self, dtype=dtype, copy=copy)
+
+        else:
+            # GH8628 (PERF): astype category codes instead of astyping array
+            try:
+                astyped_cats = self.categories.astype(dtype=dtype, copy=copy)
+            except (TypeError, ValueError):
+                raise ValueError(
+                    f"Cannot cast {self.categories.dtype} dtype to {dtype}"
+                )
+
+            astyped_cats = extract_array(astyped_cats, extract_numpy=True)
+            result = take_1d(astyped_cats, libalgos.ensure_platform_int(self._codes))
+
+        return result
 
     @cache_readonly
     def itemsize(self) -> int:

diff --git a/pandas/tests/arrays/categorical/test_dtypes.py b/pandas/tests/arrays/categorical/test_dtypes.py
@@ -127,7 +127,7 @@ def test_astype(self, ordered):
         expected = np.array(cat)
         tm.assert_numpy_array_equal(result, expected)
 
-        msg = "could not convert string to float"
+        msg = r"Cannot cast object dtype to <class 'float'>"
         with pytest.raises(ValueError, match=msg):
             cat.astype(float)
 
@@ -138,7 +138,7 @@ def test_astype(self, ordered):
         tm.assert_numpy_array_equal(result, expected)
 
         result = cat.astype(int)
-        expected = np.array(cat, dtype=int)
+        expected = np.array(cat, dtype="int64")
         tm.assert_numpy_array_equal(result, expected)
 
         result = cat.astype(float)

diff --git a/pandas/tests/series/test_dtypes.py b/pandas/tests/series/test_dtypes.py
@@ -60,15 +60,15 @@ def test_astype_categorical_to_other(self):
         expected = ser
         tm.assert_series_equal(ser.astype("category"), expected)
         tm.assert_series_equal(ser.astype(CategoricalDtype()), expected)
-        msg = r"could not convert string to float|invalid literal for float\(\)"
+        msg = r"Cannot cast object dtype to float64"
         with pytest.raises(ValueError, match=msg):
             ser.astype("float64")
 
         cat = Series(Categorical(["a", "b", "b", "a", "a", "c", "c", "c"]))
         exp = Series(["a", "b", "b", "a", "a", "c", "c", "c"])
         tm.assert_series_equal(cat.astype("str"), exp)
         s2 = Series(Categorical(["1", "2", "3", "4"]))
-        exp2 = Series([1, 2, 3, 4]).astype(int)
+        exp2 = Series([1, 2, 3, 4]).astype("int64")
         tm.assert_series_equal(s2.astype("int"), exp2)
 
         # object don't sort correctly, so just compare that we have the same