pandas-dev · topper-123 · May 11, 2023 · May 11, 2023 · May 12, 2023 · May 22, 2023
diff --git a/asv_bench/benchmarks/pandas_vb_common.py b/asv_bench/benchmarks/pandas_vb_common.py
@@ -44,7 +44,7 @@
         pd.UInt16Dtype,
         pd.UInt32Dtype,
         pd.UInt64Dtype,
-        pd.CategoricalDtype,
+        pd.CategoricalDtype(),
         pd.IntervalDtype,
         pd.DatetimeTZDtype("ns", "UTC"),
         pd.PeriodDtype("D"),

diff --git a/doc/source/user_guide/categorical.rst b/doc/source/user_guide/categorical.rst
@@ -138,15 +138,26 @@ behavior:
 1. Categories are inferred from the data.
 2. Categories are unordered.
 
-To control those behaviors, instead of passing ``'category'``, use an instance
+It is also possible to give a dtype inside square brackets to set the dtype of the categories, like this:
+
+.. ipython:: python
+
+    s = pd.Series(["a", "b", "c", "a"], dtype="category[string]")
+    s.dtype.categories
+
+.. versionadded:: 2.1.0
+
+    The ability to specify the categories dtype in the dtype string was added in :ref:`v2.1.0 <whatsnew_210.enhancements.category_subtype>`
+
+To control those behaviors even more, instead of passing ``'category'``, use an instance
 of :class:`~pandas.api.types.CategoricalDtype`.
 
 .. ipython:: python
 
     from pandas.api.types import CategoricalDtype
 
     s = pd.Series(["a", "b", "c", "a"])
-    cat_type = CategoricalDtype(categories=["b", "c", "d"], ordered=True)
+    cat_type = CategoricalDtype(["b", "c", "d"], ordered=True, categories_dtype="string")
     s_cat = s.astype(cat_type)
     s_cat
 
@@ -257,11 +268,18 @@ unordered categoricals, the order of the ``categories`` is not considered.
    # Unequal, since the second CategoricalDtype is ordered
    c1 == CategoricalDtype(["a", "b", "c"], ordered=True)
 
-All instances of ``CategoricalDtype`` compare equal to the string ``'category'``.
+All instances of ``CategoricalDtype`` compare equal to the string ``'category'``, and to the
+string ``'category[<dtype>]'`` where ``<dtype>`` is the string name of the categories' dtype.
 
 .. ipython:: python
 
    c1 == "category"
+   c1 == "category[object]"
+
+.. versionadded:: 2.1.0
+
+    The ability to specify the categories dtype inside square brackets in the dtype
+    string was added in :ref:`v2.1.0 <whatsnew_210.enhancements.category_subtype>`
 
 Description
 -----------

diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst
@@ -14,12 +14,39 @@ including other versions of pandas.
 Enhancements
 ~~~~~~~~~~~~
 
-.. _whatsnew_210.enhancements.enhancement1:
+.. _whatsnew_210.enhancements.category_subtype:
+
+Specific categorical string dtypes like ``dtype="category[string]"`` now work
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+When giving the string ``"category"`` as a dtype it is now possible to specify the dtype
+of the categories as part of the dtype string:
+
+.. ipython:: python
+
+    ser = pd.Series(["a", "b", np.nan], dtype="category[string]")
+    ser
+
+The expression inside the brackets can be any string that pandas accepts for a dtype and
+whose data can be stored in an :class:`Index` (:issue:`48515`).
+
+The categories dtype will also now be part of the dtype repr:
+
+.. ipython:: python
+
+   df = pd.DataFrame({"a": ser, "b": pd.array([1, 2, 3], dtype="category[Int8]")})
+   df.dtypes
+
+We can now also compare categorical dtypes to a string with the dtype of the categories inside brackets in order to get more precise comparisons:
+
+.. ipython:: python
+
+    ser.dtype == "category[string]"
+    ser.dtype == "category"  # also works, but doesn't check the categories dtype
+    ser.dtype == "category[object]"  # fails, wrong categories dtype
 
-enhancement1
-^^^^^^^^^^^^
 
-.. _whatsnew_210.enhancements.enhancement2:
+.. _whatsnew_210.enhancements.map_na_action:
 
 ``map(func, na_action="ignore")`` now works for all array types
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -96,6 +123,7 @@ Other enhancements
 - Improved error message when creating a DataFrame with empty data (0 rows), no index and an incorrect number of columns. (:issue:`52084`)
 - Let :meth:`DataFrame.to_feather` accept a non-default :class:`Index` and non-string column names (:issue:`51787`)
 - Performance improvement in :func:`read_csv` (:issue:`52632`) with ``engine="c"``
+- :class:`CategoricalDtype` has gotten a new parameter and attribute named :attr:`CategoricalDtype.categories_dtype` (:issue:`48515`)
 - :meth:`Categorical.from_codes` has gotten a ``validate`` parameter (:issue:`50975`)
 - Added ``engine_kwargs`` parameter to :meth:`DataFrame.to_excel` (:issue:`53220`)
 - Performance improvement in :func:`concat` with homogeneous ``np.float64`` or ``np.float32`` dtypes (:issue:`52685`)

diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
@@ -1388,6 +1388,12 @@ cdef object _try_infer_map(object dtype):
         val = getattr(dtype, attr, None)
         if val in _TYPE_MAP:
             return _TYPE_MAP[val]
+
+    # CategoricalDtype may be named category[dtype]; name may be None/non-str
+    name = getattr(dtype, "name", None)
+    if isinstance(name, str) and name.startswith("category["):
+        return _TYPE_MAP["category"]
+
     return None
 
 

diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py
@@ -453,7 +453,7 @@ def __init__(
                     ) from err
 
             # we're inferring from values
-            dtype = CategoricalDtype(categories, dtype.ordered)
+            dtype = CategoricalDtype(categories, dtype.ordered, dtype.categories_dtype)
 
         elif isinstance(values.dtype, CategoricalDtype):
             old_codes = extract_array(values)._codes
@@ -1999,7 +1999,7 @@ def _repr_categories_info(self) -> str:
 
     def _repr_footer(self) -> str:
         info = self._repr_categories_info()
-        return f"Length: {len(self)}\n{info}"
+        return f"Length: {len(self)}, dtype: {self.dtype}\n{info}"
 
     def _get_repr(
         self, length: bool = True, na_rep: str = "NaN", footer: bool = True
@@ -2524,7 +2524,7 @@ class CategoricalAccessor(PandasDelegate, PandasObject, NoNewAttributesMixin):
     3    c
     4    c
     5    c
-    dtype: category
+    dtype: category[object]
     Categories (3, object): ['a', 'b', 'c']
 
     >>> s.cat.categories
@@ -2537,7 +2537,7 @@ class CategoricalAccessor(PandasDelegate, PandasObject, NoNewAttributesMixin):
     3    a
     4    a
     5    a
-    dtype: category
+    dtype: category[object]
     Categories (3, object): ['c', 'b', 'a']
 
     >>> s.cat.reorder_categories(list("cba"))
@@ -2547,7 +2547,7 @@ class CategoricalAccessor(PandasDelegate, PandasObject, NoNewAttributesMixin):
     3    c
     4    c
     5    c
-    dtype: category
+    dtype: category[object]
     Categories (3, object): ['c', 'b', 'a']
 
     >>> s.cat.add_categories(["d", "e"])
@@ -2557,7 +2557,7 @@ class CategoricalAccessor(PandasDelegate, PandasObject, NoNewAttributesMixin):
     3    c
     4    c
     5    c
-    dtype: category
+    dtype: category[object]
     Categories (5, object): ['a', 'b', 'c', 'd', 'e']
 
     >>> s.cat.remove_categories(["a", "c"])
@@ -2567,7 +2567,7 @@ class CategoricalAccessor(PandasDelegate, PandasObject, NoNewAttributesMixin):
     3    NaN
     4    NaN
     5    NaN
-    dtype: category
+    dtype: category[object]
     Categories (1, object): ['b']
 
     >>> s1 = s.cat.add_categories(["d", "e"])
@@ -2578,7 +2578,7 @@ class CategoricalAccessor(PandasDelegate, PandasObject, NoNewAttributesMixin):
     3    c
     4    c
     5    c
-    dtype: category
+    dtype: category[object]
     Categories (3, object): ['a', 'b', 'c']
 
     >>> s.cat.set_categories(list("abcde"))
@@ -2588,7 +2588,7 @@ class CategoricalAccessor(PandasDelegate, PandasObject, NoNewAttributesMixin):
     3    c
     4    c
     5    c
-    dtype: category
+    dtype: category[object]
     Categories (5, object): ['a', 'b', 'c', 'd', 'e']
 
     >>> s.cat.as_ordered()
@@ -2598,7 +2598,7 @@ class CategoricalAccessor(PandasDelegate, PandasObject, NoNewAttributesMixin):
     3    c
     4    c
     5    c
-    dtype: category
+    dtype: category[object]
     Categories (3, object): ['a' < 'b' < 'c']
 
     >>> s.cat.as_unordered()
@@ -2608,7 +2608,7 @@ class CategoricalAccessor(PandasDelegate, PandasObject, NoNewAttributesMixin):
     3    c
     4    c
     5    c
-    dtype: category
+    dtype: category[object]
     Categories (3, object): ['a', 'b', 'c']
     """
 

diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py
@@ -502,9 +502,8 @@ def is_categorical_dtype(arr_or_dtype) -> bool:
         FutureWarning,
         stacklevel=find_stack_level(),
     )
-    if isinstance(arr_or_dtype, ExtensionDtype):
-        # GH#33400 fastpath for dtype object
-        return arr_or_dtype.name == "category"
+    if isinstance(arr_or_dtype, CategoricalDtype):
+        return True
 
     if arr_or_dtype is None:
         return False