ENH: allow more specific categorical dtype strings, e.g. `category[string]`.

topper-123 · topper-123 · commit c5858674a4fa · 2023-05-27T09:12:36.000+01:00
diff --git a/doc/source/user_guide/categorical.rst b/doc/source/user_guide/categorical.rst
@@ -138,15 +138,26 @@ behavior:
 1. Categories are inferred from the data.
 2. Categories are unordered.
 
-To control those behaviors, instead of passing ``'category'``, use an instance
+It is also possible to give a dtype inside brackets to specify the dtype of the categories, like this:
+
+.. ipython:: python
+
+    s = pd.Series(["a", "b", "c", "a"], dtype="category[string]")
+    s.dtype.categories
+
+.. versionadded:: 2.1.0
+
+    The ability to specify the categories dtype in the dtype string was added in :ref:`v2.1.0 <whatsnew_210.enhancements.category_subtype>`.
+
+To control those behaviors even more, instead of passing ``'category'``, use an instance
 of :class:`~pandas.api.types.CategoricalDtype`.
 
 .. ipython:: python
 
     from pandas.api.types import CategoricalDtype
 
     s = pd.Series(["a", "b", "c", "a"])
-    cat_type = CategoricalDtype(categories=["b", "c", "d"], ordered=True)
+    cat_type = CategoricalDtype(["b", "c", "d"], ordered=True, categories_dtype="string")
     s_cat = s.astype(cat_type)
     s_cat
 
diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst
@@ -14,12 +14,24 @@ including other versions of pandas.
 Enhancements
 ~~~~~~~~~~~~
 
-.. _whatsnew_210.enhancements.enhancement1:
+.. _whatsnew_210.enhancements.category_subtype:
+
+Specific categorical string dtypes like ``dtype="category[string]"`` now work
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+When giving the string ``"category"`` as a dtype it is now possible to specify the dtype
+of the categories as part of the dtype string:
+
+.. ipython:: python
+
+    ser = pd.Series(["a", "b", np.nan], dtype="category[string]")
+    ser.dtype.categories
+
+The expression inside the brackets can be any string that pandas accepts for a dtype and
+whose data can be stored in an :class:`Index` (:issue:`48515`).
 
-enhancement1
-^^^^^^^^^^^^
 
-.. _whatsnew_210.enhancements.enhancement2:
+.. _whatsnew_210.enhancements.map_na_action:
 
 ``map(func, na_action="ignore")`` now works for all array types
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -99,6 +111,7 @@ Other enhancements
 - :meth:`Categorical.from_codes` has gotten a ``validate`` parameter (:issue:`50975`)
 - Added ``engine_kwargs`` parameter to :meth:`DataFrame.to_excel` (:issue:`53220`)
 - Performance improvement in :func:`concat` with homogeneous ``np.float64`` or ``np.float32`` dtypes (:issue:`52685`)
+- :class:`CategoricalDtype` has gotten a new parameter and attribute named :attr:`CategoricalDtype.categories_dtype` (:issue:`48515`)
 - Performance improvement in :meth:`DataFrame.filter` when ``items`` is given (:issue:`52941`)
 -
 
diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py
@@ -453,7 +453,7 @@ def __init__(
                     ) from err
 
             # we're inferring from values
-            dtype = CategoricalDtype(categories, dtype.ordered)
+            dtype = CategoricalDtype(categories, dtype.ordered, dtype.categories_dtype)
 
         elif isinstance(values.dtype, CategoricalDtype):
             old_codes = extract_array(values)._codes
diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py
@@ -152,13 +152,17 @@ class CategoricalDtype(PandasExtensionDtype, ExtensionDtype):
     ----------
     categories : sequence, optional
         Must be unique, and must not contain any nulls.
-        The categories are stored in an Index,
-        and if an index is provided the dtype of that index will be used.
+        The categories are stored in an Index.
     ordered : bool or None, default False
         Whether or not this categorical is treated as a ordered categorical.
         None can be used to maintain the ordered value of existing categoricals when
         used in operations that combine categoricals, e.g. astype, and will resolve to
         False if there is no existing ordered to maintain.
+    categories_dtype : dtype, optional
+        If given, will be the dtype of the categories.
+        If not given, the categories dtype will be inferred.
+
+        .. versionadded:: 2.1.0
 
     Attributes
     ----------
@@ -181,14 +185,14 @@ class CategoricalDtype(PandasExtensionDtype, ExtensionDtype):
 
     Examples
     --------
-    >>> t = pd.CategoricalDtype(categories=['b', 'a'], ordered=True)
+    >>> t = pd.CategoricalDtype(['b', 'a'], ordered=True, categories_dtype="string")
     >>> pd.Series(['a', 'b', 'a', 'c'], dtype=t)
     0      a
     1      b
     2      a
     3    NaN
     dtype: category
-    Categories (2, object): ['b' < 'a']
+    Categories (2, string): ['b' < 'a']
 
     An empty CategoricalDtype with a specific dtype can be created
     by providing an empty index. As follows,
@@ -205,8 +209,19 @@ class CategoricalDtype(PandasExtensionDtype, ExtensionDtype):
     base = np.dtype("O")
     _metadata = ("categories", "ordered")
     _cache_dtypes: dict[str_type, PandasExtensionDtype] = {}
+    _categories_dtype: Dtype | None = None
+    _match = re.compile(r"category\[(?P<categories_dtype>.+)\]")
+
+    def __init__(
+        self,
+        categories=None,
+        ordered: Ordered = False,
+        categories_dtype: Dtype | None = None,
+    ) -> None:
+        if categories_dtype is not None:
+            from pandas.core.dtypes.common import pandas_dtype
 
-    def __init__(self, categories=None, ordered: Ordered = False) -> None:
+            self._categories_dtype = pandas_dtype(categories_dtype)
         self._finalize(categories, ordered, fastpath=False)
 
     @classmethod
@@ -352,12 +367,31 @@ def construct_from_string(cls, string: str_type) -> CategoricalDtype:
             raise TypeError(
                 f"'construct_from_string' expects a string, got {type(string)}"
             )
-        if string != cls.name:
-            raise TypeError(f"Cannot construct a 'CategoricalDtype' from '{string}'")
 
         # need ordered=None to ensure that operations specifying dtype="category" don't
         # override the ordered value for existing categoricals
-        return cls(ordered=None)
+
+        if string == cls.name:
+            return cls(ordered=None)
+
+        msg = f"Cannot construct a '{cls.__name__}' from '{string}'"
+        match = cls._match.match(string)
+        if match:
+            d = match.groupdict()
+            try:
+                return cls(categories_dtype=d["categories_dtype"])
+            except (KeyError, TypeError, ValueError) as err:
+                # KeyError if the "categories_dtype" key is not found;
+                # TypeError if we pass a nonsense dtype string.
+                raise TypeError(msg) from err
+        raise TypeError(msg)
+
+    @property
+    def categories_dtype(self) -> Dtype:
+        if self.categories is None:
+            return self._categories_dtype
+
+        return self.categories.dtype
 
     def _finalize(self, categories, ordered: Ordered, fastpath: bool = False) -> None:
         if ordered is not None:
@@ -451,18 +485,16 @@ def __eq__(self, other: Any) -> bool:
     def __repr__(self) -> str_type:
         if self.categories is None:
             data = "None"
-            dtype = "None"
         else:
             data = self.categories._format_data(name=type(self).__name__)
             if data is None:
                 # self.categories is RangeIndex
                 data = str(self.categories._range)
             data = data.rstrip(", ")
-            dtype = self.categories.dtype
 
         return (
             f"CategoricalDtype(categories={data}, ordered={self.ordered}, "
-            f"categories_dtype={dtype})"
+            f"categories_dtype={self.categories_dtype})"
         )
 
     @cache_readonly
@@ -537,8 +569,7 @@ def validate_ordered(ordered: Ordered) -> None:
         if not is_bool(ordered):
             raise TypeError("'ordered' must either be 'True' or 'False'")
 
-    @staticmethod
-    def validate_categories(categories, fastpath: bool = False) -> Index:
+    def validate_categories(self, categories, fastpath: bool = False) -> Index:
         """
         Validates that we have good categories
 
@@ -558,8 +589,11 @@ def validate_categories(categories, fastpath: bool = False) -> Index:
             raise TypeError(
                 f"Parameter 'categories' must be list-like, was {repr(categories)}"
             )
+        dtype = self._categories_dtype
         if not isinstance(categories, ABCIndex):
-            categories = Index._with_infer(categories, tupleize_cols=False)
+            categories = Index._with_infer(categories, dtype=dtype, tupleize_cols=False)
+        elif dtype is not None:
+            categories = categories.astype(dtype)
 
         if not fastpath:
             if categories.hasnans: