Skip to content

Commit c585867

Browse files
committed
ENH: allow more specific categorical dtype strings, e.g. `category[string]`.
1 parent 3863a48 commit c585867

File tree

4 files changed

+79
-21
lines changed

4 files changed

+79
-21
lines changed

doc/source/user_guide/categorical.rst

+13-2
Original file line numberDiff line numberDiff line change
@@ -138,15 +138,26 @@ behavior:
138138
1. Categories are inferred from the data.
139139
2. Categories are unordered.
140140

141-
To control those behaviors, instead of passing ``'category'``, use an instance
141+
It is also possible to give a dtype inside brackets to set the dtype of the categories, like this:
142+
143+
.. ipython:: python
144+
145+
s = pd.Series(["a", "b", "c", "a"], dtype="category[string]")
146+
s.dtype.categories
147+
148+
.. versionadded:: 2.1.0
149+
150+
The ability to specify the categories dtype in the dtype string was added in :ref:`v2.1.0 <whatsnew_210.enhancements.category_subtype>`.
151+
152+
To control those behaviors even more, instead of passing ``'category'``, use an instance
142153
of :class:`~pandas.api.types.CategoricalDtype`.
143154

144155
.. ipython:: python
145156
146157
from pandas.api.types import CategoricalDtype
147158
148159
s = pd.Series(["a", "b", "c", "a"])
149-
cat_type = CategoricalDtype(categories=["b", "c", "d"], ordered=True)
160+
cat_type = CategoricalDtype(["b", "c", "d"], ordered=True, categories_dtype="string")
150161
s_cat = s.astype(cat_type)
151162
s_cat
152163

doc/source/whatsnew/v2.1.0.rst

+17-4
Original file line numberDiff line numberDiff line change
@@ -14,12 +14,24 @@ including other versions of pandas.
1414
Enhancements
1515
~~~~~~~~~~~~
1616

17-
.. _whatsnew_210.enhancements.enhancement1:
17+
.. _whatsnew_210.enhancements.category_subtype:
18+
19+
Specific categorical string dtypes like ``dtype="category[string]"`` now work
20+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
21+
22+
When giving the string ``"category"`` as a dtype it is now possible to specify the dtype
23+
of the categories as part of the dtype string:
24+
25+
.. ipython:: python
26+
27+
ser = pd.Series(["a", "b", np.nan], dtype="category[string]")
28+
ser.dtype.categories
29+
30+
The expression inside the brackets can be any string that pandas accepts for a dtype and
31+
whose data can be stored in an :class:`Index` (:issue:`48515`).
1832

19-
enhancement1
20-
^^^^^^^^^^^^
2133

22-
.. _whatsnew_210.enhancements.enhancement2:
34+
.. _whatsnew_210.enhancements.map_na_action:
2335

2436
``map(func, na_action="ignore")`` now works for all array types
2537
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -99,6 +111,7 @@ Other enhancements
99111
- :meth:`Categorical.from_codes` has gotten a ``validate`` parameter (:issue:`50975`)
100112
- Added ``engine_kwargs`` parameter to :meth:`DataFrame.to_excel` (:issue:`53220`)
101113
- Performance improvement in :func:`concat` with homogeneous ``np.float64`` or ``np.float32`` dtypes (:issue:`52685`)
114+
- :class:`CategoricalDtype` has gotten a new parameter and attribute named :attr:`CategoricalDtype.categories_dtype` (:issue:`48515`)
102115
- Performance improvement in :meth:`DataFrame.filter` when ``items`` is given (:issue:`52941`)
103116
-
104117

pandas/core/arrays/categorical.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -453,7 +453,7 @@ def __init__(
453453
) from err
454454

455455
# we're inferring from values
456-
dtype = CategoricalDtype(categories, dtype.ordered)
456+
dtype = CategoricalDtype(categories, dtype.ordered, dtype.categories_dtype)
457457

458458
elif isinstance(values.dtype, CategoricalDtype):
459459
old_codes = extract_array(values)._codes

pandas/core/dtypes/dtypes.py

+48-14
Original file line numberDiff line numberDiff line change
@@ -152,13 +152,17 @@ class CategoricalDtype(PandasExtensionDtype, ExtensionDtype):
152152
----------
153153
categories : sequence, optional
154154
Must be unique, and must not contain any nulls.
155-
The categories are stored in an Index,
156-
and if an index is provided the dtype of that index will be used.
155+
The categories are stored in an Index.
157156
ordered : bool or None, default False
158157
Whether or not this categorical is treated as an ordered categorical.
159158
None can be used to maintain the ordered value of existing categoricals when
160159
used in operations that combine categoricals, e.g. astype, and will resolve to
161160
False if there is no existing ordered to maintain.
161+
categories_dtype : dtype, optional
162+
If given, will be the dtype of the categories.
163+
If not given, the categories dtype will be inferred.
164+
165+
.. versionadded:: 2.1.0
162166
163167
Attributes
164168
----------
@@ -181,14 +185,14 @@ class CategoricalDtype(PandasExtensionDtype, ExtensionDtype):
181185
182186
Examples
183187
--------
184-
>>> t = pd.CategoricalDtype(categories=['b', 'a'], ordered=True)
188+
>>> t = pd.CategoricalDtype(['b', 'a'], ordered=True, categories_dtype="string")
185189
>>> pd.Series(['a', 'b', 'a', 'c'], dtype=t)
186190
0 a
187191
1 b
188192
2 a
189193
3 NaN
190194
dtype: category
191-
Categories (2, object): ['b' < 'a']
195+
Categories (2, string): ['b' < 'a']
192196
193197
An empty CategoricalDtype with a specific dtype can be created
194198
by providing an empty index. As follows,
@@ -205,8 +209,19 @@ class CategoricalDtype(PandasExtensionDtype, ExtensionDtype):
205209
base = np.dtype("O")
206210
_metadata = ("categories", "ordered")
207211
_cache_dtypes: dict[str_type, PandasExtensionDtype] = {}
212+
_categories_dtype: Dtype | None = None
213+
_match = re.compile(r"category\[(?P<categories_dtype>.+)\]")
214+
215+
def __init__(
216+
self,
217+
categories=None,
218+
ordered: Ordered = False,
219+
categories_dtype: Dtype | None = None,
220+
) -> None:
221+
if categories_dtype is not None:
222+
from pandas.core.dtypes.common import pandas_dtype
208223

209-
def __init__(self, categories=None, ordered: Ordered = False) -> None:
224+
self._categories_dtype = pandas_dtype(categories_dtype)
210225
self._finalize(categories, ordered, fastpath=False)
211226

212227
@classmethod
@@ -352,12 +367,31 @@ def construct_from_string(cls, string: str_type) -> CategoricalDtype:
352367
raise TypeError(
353368
f"'construct_from_string' expects a string, got {type(string)}"
354369
)
355-
if string != cls.name:
356-
raise TypeError(f"Cannot construct a 'CategoricalDtype' from '{string}'")
357370

358371
# need ordered=None to ensure that operations specifying dtype="category" don't
359372
# override the ordered value for existing categoricals
360-
return cls(ordered=None)
373+
374+
if string == cls.name:
375+
return cls(ordered=None)
376+
377+
msg = f"Cannot construct a '{cls.__name__}' from '{string}'"
378+
match = cls._match.match(string)
379+
if match:
380+
d = match.groupdict()
381+
try:
382+
return cls(categories_dtype=d["categories_dtype"])
383+
except (KeyError, TypeError, ValueError) as err:
384+
# KeyError if the "categories_dtype" key is not found
385+
# TypeError or ValueError if the string inside the brackets is not a valid dtype
386+
raise TypeError(msg) from err
387+
raise TypeError(msg)
388+
389+
@property
390+
def categories_dtype(self) -> Dtype:
391+
if self.categories is None:
392+
return self._categories_dtype
393+
394+
return self.categories.dtype
361395

362396
def _finalize(self, categories, ordered: Ordered, fastpath: bool = False) -> None:
363397
if ordered is not None:
@@ -451,18 +485,16 @@ def __eq__(self, other: Any) -> bool:
451485
def __repr__(self) -> str_type:
452486
if self.categories is None:
453487
data = "None"
454-
dtype = "None"
455488
else:
456489
data = self.categories._format_data(name=type(self).__name__)
457490
if data is None:
458491
# self.categories is RangeIndex
459492
data = str(self.categories._range)
460493
data = data.rstrip(", ")
461-
dtype = self.categories.dtype
462494

463495
return (
464496
f"CategoricalDtype(categories={data}, ordered={self.ordered}, "
465-
f"categories_dtype={dtype})"
497+
f"categories_dtype={self.categories_dtype})"
466498
)
467499

468500
@cache_readonly
@@ -537,8 +569,7 @@ def validate_ordered(ordered: Ordered) -> None:
537569
if not is_bool(ordered):
538570
raise TypeError("'ordered' must either be 'True' or 'False'")
539571

540-
@staticmethod
541-
def validate_categories(categories, fastpath: bool = False) -> Index:
572+
def validate_categories(self, categories, fastpath: bool = False) -> Index:
542573
"""
543574
Validates that we have good categories
544575
@@ -558,8 +589,11 @@ def validate_categories(categories, fastpath: bool = False) -> Index:
558589
raise TypeError(
559590
f"Parameter 'categories' must be list-like, was {repr(categories)}"
560591
)
592+
dtype = self._categories_dtype
561593
if not isinstance(categories, ABCIndex):
562-
categories = Index._with_infer(categories, tupleize_cols=False)
594+
categories = Index._with_infer(categories, dtype=dtype, tupleize_cols=False)
595+
elif dtype is not None:
596+
categories = categories.astype(dtype)
563597

564598
if not fastpath:
565599
if categories.hasnans:

0 commit comments

Comments
 (0)