@@ -152,13 +152,17 @@ class CategoricalDtype(PandasExtensionDtype, ExtensionDtype):
152
152
----------
153
153
categories : sequence, optional
154
154
Must be unique, and must not contain any nulls.
155
- The categories are stored in an Index,
156
- and if an index is provided the dtype of that index will be used.
155
+ The categories are stored in an Index.
157
156
ordered : bool or None, default False
158
157
Whether or not this categorical is treated as an ordered categorical.
159
158
None can be used to maintain the ordered value of existing categoricals when
160
159
used in operations that combine categoricals, e.g. astype, and will resolve to
161
160
False if there is no existing ordered to maintain.
161
+ categories_dtype : dtype, optional
162
+ If given, will be the dtype of the categories.
163
+ If not given, the categories dtype will be inferred.
164
+
165
+ .. versionadded:: 2.1.0
162
166
163
167
Attributes
164
168
----------
@@ -181,14 +185,14 @@ class CategoricalDtype(PandasExtensionDtype, ExtensionDtype):
181
185
182
186
Examples
183
187
--------
184
- >>> t = pd.CategoricalDtype(categories= ['b', 'a'], ordered=True)
188
+ >>> t = pd.CategoricalDtype(['b', 'a'], ordered=True, categories_dtype="string" )
185
189
>>> pd.Series(['a', 'b', 'a', 'c'], dtype=t)
186
190
0 a
187
191
1 b
188
192
2 a
189
193
3 NaN
190
194
dtype: category
191
- Categories (2, object ): ['b' < 'a']
195
+ Categories (2, string ): ['b' < 'a']
192
196
193
197
An empty CategoricalDtype with a specific dtype can be created
194
198
by providing an empty index, as follows:
@@ -205,8 +209,19 @@ class CategoricalDtype(PandasExtensionDtype, ExtensionDtype):
205
209
base = np .dtype ("O" )
206
210
_metadata = ("categories" , "ordered" )
207
211
_cache_dtypes : dict [str_type , PandasExtensionDtype ] = {}
212
+ _categories_dtype : Dtype | None = None
213
+ _match = re .compile (r"category\[(?P<categories_dtype>.+)\]" )
214
+
215
+ def __init__ (
216
+ self ,
217
+ categories = None ,
218
+ ordered : Ordered = False ,
219
+ categories_dtype : Dtype | None = None ,
220
+ ) -> None :
221
+ if categories_dtype is not None :
222
+ from pandas .core .dtypes .common import pandas_dtype
208
223
209
- def __init__ ( self , categories = None , ordered : Ordered = False ) -> None :
224
+ self . _categories_dtype = pandas_dtype ( categories_dtype )
210
225
self ._finalize (categories , ordered , fastpath = False )
211
226
212
227
@classmethod
@@ -352,12 +367,31 @@ def construct_from_string(cls, string: str_type) -> CategoricalDtype:
352
367
raise TypeError (
353
368
f"'construct_from_string' expects a string, got { type (string )} "
354
369
)
355
- if string != cls .name :
356
- raise TypeError (f"Cannot construct a 'CategoricalDtype' from '{ string } '" )
357
370
358
371
# need ordered=None to ensure that operations specifying dtype="category" don't
359
372
# override the ordered value for existing categoricals
360
- return cls (ordered = None )
373
+
374
+ if string == cls .name :
375
+ return cls (ordered = None )
376
+
377
+ msg = f"Cannot construct a '{ cls .__name__ } ' from '{ string } '"
378
+ match = cls ._match .match (string )
379
+ if match :
380
+ d = match .groupdict ()
381
+ try :
382
+ return cls (categories_dtype = d ["categories_dtype" ])
383
+ except (KeyError , TypeError , ValueError ) as err :
384
+ # KeyError if the "categories_dtype" key is not found
385
+ # TypeError (or ValueError) if the captured string is not a valid dtype
386
+ raise TypeError (msg ) from err
387
+ raise TypeError (msg )
388
+
389
+ @property
390
+ def categories_dtype (self ) -> Dtype :
391
+ if self .categories is None :
392
+ return self ._categories_dtype
393
+
394
+ return self .categories .dtype
361
395
362
396
def _finalize (self , categories , ordered : Ordered , fastpath : bool = False ) -> None :
363
397
if ordered is not None :
@@ -451,18 +485,16 @@ def __eq__(self, other: Any) -> bool:
451
485
def __repr__ (self ) -> str_type :
452
486
if self .categories is None :
453
487
data = "None"
454
- dtype = "None"
455
488
else :
456
489
data = self .categories ._format_data (name = type (self ).__name__ )
457
490
if data is None :
458
491
# self.categories is RangeIndex
459
492
data = str (self .categories ._range )
460
493
data = data .rstrip (", " )
461
- dtype = self .categories .dtype
462
494
463
495
return (
464
496
f"CategoricalDtype(categories={ data } , ordered={ self .ordered } , "
465
- f"categories_dtype={ dtype } )"
497
+ f"categories_dtype={ self . categories_dtype } )"
466
498
)
467
499
468
500
@cache_readonly
@@ -537,8 +569,7 @@ def validate_ordered(ordered: Ordered) -> None:
537
569
if not is_bool (ordered ):
538
570
raise TypeError ("'ordered' must either be 'True' or 'False'" )
539
571
540
- @staticmethod
541
- def validate_categories (categories , fastpath : bool = False ) -> Index :
572
+ def validate_categories (self , categories , fastpath : bool = False ) -> Index :
542
573
"""
543
574
Validates that we have good categories
544
575
@@ -558,8 +589,11 @@ def validate_categories(categories, fastpath: bool = False) -> Index:
558
589
raise TypeError (
559
590
f"Parameter 'categories' must be list-like, was { repr (categories )} "
560
591
)
592
+ dtype = self ._categories_dtype
561
593
if not isinstance (categories , ABCIndex ):
562
- categories = Index ._with_infer (categories , tupleize_cols = False )
594
+ categories = Index ._with_infer (categories , dtype = dtype , tupleize_cols = False )
595
+ elif dtype is not None :
596
+ categories = categories .astype (dtype )
563
597
564
598
if not fastpath :
565
599
if categories .hasnans :
0 commit comments