Skip to content

Commit fe972fb

Browse files
jschendeljreback
authored andcommitted
API: Allow ordered=None in CategoricalDtype (#18889)
1 parent 5e7fabc commit fe972fb

File tree

5 files changed

+168
-102
lines changed

5 files changed

+168
-102
lines changed

doc/source/whatsnew/v0.23.0.txt

+23
Original file line numberDiff line numberDiff line change
@@ -500,6 +500,29 @@ To restore previous behavior, simply set ``expand`` to ``False``:
500500
extracted
501501
type(extracted)
502502

503+
.. _whatsnew_0230.api_breaking.cdt_ordered:
504+
505+
Default value for the ``ordered`` parameter of ``CategoricalDtype``
506+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
507+
508+
The default value of the ``ordered`` parameter for :class:`~pandas.api.types.CategoricalDtype` has changed from ``False`` to ``None`` to allow updating of ``categories`` without impacting ``ordered``. Behavior should remain consistent for downstream objects, such as :class:`Categorical` (:issue:`18790`)
509+
510+
In previous versions, the default value for the ``ordered`` parameter was ``False``. This could potentially lead to the ``ordered`` parameter unintentionally being changed from ``True`` to ``False`` when users attempt to update ``categories`` if ``ordered`` is not explicitly specified, as it would silently default to ``False``. The new behavior for ``ordered=None`` is to retain the existing value of ``ordered``.
511+
512+
New Behavior:
513+
514+
.. ipython:: python
515+
516+
from pandas.api.types import CategoricalDtype
517+
cat = pd.Categorical(list('abcaba'), ordered=True, categories=list('cba'))
518+
cat
519+
cdt = CategoricalDtype(categories=list('cbad'))
520+
cat.astype(cdt)
521+
522+
Notice in the example above that the converted ``Categorical`` has retained ``ordered=True``. Had the default value for ``ordered`` remained as ``False``, the converted ``Categorical`` would have become unordered, despite ``ordered=False`` never being explicitly specified. To change the value of ``ordered``, explicitly pass it to the new dtype, e.g. ``CategoricalDtype(categories=list('cbad'), ordered=False)``.
523+
524+
Note that the unintentional conversion of ``ordered`` discussed above did not arise in previous versions due to separate bugs that prevented ``astype`` from doing any type of category to category conversion (:issue:`10696`, :issue:`18593`). These bugs have been fixed in this release, and motivated changing the default value of ``ordered``.
525+
503526
.. _whatsnew_0230.api:
504527

505528
Other API Changes

pandas/core/arrays/categorical.py

+6-6
Original file line numberDiff line numberDiff line change
@@ -243,7 +243,7 @@ class Categorical(ExtensionArray, PandasObject):
243243
# For comparisons, so that numpy uses our implementation of the compare
244244
# ops, which raise
245245
__array_priority__ = 1000
246-
_dtype = CategoricalDtype()
246+
_dtype = CategoricalDtype(ordered=False)
247247
_deprecations = frozenset(['labels'])
248248
_typ = 'categorical'
249249

@@ -294,7 +294,7 @@ def __init__(self, values, categories=None, ordered=None, dtype=None,
294294

295295
if fastpath:
296296
self._codes = coerce_indexer_dtype(values, categories)
297-
self._dtype = dtype
297+
self._dtype = self._dtype.update_dtype(dtype)
298298
return
299299

300300
# null_mask indicates missing values we want to exclude from inference.
@@ -358,7 +358,7 @@ def __init__(self, values, categories=None, ordered=None, dtype=None,
358358
full_codes[~null_mask] = codes
359359
codes = full_codes
360360

361-
self._dtype = dtype
361+
self._dtype = self._dtype.update_dtype(dtype)
362362
self._codes = coerce_indexer_dtype(codes, dtype.categories)
363363

364364
@property
@@ -438,7 +438,7 @@ def astype(self, dtype, copy=True):
438438
"""
439439
if is_categorical_dtype(dtype):
440440
# GH 10696/18593
441-
dtype = self.dtype._update_dtype(dtype)
441+
dtype = self.dtype.update_dtype(dtype)
442442
self = self.copy() if copy else self
443443
if dtype == self.dtype:
444444
return self
@@ -560,7 +560,7 @@ def from_codes(cls, codes, categories, ordered=False):
560560
raise ValueError(
561561
"codes need to be convertible to an arrays of integers")
562562

563-
categories = CategoricalDtype._validate_categories(categories)
563+
categories = CategoricalDtype.validate_categories(categories)
564564

565565
if len(codes) and (codes.max() >= len(categories) or codes.min() < -1):
566566
raise ValueError("codes need to be between -1 and "
@@ -1165,7 +1165,7 @@ def __setstate__(self, state):
11651165

11661166
# Provide compatibility with pre-0.15.0 Categoricals.
11671167
if '_categories' not in state and '_levels' in state:
1168-
state['_categories'] = self.dtype._validate_categories(state.pop(
1168+
state['_categories'] = self.dtype.validate_categories(state.pop(
11691169
'_levels'))
11701170
if '_codes' not in state and 'labels' in state:
11711171
state['_codes'] = coerce_indexer_dtype(

pandas/core/dtypes/dtypes.py

+36-18
Original file line numberDiff line numberDiff line change
@@ -159,11 +159,11 @@ class CategoricalDtype(PandasExtensionDtype):
159159
_metadata = ['categories', 'ordered']
160160
_cache = {}
161161

162-
def __init__(self, categories=None, ordered=False):
162+
def __init__(self, categories=None, ordered=None):
163163
self._finalize(categories, ordered, fastpath=False)
164164

165165
@classmethod
166-
def _from_fastpath(cls, categories=None, ordered=False):
166+
def _from_fastpath(cls, categories=None, ordered=None):
167167
self = cls.__new__(cls)
168168
self._finalize(categories, ordered, fastpath=True)
169169
return self
@@ -180,14 +180,12 @@ def _from_categorical_dtype(cls, dtype, categories=None, ordered=None):
180180

181181
def _finalize(self, categories, ordered, fastpath=False):
182182

183-
if ordered is None:
184-
ordered = False
185-
else:
186-
self._validate_ordered(ordered)
183+
if ordered is not None:
184+
self.validate_ordered(ordered)
187185

188186
if categories is not None:
189-
categories = self._validate_categories(categories,
190-
fastpath=fastpath)
187+
categories = self.validate_categories(categories,
188+
fastpath=fastpath)
191189

192190
self._categories = categories
193191
self._ordered = ordered
@@ -208,6 +206,17 @@ def __hash__(self):
208206
return int(self._hash_categories(self.categories, self.ordered))
209207

210208
def __eq__(self, other):
209+
"""
210+
Rules for CDT equality:
211+
1) Any CDT is equal to the string 'category'
212+
2) Any CDT is equal to a CDT with categories=None regardless of ordered
213+
3) A CDT with ordered=True is only equal to another CDT with
214+
ordered=True and identical categories in the same order
215+
4) A CDT with ordered={False, None} is only equal to another CDT with
216+
ordered={False, None} and identical categories, but same order is
217+
not required. There is no distinction between False/None.
218+
5) Any other comparison returns False
219+
"""
211220
if isinstance(other, compat.string_types):
212221
return other == self.name
213222

@@ -220,12 +229,16 @@ def __eq__(self, other):
220229
# CDT(., .) = CDT(None, False) and *all*
221230
# CDT(., .) = CDT(None, True).
222231
return True
223-
elif self.ordered:
224-
return other.ordered and self.categories.equals(other.categories)
225-
elif other.ordered:
226-
return False
232+
elif self.ordered or other.ordered:
233+
# At least one has ordered=True; equal if both have ordered=True
234+
# and the same values for categories in the same order.
235+
return ((self.ordered == other.ordered) and
236+
self.categories.equals(other.categories))
227237
else:
228-
# both unordered; this could probably be optimized / cached
238+
# Neither has ordered=True; equal if both have the same categories,
239+
# but same order is not necessary. There is no distinction between
240+
# ordered=False and ordered=None: CDT(., False) and CDT(., None)
241+
# will be equal if they have the same categories.
229242
return hash(self) == hash(other)
230243

231244
def __repr__(self):
@@ -288,7 +301,7 @@ def construct_from_string(cls, string):
288301
raise TypeError("cannot construct a CategoricalDtype")
289302

290303
@staticmethod
291-
def _validate_ordered(ordered):
304+
def validate_ordered(ordered):
292305
"""
293306
Validates that we have a valid ordered parameter. If
294307
it is not a boolean, a TypeError will be raised.
@@ -308,7 +321,7 @@ def _validate_ordered(ordered):
308321
raise TypeError("'ordered' must either be 'True' or 'False'")
309322

310323
@staticmethod
311-
def _validate_categories(categories, fastpath=False):
324+
def validate_categories(categories, fastpath=False):
312325
"""
313326
Validates that we have good categories
314327
@@ -340,7 +353,7 @@ def _validate_categories(categories, fastpath=False):
340353

341354
return categories
342355

343-
def _update_dtype(self, dtype):
356+
def update_dtype(self, dtype):
344357
"""
345358
Returns a CategoricalDtype with categories and ordered taken from dtype
346359
if specified, otherwise falling back to self if unspecified
@@ -361,11 +374,16 @@ def _update_dtype(self, dtype):
361374
'got {dtype!r}').format(dtype=dtype)
362375
raise ValueError(msg)
363376

364-
# dtype is CDT: keep current categories if None (ordered can't be None)
377+
# dtype is CDT: keep current categories/ordered if None
365378
new_categories = dtype.categories
366379
if new_categories is None:
367380
new_categories = self.categories
368-
return CategoricalDtype(new_categories, dtype.ordered)
381+
382+
new_ordered = dtype.ordered
383+
if new_ordered is None:
384+
new_ordered = self.ordered
385+
386+
return CategoricalDtype(new_categories, new_ordered)
369387

370388
@property
371389
def categories(self):

pandas/core/indexes/category.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -344,7 +344,7 @@ def astype(self, dtype, copy=True):
344344
return IntervalIndex(np.array(self))
345345
elif is_categorical_dtype(dtype):
346346
# GH 18630
347-
dtype = self.dtype._update_dtype(dtype)
347+
dtype = self.dtype.update_dtype(dtype)
348348
if dtype == self.dtype:
349349
return self.copy() if copy else self
350350

0 commit comments

Comments
 (0)