Skip to content

Commit 8393e37

Browse files
authored
DEPR: Deprecate ordered=None for CategoricalDtype (#26403)
1 parent 7ab9ff5 commit 8393e37

File tree

12 files changed

+163
-48
lines changed

12 files changed

+163
-48
lines changed

doc/source/whatsnew/v0.23.0.rst

+16-6
Original file line numberDiff line numberDiff line change
@@ -935,13 +935,23 @@ In previous versions, the default value for the ``ordered`` parameter was ``Fals
935935

936936
New behavior:
937937

938-
.. ipython:: python
938+
.. code-block:: ipython
939939
940-
from pandas.api.types import CategoricalDtype
941-
cat = pd.Categorical(list('abcaba'), ordered=True, categories=list('cba'))
942-
cat
943-
cdt = CategoricalDtype(categories=list('cbad'))
944-
cat.astype(cdt)
940+
In [2]: from pandas.api.types import CategoricalDtype
941+
942+
In [3]: cat = pd.Categorical(list('abcaba'), ordered=True, categories=list('cba'))
943+
944+
In [4]: cat
945+
Out[4]:
946+
[a, b, c, a, b, a]
947+
Categories (3, object): [c < b < a]
948+
949+
In [5]: cdt = CategoricalDtype(categories=list('cbad'))
950+
951+
In [6]: cat.astype(cdt)
952+
Out[6]:
953+
[a, b, c, a, b, a]
954+
Categories (4, object): [c < b < a < d]
945955
946956
Notice in the example above that the converted ``Categorical`` has retained ``ordered=True``. Had the default value for ``ordered`` remained as ``False``, the converted ``Categorical`` would have become unordered, despite ``ordered=False`` never being explicitly specified. To change the value of ``ordered``, explicitly pass it to the new dtype, e.g. ``CategoricalDtype(categories=list('cbad'), ordered=False)``.
947957

doc/source/whatsnew/v0.25.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -761,6 +761,7 @@ Other deprecations
761761
- :attr:`Series.imag` and :attr:`Series.real` are deprecated. (:issue:`18262`)
762762
- :meth:`Series.put` is deprecated. (:issue:`18262`)
763763
- :meth:`Index.item` and :meth:`Series.item` are deprecated. (:issue:`18262`)
764+
- The default value ``ordered=None`` in :class:`~pandas.api.types.CategoricalDtype` has been deprecated in favor of ``ordered=False``. When converting between categorical types, ``ordered=True`` must be explicitly passed in order to be preserved. (:issue:`26336`)
764765
- :meth:`Index.contains` is deprecated. Use ``key in index`` (``__contains__``) instead (:issue:`17753`).
765766
766767
.. _whatsnew_0250.prior_deprecations:

pandas/core/arrays/categorical.py

+5-5
Original file line numberDiff line numberDiff line change
@@ -332,7 +332,7 @@ def __init__(self, values, categories=None, ordered=None, dtype=None,
332332
# sanitize input
333333
if is_categorical_dtype(values):
334334
if dtype.categories is None:
335-
dtype = CategoricalDtype(values.categories, dtype.ordered)
335+
dtype = CategoricalDtype(values.categories, dtype._ordered)
336336
elif not isinstance(values, (ABCIndexClass, ABCSeries)):
337337
# sanitize_array coerces np.nan to a string under certain versions
338338
# of numpy
@@ -355,7 +355,7 @@ def __init__(self, values, categories=None, ordered=None, dtype=None,
355355
codes, categories = factorize(values, sort=True)
356356
except TypeError:
357357
codes, categories = factorize(values, sort=False)
358-
if dtype.ordered:
358+
if dtype._ordered:
359359
# raise, as we don't have a sortable data structure and so
360360
# the user should give us one by specifying categories
361361
raise TypeError("'values' is not ordered, please "
@@ -368,7 +368,7 @@ def __init__(self, values, categories=None, ordered=None, dtype=None,
368368
"supported at this time")
369369

370370
# we're inferring from values
371-
dtype = CategoricalDtype(categories, dtype.ordered)
371+
dtype = CategoricalDtype(categories, dtype._ordered)
372372

373373
elif is_categorical_dtype(values):
374374
old_codes = (values._values.codes if isinstance(values, ABCSeries)
@@ -433,7 +433,7 @@ def ordered(self):
433433
"""
434434
Whether the categories have an ordered relationship.
435435
"""
436-
return self.dtype.ordered
436+
return self.dtype._ordered
437437

438438
@property
439439
def dtype(self) -> CategoricalDtype:
@@ -847,7 +847,7 @@ def set_categories(self, new_categories, ordered=None, rename=False,
847847
"""
848848
inplace = validate_bool_kwarg(inplace, 'inplace')
849849
if ordered is None:
850-
ordered = self.dtype.ordered
850+
ordered = self.dtype._ordered
851851
new_dtype = CategoricalDtype(new_categories, ordered=ordered)
852852

853853
cat = self if inplace else self.copy()

pandas/core/dtypes/dtypes.py

+44-18
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,13 @@
1717

1818
str_type = str
1919

20+
# GH26403: sentinel value used for the default value of ordered in the
21+
# CategoricalDtype constructor to detect when ordered=None is explicitly passed
22+
ordered_sentinel = object() # type: object
23+
24+
# TODO(GH26403): Replace with Optional[bool] or bool
25+
OrderedType = Union[None, bool, object]
26+
2027

2128
def register_extension_dtype(cls: Type[ExtensionDtype],
2229
) -> Type[ExtensionDtype]:
@@ -214,7 +221,9 @@ class CategoricalDtype(PandasExtensionDtype, ExtensionDtype):
214221
_metadata = ('categories', 'ordered')
215222
_cache = {} # type: Dict[str_type, PandasExtensionDtype]
216223

217-
def __init__(self, categories=None, ordered: Optional[bool] = None):
224+
def __init__(self,
225+
categories=None,
226+
ordered: OrderedType = ordered_sentinel):
218227
self._finalize(categories, ordered, fastpath=False)
219228

220229
@classmethod
@@ -230,7 +239,7 @@ def _from_fastpath(cls,
230239
def _from_categorical_dtype(cls,
231240
dtype: 'CategoricalDtype',
232241
categories=None,
233-
ordered: Optional[bool] = None,
242+
ordered: OrderedType = None,
234243
) -> 'CategoricalDtype':
235244
if categories is ordered is None:
236245
return dtype
@@ -330,19 +339,20 @@ def _from_values_or_dtype(cls,
330339

331340
def _finalize(self,
332341
categories,
333-
ordered: Optional[bool],
342+
ordered: OrderedType,
334343
fastpath: bool = False,
335344
) -> None:
336345

337-
if ordered is not None:
346+
if ordered is not None and ordered is not ordered_sentinel:
338347
self.validate_ordered(ordered)
339348

340349
if categories is not None:
341350
categories = self.validate_categories(categories,
342351
fastpath=fastpath)
343352

344353
self._categories = categories
345-
self._ordered = ordered
354+
self._ordered = ordered if ordered is not ordered_sentinel else None
355+
self._ordered_from_sentinel = ordered is ordered_sentinel
346356

347357
def __setstate__(self, state: Dict[str_type, Any]) -> None:
348358
# for pickle compat. __get_state__ is defined in the
@@ -355,12 +365,12 @@ def __hash__(self) -> int:
355365
# _hash_categories returns a uint64, so use the negative
356366
# space for when we have unknown categories to avoid a conflict
357367
if self.categories is None:
358-
if self.ordered:
368+
if self._ordered:
359369
return -1
360370
else:
361371
return -2
362372
# We *do* want to include the real self.ordered here
363-
return int(self._hash_categories(self.categories, self.ordered))
373+
return int(self._hash_categories(self.categories, self._ordered))
364374

365375
def __eq__(self, other: Any) -> bool:
366376
"""
@@ -379,7 +389,7 @@ def __eq__(self, other: Any) -> bool:
379389
return other == self.name
380390
elif other is self:
381391
return True
382-
elif not (hasattr(other, 'ordered') and hasattr(other, 'categories')):
392+
elif not (hasattr(other, '_ordered') and hasattr(other, 'categories')):
383393
return False
384394
elif self.categories is None or other.categories is None:
385395
# We're forced into a suboptimal corner thanks to math and
@@ -388,10 +398,10 @@ def __eq__(self, other: Any) -> bool:
388398
# CDT(., .) = CDT(None, False) and *all*
389399
# CDT(., .) = CDT(None, True).
390400
return True
391-
elif self.ordered or other.ordered:
401+
elif self._ordered or other._ordered:
392402
# At least one has ordered=True; equal if both have ordered=True
393403
# and the same values for categories in the same order.
394-
return ((self.ordered == other.ordered) and
404+
return ((self._ordered == other._ordered) and
395405
self.categories.equals(other.categories))
396406
else:
397407
# Neither has ordered=True; equal if both have the same categories,
@@ -406,10 +416,10 @@ def __repr__(self):
406416
data = "None, "
407417
else:
408418
data = self.categories._format_data(name=self.__class__.__name__)
409-
return tpl.format(data, self.ordered)
419+
return tpl.format(data, self._ordered)
410420

411421
@staticmethod
412-
def _hash_categories(categories, ordered: Optional[bool] = True) -> int:
422+
def _hash_categories(categories, ordered: OrderedType = True) -> int:
413423
from pandas.core.util.hashing import (
414424
hash_array, _combine_hash_arrays, hash_tuples
415425
)
@@ -459,7 +469,7 @@ def construct_array_type(cls):
459469
return Categorical
460470

461471
@staticmethod
462-
def validate_ordered(ordered: bool) -> None:
472+
def validate_ordered(ordered: OrderedType) -> None:
463473
"""
464474
Validates that we have a valid ordered parameter. If
465475
it is not a boolean, a TypeError will be raised.
@@ -534,17 +544,25 @@ def update_dtype(self, dtype: 'CategoricalDtype') -> 'CategoricalDtype':
534544
msg = ('a CategoricalDtype must be passed to perform an update, '
535545
'got {dtype!r}').format(dtype=dtype)
536546
raise ValueError(msg)
537-
elif dtype.categories is not None and dtype.ordered is self.ordered:
538-
return dtype
539547

540548
# dtype is CDT: keep current categories/ordered if None
541549
new_categories = dtype.categories
542550
if new_categories is None:
543551
new_categories = self.categories
544552

545-
new_ordered = dtype.ordered
553+
new_ordered = dtype._ordered
554+
new_ordered_from_sentinel = dtype._ordered_from_sentinel
546555
if new_ordered is None:
547-
new_ordered = self.ordered
556+
# maintain existing ordered if new dtype has ordered=None
557+
new_ordered = self._ordered
558+
if self._ordered and new_ordered_from_sentinel:
559+
# only warn if we'd actually change the existing behavior
560+
msg = ("Constructing a CategoricalDtype without specifying "
561+
"`ordered` will default to `ordered=False` in a future "
562+
"version, which will cause the resulting categorical's "
563+
"`ordered` attribute to change to False; `ordered=True`"
564+
" must be explicitly passed in order to be retained")
565+
warnings.warn(msg, FutureWarning, stacklevel=3)
548566

549567
return CategoricalDtype(new_categories, new_ordered)
550568

@@ -556,10 +574,18 @@ def categories(self):
556574
return self._categories
557575

558576
@property
559-
def ordered(self) -> Optional[bool]:
577+
def ordered(self) -> OrderedType:
560578
"""
561579
Whether the categories have an ordered relationship.
562580
"""
581+
# TODO(GH26403): remove this if block once the deprecated ordered=None default is replaced by ordered=False
582+
if self._ordered_from_sentinel and self._ordered is None:
583+
# warn when accessing ordered if ordered=None and None was not
584+
# explicitly passed to the constructor
585+
msg = ("Constructing a CategoricalDtype without specifying "
586+
"`ordered` will default to `ordered=False` in a future "
587+
"version; `ordered=None` must be explicitly passed.")
588+
warnings.warn(msg, FutureWarning, stacklevel=2)
563589
return self._ordered
564590

565591
@property

pandas/core/internals/construction.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -725,7 +725,7 @@ def _try_cast(arr, dtype, copy, raise_cast_failure):
725725
# We *do* allow casting to categorical, since we know
726726
# that Categorical is the only array type for 'category'.
727727
subarr = Categorical(arr, dtype.categories,
728-
ordered=dtype.ordered)
728+
ordered=dtype._ordered)
729729
elif is_extension_array_dtype(dtype):
730730
# create an extension array from its dtype
731731
array_type = dtype.construct_array_type()._from_sequence

pandas/core/series.py

+7-1
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919
from pandas.util._validators import validate_bool_kwarg
2020

2121
from pandas.core.dtypes.common import (
22-
_is_unorderable_exception, ensure_platform_int, is_bool,
22+
_is_unorderable_exception, ensure_platform_int, is_bool, is_categorical,
2323
is_categorical_dtype, is_datetime64_dtype, is_datetimelike, is_dict_like,
2424
is_extension_array_dtype, is_extension_type, is_hashable, is_integer,
2525
is_iterator, is_list_like, is_scalar, is_string_like, is_timedelta64_dtype)
@@ -170,6 +170,12 @@ def __init__(self, data=None, index=None, dtype=None, name=None,
170170
if data is None:
171171
data = {}
172172
if dtype is not None:
173+
# GH 26336: explicitly handle 'category' to avoid warning
174+
# TODO: Remove after CategoricalDtype defaults to ordered=False
175+
if (isinstance(dtype, str) and dtype == 'category' and
176+
is_categorical(data)):
177+
dtype = data.dtype
178+
173179
dtype = self._validate_dtype(dtype)
174180

175181
if isinstance(data, MultiIndex):

pandas/io/packers.py

+2-7
Original file line numberDiff line numberDiff line change
@@ -618,14 +618,9 @@ def decode(obj):
618618
return Interval(obj['left'], obj['right'], obj['closed'])
619619
elif typ == 'series':
620620
dtype = dtype_for(obj['dtype'])
621-
pd_dtype = pandas_dtype(dtype)
622-
623621
index = obj['index']
624-
result = Series(unconvert(obj['data'], dtype, obj['compress']),
625-
index=index,
626-
dtype=pd_dtype,
627-
name=obj['name'])
628-
return result
622+
data = unconvert(obj['data'], dtype, obj['compress'])
623+
return Series(data, index=index, dtype=dtype, name=obj['name'])
629624

630625
elif typ == 'block_manager':
631626
axes = obj['axes']

pandas/tests/arrays/categorical/test_dtypes.py

+8
Original file line numberDiff line numberDiff line change
@@ -160,6 +160,14 @@ def test_astype_category(self, dtype_ordered, cat_ordered):
160160
expected = cat
161161
tm.assert_categorical_equal(result, expected)
162162

163+
def test_astype_category_ordered_none_deprecated(self):
164+
# GH 26336
165+
cdt1 = CategoricalDtype(categories=list('cdab'), ordered=True)
166+
cdt2 = CategoricalDtype(categories=list('cedafb'))
167+
cat = Categorical(list('abcdaba'), dtype=cdt1)
168+
with tm.assert_produces_warning(FutureWarning):
169+
cat.astype(cdt2)
170+
163171
def test_iter_python_types(self):
164172
# GH-19909
165173
cat = Categorical([1, 2])

pandas/tests/dtypes/test_dtypes.py

+26-7
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,8 @@
1010
is_datetime64tz_dtype, is_datetimetz, is_dtype_equal, is_interval_dtype,
1111
is_period, is_period_dtype, is_string_dtype)
1212
from pandas.core.dtypes.dtypes import (
13-
CategoricalDtype, DatetimeTZDtype, IntervalDtype, PeriodDtype, registry)
13+
CategoricalDtype, DatetimeTZDtype, IntervalDtype, PeriodDtype,
14+
ordered_sentinel, registry)
1415

1516
import pandas as pd
1617
from pandas import (
@@ -54,7 +55,8 @@ def test_pickle(self):
5455
class TestCategoricalDtype(Base):
5556

5657
def create(self):
57-
return CategoricalDtype()
58+
# TODO(GH 26403): Remove when default ordered becomes False
59+
return CategoricalDtype(ordered=None)
5860

5961
def test_pickle(self):
6062
# make sure our cache is NOT pickled
@@ -675,7 +677,8 @@ def test_unordered_same(self, ordered):
675677
def test_categories(self):
676678
result = CategoricalDtype(['a', 'b', 'c'])
677679
tm.assert_index_equal(result.categories, pd.Index(['a', 'b', 'c']))
678-
assert result.ordered is None
680+
with tm.assert_produces_warning(FutureWarning):
681+
assert result.ordered is None
679682

680683
def test_equal_but_different(self, ordered_fixture):
681684
c1 = CategoricalDtype([1, 2, 3])
@@ -804,7 +807,8 @@ def test_categorical_categories(self):
804807

805808
@pytest.mark.parametrize('new_categories', [
806809
list('abc'), list('cba'), list('wxyz'), None])
807-
@pytest.mark.parametrize('new_ordered', [True, False, None])
810+
@pytest.mark.parametrize('new_ordered', [
811+
True, False, None, ordered_sentinel])
808812
def test_update_dtype(self, ordered_fixture, new_categories, new_ordered):
809813
dtype = CategoricalDtype(list('abc'), ordered_fixture)
810814
new_dtype = CategoricalDtype(new_categories, new_ordered)
@@ -813,11 +817,18 @@ def test_update_dtype(self, ordered_fixture, new_categories, new_ordered):
813817
if expected_categories is None:
814818
expected_categories = dtype.categories
815819

816-
expected_ordered = new_dtype.ordered
817-
if expected_ordered is None:
820+
expected_ordered = new_ordered
821+
if new_ordered is ordered_sentinel or new_ordered is None:
818822
expected_ordered = dtype.ordered
819823

820-
result = dtype.update_dtype(new_dtype)
824+
# GH 26336
825+
if new_ordered is ordered_sentinel and ordered_fixture is True:
826+
with tm.assert_produces_warning(FutureWarning,
827+
check_stacklevel=False):
828+
result = dtype.update_dtype(new_dtype)
829+
else:
830+
result = dtype.update_dtype(new_dtype)
831+
821832
tm.assert_index_equal(result.categories, expected_categories)
822833
assert result.ordered is expected_ordered
823834

@@ -837,6 +848,14 @@ def test_update_dtype_errors(self, bad_dtype):
837848
with pytest.raises(ValueError, match=msg):
838849
dtype.update_dtype(bad_dtype)
839850

851+
@pytest.mark.parametrize('ordered', [ordered_sentinel, None, True, False])
852+
def test_ordered_none_default_deprecated(self, ordered):
853+
# GH 26403: CDT.ordered only warns if ordered is not explicitly passed
854+
dtype = CategoricalDtype(list('abc'), ordered=ordered)
855+
warning = FutureWarning if ordered is ordered_sentinel else None
856+
with tm.assert_produces_warning(warning):
857+
dtype.ordered
858+
840859

841860
@pytest.mark.parametrize('dtype', [
842861
CategoricalDtype,

0 commit comments

Comments
 (0)