Skip to content

Commit cf6107e

Browse files
committed
API: Allow ordered=None in CategoricalDtype
1 parent c19bdc9 commit cf6107e

File tree

4 files changed

+109
-92
lines changed

4 files changed

+109
-92
lines changed

doc/source/whatsnew/v0.23.0.txt

+1
Original file line numberDiff line numberDiff line change
@@ -208,6 +208,7 @@ Other API Changes
208208
- In :func:`read_excel`, the ``comment`` argument is now exposed as a named parameter (:issue:`18735`)
209209
- Rearranged the order of keyword arguments in :func:`read_excel()` to align with :func:`read_csv()` (:issue:`16672`)
210210
- The options ``html.border`` and ``mode.use_inf_as_null`` were deprecated in prior versions; these will now show ``FutureWarning`` rather than a ``DeprecationWarning`` (:issue:`19003`)
211+
- The default value of the ``ordered`` parameter for :class:`~pandas.api.types.CategoricalDtype` has changed from ``False`` to ``None``. Behavior should remain consistent for downstream objects, such as :class:`Categorical` (:issue:`18790`)
211212

212213
.. _whatsnew_0230.deprecations:
213214

pandas/core/categorical.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -242,7 +242,7 @@ class Categorical(PandasObject):
242242
# For comparisons, so that numpy uses our implementation of the compare
243243
# ops, which raise
244244
__array_priority__ = 1000
245-
_dtype = CategoricalDtype()
245+
_dtype = CategoricalDtype(ordered=False)
246246
_deprecations = frozenset(['labels'])
247247
_typ = 'categorical'
248248

@@ -293,7 +293,7 @@ def __init__(self, values, categories=None, ordered=None, dtype=None,
293293

294294
if fastpath:
295295
self._codes = coerce_indexer_dtype(values, categories)
296-
self._dtype = dtype
296+
self._dtype = self._dtype._update_dtype(dtype)
297297
return
298298

299299
# null_mask indicates missing values we want to exclude from inference.
@@ -357,7 +357,7 @@ def __init__(self, values, categories=None, ordered=None, dtype=None,
357357
full_codes[~null_mask] = codes
358358
codes = full_codes
359359

360-
self._dtype = dtype
360+
self._dtype = self._dtype._update_dtype(dtype)
361361
self._codes = coerce_indexer_dtype(codes, dtype.categories)
362362

363363
@property

pandas/core/dtypes/dtypes.py

+14-11
Original file line numberDiff line numberDiff line change
@@ -159,11 +159,11 @@ class CategoricalDtype(ExtensionDtype):
159159
_metadata = ['categories', 'ordered']
160160
_cache = {}
161161

162-
def __init__(self, categories=None, ordered=False):
162+
def __init__(self, categories=None, ordered=None):
163163
self._finalize(categories, ordered, fastpath=False)
164164

165165
@classmethod
166-
def _from_fastpath(cls, categories=None, ordered=False):
166+
def _from_fastpath(cls, categories=None, ordered=None):
167167
self = cls.__new__(cls)
168168
self._finalize(categories, ordered, fastpath=True)
169169
return self
@@ -180,9 +180,7 @@ def _from_categorical_dtype(cls, dtype, categories=None, ordered=None):
180180

181181
def _finalize(self, categories, ordered, fastpath=False):
182182

183-
if ordered is None:
184-
ordered = False
185-
else:
183+
if ordered is not None:
186184
self._validate_ordered(ordered)
187185

188186
if categories is not None:
@@ -220,10 +218,10 @@ def __eq__(self, other):
220218
# CDT(., .) = CDT(None, False) and *all*
221219
# CDT(., .) = CDT(None, True).
222220
return True
223-
elif self.ordered:
224-
return other.ordered and self.categories.equals(other.categories)
225-
elif other.ordered:
226-
return False
221+
elif self.ordered or other.ordered:
222+
# at least one ordered
223+
return ((self.ordered == other.ordered) and
224+
self.categories.equals(other.categories))
227225
else:
228226
# both unordered; this could probably be optimized / cached
229227
return hash(self) == hash(other)
@@ -361,11 +359,16 @@ def _update_dtype(self, dtype):
361359
'got {dtype!r}').format(dtype=dtype)
362360
raise ValueError(msg)
363361

364-
# dtype is CDT: keep current categories if None (ordered can't be None)
362+
# dtype is CDT: keep current categories/ordered if None
365363
new_categories = dtype.categories
366364
if new_categories is None:
367365
new_categories = self.categories
368-
return CategoricalDtype(new_categories, dtype.ordered)
366+
367+
new_ordered = dtype.ordered
368+
if new_ordered is None:
369+
new_ordered = self.ordered
370+
371+
return CategoricalDtype(new_categories, new_ordered)
369372

370373
@property
371374
def categories(self):

pandas/tests/dtypes/test_dtypes.py

+91-78
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,6 @@
99
from pandas import (
1010
Series, Categorical, CategoricalIndex, IntervalIndex, date_range)
1111

12-
from pandas.compat import string_types
1312
from pandas.core.dtypes.dtypes import (
1413
DatetimeTZDtype, PeriodDtype,
1514
IntervalDtype, CategoricalDtype)
@@ -24,6 +23,11 @@
2423
import pandas.util.testing as tm
2524

2625

26+
@pytest.fixture(scope='class', params=[True, False, None])
27+
def ordered(request):
28+
return request.param
29+
30+
2731
class Base(object):
2832

2933
def setup_method(self, method):
@@ -124,41 +128,6 @@ def test_tuple_categories(self):
124128
result = CategoricalDtype(categories)
125129
assert all(result.categories == categories)
126130

127-
@pytest.mark.parametrize('dtype', [
128-
CategoricalDtype(list('abc'), False),
129-
CategoricalDtype(list('abc'), True)])
130-
@pytest.mark.parametrize('new_dtype', [
131-
'category',
132-
CategoricalDtype(None, False),
133-
CategoricalDtype(None, True),
134-
CategoricalDtype(list('abc'), False),
135-
CategoricalDtype(list('abc'), True),
136-
CategoricalDtype(list('cba'), False),
137-
CategoricalDtype(list('cba'), True),
138-
CategoricalDtype(list('wxyz'), False),
139-
CategoricalDtype(list('wxyz'), True)])
140-
def test_update_dtype(self, dtype, new_dtype):
141-
if isinstance(new_dtype, string_types) and new_dtype == 'category':
142-
expected_categories = dtype.categories
143-
expected_ordered = dtype.ordered
144-
else:
145-
expected_categories = new_dtype.categories
146-
if expected_categories is None:
147-
expected_categories = dtype.categories
148-
expected_ordered = new_dtype.ordered
149-
150-
result = dtype._update_dtype(new_dtype)
151-
tm.assert_index_equal(result.categories, expected_categories)
152-
assert result.ordered is expected_ordered
153-
154-
@pytest.mark.parametrize('bad_dtype', [
155-
'foo', object, np.int64, PeriodDtype('Q'), IntervalDtype(object)])
156-
def test_update_dtype_errors(self, bad_dtype):
157-
dtype = CategoricalDtype(list('abc'), False)
158-
msg = 'a CategoricalDtype must be passed to perform an update, '
159-
with tm.assert_raises_regex(ValueError, msg):
160-
dtype._update_dtype(bad_dtype)
161-
162131

163132
class TestDatetimeTZDtype(Base):
164133

@@ -568,39 +537,37 @@ def test_caching(self):
568537

569538
class TestCategoricalDtypeParametrized(object):
570539

571-
@pytest.mark.parametrize('categories, ordered', [
572-
(['a', 'b', 'c', 'd'], False),
573-
(['a', 'b', 'c', 'd'], True),
574-
(np.arange(1000), False),
575-
(np.arange(1000), True),
576-
(['a', 'b', 10, 2, 1.3, True], False),
577-
([True, False], True),
578-
([True, False], False),
579-
(pd.date_range('2017', periods=4), True),
580-
(pd.date_range('2017', periods=4), False),
581-
])
540+
@pytest.mark.parametrize('categories', [
541+
list('abcd'),
542+
np.arange(1000),
543+
['a', 'b', 10, 2, 1.3, True],
544+
[True, False],
545+
pd.date_range('2017', periods=4)])
582546
def test_basic(self, categories, ordered):
583547
c1 = CategoricalDtype(categories, ordered=ordered)
584548
tm.assert_index_equal(c1.categories, pd.Index(categories))
585549
assert c1.ordered is ordered
586550

587551
def test_order_matters(self):
588552
categories = ['a', 'b']
589-
c1 = CategoricalDtype(categories, ordered=False)
590-
c2 = CategoricalDtype(categories, ordered=True)
553+
c1 = CategoricalDtype(categories, ordered=True)
554+
c2 = CategoricalDtype(categories, ordered=False)
555+
c3 = CategoricalDtype(categories, ordered=None)
591556
assert c1 is not c2
557+
assert c1 is not c3
592558

593-
def test_unordered_same(self):
594-
c1 = CategoricalDtype(['a', 'b'])
595-
c2 = CategoricalDtype(['b', 'a'])
559+
@pytest.mark.parametrize('ordered', [False, None])
560+
def test_unordered_same(self, ordered):
561+
c1 = CategoricalDtype(['a', 'b'], ordered=ordered)
562+
c2 = CategoricalDtype(['b', 'a'], ordered=ordered)
596563
assert hash(c1) == hash(c2)
597564

598565
def test_categories(self):
599566
result = CategoricalDtype(['a', 'b', 'c'])
600567
tm.assert_index_equal(result.categories, pd.Index(['a', 'b', 'c']))
601-
assert result.ordered is False
568+
assert result.ordered is None
602569

603-
def test_equal_but_different(self):
570+
def test_equal_but_different(self, ordered):
604571
c1 = CategoricalDtype([1, 2, 3])
605572
c2 = CategoricalDtype([1., 2., 3.])
606573
assert c1 is not c2
@@ -611,9 +578,11 @@ def test_equal_but_different(self):
611578
([1, 2, 3], [3, 2, 1]),
612579
])
613580
def test_order_hashes_different(self, v1, v2):
614-
c1 = CategoricalDtype(v1)
581+
c1 = CategoricalDtype(v1, ordered=False)
615582
c2 = CategoricalDtype(v2, ordered=True)
583+
c3 = CategoricalDtype(v1, ordered=None)
616584
assert c1 is not c2
585+
assert c1 is not c3
617586

618587
def test_nan_invalid(self):
619588
with pytest.raises(ValueError):
@@ -628,26 +597,35 @@ def test_same_categories_different_order(self):
628597
c2 = CategoricalDtype(['b', 'a'], ordered=True)
629598
assert c1 is not c2
630599

631-
@pytest.mark.parametrize('ordered, other, expected', [
632-
(True, CategoricalDtype(['a', 'b'], True), True),
633-
(False, CategoricalDtype(['a', 'b'], False), True),
634-
(True, CategoricalDtype(['a', 'b'], False), False),
635-
(False, CategoricalDtype(['a', 'b'], True), False),
636-
(True, CategoricalDtype([1, 2], False), False),
637-
(False, CategoricalDtype([1, 2], True), False),
638-
(False, CategoricalDtype(None, True), True),
639-
(True, CategoricalDtype(None, True), True),
640-
(False, CategoricalDtype(None, False), True),
641-
(True, CategoricalDtype(None, False), True),
642-
(True, 'category', True),
643-
(False, 'category', True),
644-
(True, 'not a category', False),
645-
(False, 'not a category', False),
646-
])
647-
def test_categorical_equality(self, ordered, other, expected):
648-
c1 = CategoricalDtype(['a', 'b'], ordered)
600+
@pytest.mark.parametrize('ordered1', [True, False, None])
601+
@pytest.mark.parametrize('ordered2', [True, False, None])
602+
def test_categorical_equality(self, ordered1, ordered2):
603+
# same categories
604+
c1 = CategoricalDtype(list('abc'), ordered1)
605+
c2 = CategoricalDtype(list('abc'), ordered2)
606+
result = c1 == c2
607+
expected = (ordered1 is ordered2) or not any([ordered1, ordered2])
608+
assert result is expected
609+
610+
# different categories
611+
c2 = CategoricalDtype([1, 2, 3], ordered2)
612+
assert c1 != c2
613+
614+
# none categories
615+
c1 = CategoricalDtype(list('abc'), ordered1)
616+
c2 = CategoricalDtype(None, ordered2)
617+
c3 = CategoricalDtype(None, ordered1)
618+
assert c1 == c2
619+
assert c2 == c1
620+
assert c2 == c3
621+
622+
@pytest.mark.parametrize('categories', [list('abc'), None])
623+
@pytest.mark.parametrize('other', ['category', 'not a category'])
624+
def test_categorical_equality_strings(self, categories, ordered, other):
625+
c1 = CategoricalDtype(categories, ordered)
649626
result = c1 == other
650-
assert result == expected
627+
expected = other == 'category'
628+
assert result is expected
651629

652630
def test_invalid_raises(self):
653631
with tm.assert_raises_regex(TypeError, 'ordered'):
@@ -688,16 +666,51 @@ def test_from_categorical_dtype_both(self):
688666
c1, categories=[1, 2], ordered=False)
689667
assert result == CategoricalDtype([1, 2], ordered=False)
690668

691-
def test_str_vs_repr(self):
692-
c1 = CategoricalDtype(['a', 'b'])
669+
def test_str_vs_repr(self, ordered):
670+
c1 = CategoricalDtype(['a', 'b'], ordered=ordered)
693671
assert str(c1) == 'category'
694672
# Py2 will have unicode prefixes
695-
pat = r"CategoricalDtype\(categories=\[.*\], ordered=False\)"
696-
assert re.match(pat, repr(c1))
673+
pat = r"CategoricalDtype\(categories=\[.*\], ordered={ordered}\)"
674+
assert re.match(pat.format(ordered=ordered), repr(c1))
697675

698676
def test_categorical_categories(self):
699677
# GH17884
700678
c1 = CategoricalDtype(Categorical(['a', 'b']))
701679
tm.assert_index_equal(c1.categories, pd.Index(['a', 'b']))
702680
c1 = CategoricalDtype(CategoricalIndex(['a', 'b']))
703681
tm.assert_index_equal(c1.categories, pd.Index(['a', 'b']))
682+
683+
@pytest.mark.parametrize('new_categories', [
684+
list('abc'), list('cba'), list('wxyz'), None])
685+
@pytest.mark.parametrize('new_ordered', [True, False, None])
686+
def test_update_dtype(self, ordered, new_categories, new_ordered):
687+
dtype = CategoricalDtype(list('abc'), ordered)
688+
new_dtype = CategoricalDtype(new_categories, new_ordered)
689+
690+
expected_categories = new_dtype.categories
691+
if expected_categories is None:
692+
expected_categories = dtype.categories
693+
694+
expected_ordered = new_dtype.ordered
695+
if expected_ordered is None:
696+
expected_ordered = dtype.ordered
697+
698+
result = dtype._update_dtype(new_dtype)
699+
tm.assert_index_equal(result.categories, expected_categories)
700+
assert result.ordered is expected_ordered
701+
702+
def test_update_dtype_string(self, ordered):
703+
dtype = CategoricalDtype(list('abc'), ordered)
704+
expected_categories = dtype.categories
705+
expected_ordered = dtype.ordered
706+
result = dtype._update_dtype('category')
707+
tm.assert_index_equal(result.categories, expected_categories)
708+
assert result.ordered is expected_ordered
709+
710+
@pytest.mark.parametrize('bad_dtype', [
711+
'foo', object, np.int64, PeriodDtype('Q'), IntervalDtype(object)])
712+
def test_update_dtype_errors(self, bad_dtype):
713+
dtype = CategoricalDtype(list('abc'), False)
714+
msg = 'a CategoricalDtype must be passed to perform an update, '
715+
with tm.assert_raises_regex(ValueError, msg):
716+
dtype._update_dtype(bad_dtype)

0 commit comments

Comments
 (0)