Skip to content

Commit 9b7af91

Browse files
committed
API: Allow ordered=None in CategoricalDtype
1 parent 983d71f commit 9b7af91

File tree

4 files changed

+109
-91
lines changed

4 files changed

+109
-91
lines changed

doc/source/whatsnew/v0.23.0.txt

+1
Original file line numberDiff line numberDiff line change
@@ -438,6 +438,7 @@ Other API Changes
438438
- Set operations (union, difference...) on :class:`IntervalIndex` with incompatible index types will now raise a ``TypeError`` rather than a ``ValueError`` (:issue:`19329`)
439439
- :class:`DateOffset` objects render more simply, e.g. "<DateOffset: days=1>" instead of "<DateOffset: kwds={'days': 1}>" (:issue:`19403`)
440440
- :func:`pandas.merge` provides a more informative error message when trying to merge on timezone-aware and timezone-naive columns (:issue:`15800`)
441+
- The default value of the ``ordered`` parameter for :class:`~pandas.api.types.CategoricalDtype` has changed from ``False`` to ``None``. Behavior should remain consistent for downstream objects, such as :class:`Categorical` (:issue:`18790`)
441442

442443
.. _whatsnew_0230.deprecations:
443444

pandas/core/arrays/categorical.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -243,7 +243,7 @@ class Categorical(ExtensionArray, PandasObject):
243243
# For comparisons, so that numpy uses our implementation of the compare
244244
# ops, which raise
245245
__array_priority__ = 1000
246-
_dtype = CategoricalDtype()
246+
_dtype = CategoricalDtype(ordered=False)
247247
_deprecations = frozenset(['labels'])
248248
_typ = 'categorical'
249249

@@ -294,7 +294,7 @@ def __init__(self, values, categories=None, ordered=None, dtype=None,
294294

295295
if fastpath:
296296
self._codes = coerce_indexer_dtype(values, categories)
297-
self._dtype = dtype
297+
self._dtype = self._dtype._update_dtype(dtype)
298298
return
299299

300300
# null_mask indicates missing values we want to exclude from inference.
@@ -358,7 +358,7 @@ def __init__(self, values, categories=None, ordered=None, dtype=None,
358358
full_codes[~null_mask] = codes
359359
codes = full_codes
360360

361-
self._dtype = dtype
361+
self._dtype = self._dtype._update_dtype(dtype)
362362
self._codes = coerce_indexer_dtype(codes, dtype.categories)
363363

364364
@property

pandas/core/dtypes/dtypes.py

+14-11
Original file line numberDiff line numberDiff line change
@@ -159,11 +159,11 @@ class CategoricalDtype(PandasExtensionDtype):
159159
_metadata = ['categories', 'ordered']
160160
_cache = {}
161161

162-
def __init__(self, categories=None, ordered=False):
162+
def __init__(self, categories=None, ordered=None):
163163
self._finalize(categories, ordered, fastpath=False)
164164

165165
@classmethod
166-
def _from_fastpath(cls, categories=None, ordered=False):
166+
def _from_fastpath(cls, categories=None, ordered=None):
167167
self = cls.__new__(cls)
168168
self._finalize(categories, ordered, fastpath=True)
169169
return self
@@ -180,9 +180,7 @@ def _from_categorical_dtype(cls, dtype, categories=None, ordered=None):
180180

181181
def _finalize(self, categories, ordered, fastpath=False):
182182

183-
if ordered is None:
184-
ordered = False
185-
else:
183+
if ordered is not None:
186184
self._validate_ordered(ordered)
187185

188186
if categories is not None:
@@ -220,10 +218,10 @@ def __eq__(self, other):
220218
# CDT(., .) = CDT(None, False) and *all*
221219
# CDT(., .) = CDT(None, True).
222220
return True
223-
elif self.ordered:
224-
return other.ordered and self.categories.equals(other.categories)
225-
elif other.ordered:
226-
return False
221+
elif self.ordered or other.ordered:
222+
# at least one ordered
223+
return ((self.ordered == other.ordered) and
224+
self.categories.equals(other.categories))
227225
else:
228226
# both unordered; this could probably be optimized / cached
229227
return hash(self) == hash(other)
@@ -361,11 +359,16 @@ def _update_dtype(self, dtype):
361359
'got {dtype!r}').format(dtype=dtype)
362360
raise ValueError(msg)
363361

364-
# dtype is CDT: keep current categories if None (ordered can't be None)
362+
# dtype is CDT: keep current categories/ordered if None
365363
new_categories = dtype.categories
366364
if new_categories is None:
367365
new_categories = self.categories
368-
return CategoricalDtype(new_categories, dtype.ordered)
366+
367+
new_ordered = dtype.ordered
368+
if new_ordered is None:
369+
new_ordered = self.ordered
370+
371+
return CategoricalDtype(new_categories, new_ordered)
369372

370373
@property
371374
def categories(self):

pandas/tests/dtypes/test_dtypes.py

+91-77
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,11 @@
2626
import pandas.util.testing as tm
2727

2828

29+
@pytest.fixture(params=[True, False, None])
30+
def ordered(request):
31+
return request.param
32+
33+
2934
class Base(object):
3035

3136
def setup_method(self, method):
@@ -126,41 +131,6 @@ def test_tuple_categories(self):
126131
result = CategoricalDtype(categories)
127132
assert all(result.categories == categories)
128133

129-
@pytest.mark.parametrize('dtype', [
130-
CategoricalDtype(list('abc'), False),
131-
CategoricalDtype(list('abc'), True)])
132-
@pytest.mark.parametrize('new_dtype', [
133-
'category',
134-
CategoricalDtype(None, False),
135-
CategoricalDtype(None, True),
136-
CategoricalDtype(list('abc'), False),
137-
CategoricalDtype(list('abc'), True),
138-
CategoricalDtype(list('cba'), False),
139-
CategoricalDtype(list('cba'), True),
140-
CategoricalDtype(list('wxyz'), False),
141-
CategoricalDtype(list('wxyz'), True)])
142-
def test_update_dtype(self, dtype, new_dtype):
143-
if isinstance(new_dtype, string_types) and new_dtype == 'category':
144-
expected_categories = dtype.categories
145-
expected_ordered = dtype.ordered
146-
else:
147-
expected_categories = new_dtype.categories
148-
if expected_categories is None:
149-
expected_categories = dtype.categories
150-
expected_ordered = new_dtype.ordered
151-
152-
result = dtype._update_dtype(new_dtype)
153-
tm.assert_index_equal(result.categories, expected_categories)
154-
assert result.ordered is expected_ordered
155-
156-
@pytest.mark.parametrize('bad_dtype', [
157-
'foo', object, np.int64, PeriodDtype('Q')])
158-
def test_update_dtype_errors(self, bad_dtype):
159-
dtype = CategoricalDtype(list('abc'), False)
160-
msg = 'a CategoricalDtype must be passed to perform an update, '
161-
with tm.assert_raises_regex(ValueError, msg):
162-
dtype._update_dtype(bad_dtype)
163-
164134

165135
class TestDatetimeTZDtype(Base):
166136

@@ -611,39 +581,37 @@ def test_caching(self):
611581

612582
class TestCategoricalDtypeParametrized(object):
613583

614-
@pytest.mark.parametrize('categories, ordered', [
615-
(['a', 'b', 'c', 'd'], False),
616-
(['a', 'b', 'c', 'd'], True),
617-
(np.arange(1000), False),
618-
(np.arange(1000), True),
619-
(['a', 'b', 10, 2, 1.3, True], False),
620-
([True, False], True),
621-
([True, False], False),
622-
(pd.date_range('2017', periods=4), True),
623-
(pd.date_range('2017', periods=4), False),
624-
])
584+
@pytest.mark.parametrize('categories', [
585+
list('abcd'),
586+
np.arange(1000),
587+
['a', 'b', 10, 2, 1.3, True],
588+
[True, False],
589+
pd.date_range('2017', periods=4)])
625590
def test_basic(self, categories, ordered):
626591
c1 = CategoricalDtype(categories, ordered=ordered)
627592
tm.assert_index_equal(c1.categories, pd.Index(categories))
628593
assert c1.ordered is ordered
629594

630595
def test_order_matters(self):
631596
categories = ['a', 'b']
632-
c1 = CategoricalDtype(categories, ordered=False)
633-
c2 = CategoricalDtype(categories, ordered=True)
597+
c1 = CategoricalDtype(categories, ordered=True)
598+
c2 = CategoricalDtype(categories, ordered=False)
599+
c3 = CategoricalDtype(categories, ordered=None)
634600
assert c1 is not c2
601+
assert c1 is not c3
635602

636-
def test_unordered_same(self):
637-
c1 = CategoricalDtype(['a', 'b'])
638-
c2 = CategoricalDtype(['b', 'a'])
603+
@pytest.mark.parametrize('ordered', [False, None])
604+
def test_unordered_same(self, ordered):
605+
c1 = CategoricalDtype(['a', 'b'], ordered=ordered)
606+
c2 = CategoricalDtype(['b', 'a'], ordered=ordered)
639607
assert hash(c1) == hash(c2)
640608

641609
def test_categories(self):
642610
result = CategoricalDtype(['a', 'b', 'c'])
643611
tm.assert_index_equal(result.categories, pd.Index(['a', 'b', 'c']))
644-
assert result.ordered is False
612+
assert result.ordered is None
645613

646-
def test_equal_but_different(self):
614+
def test_equal_but_different(self, ordered):
647615
c1 = CategoricalDtype([1, 2, 3])
648616
c2 = CategoricalDtype([1., 2., 3.])
649617
assert c1 is not c2
@@ -654,9 +622,11 @@ def test_equal_but_different(self):
654622
([1, 2, 3], [3, 2, 1]),
655623
])
656624
def test_order_hashes_different(self, v1, v2):
657-
c1 = CategoricalDtype(v1)
625+
c1 = CategoricalDtype(v1, ordered=False)
658626
c2 = CategoricalDtype(v2, ordered=True)
627+
c3 = CategoricalDtype(v1, ordered=None)
659628
assert c1 is not c2
629+
assert c1 is not c3
660630

661631
def test_nan_invalid(self):
662632
with pytest.raises(ValueError):
@@ -671,26 +641,35 @@ def test_same_categories_different_order(self):
671641
c2 = CategoricalDtype(['b', 'a'], ordered=True)
672642
assert c1 is not c2
673643

674-
@pytest.mark.parametrize('ordered, other, expected', [
675-
(True, CategoricalDtype(['a', 'b'], True), True),
676-
(False, CategoricalDtype(['a', 'b'], False), True),
677-
(True, CategoricalDtype(['a', 'b'], False), False),
678-
(False, CategoricalDtype(['a', 'b'], True), False),
679-
(True, CategoricalDtype([1, 2], False), False),
680-
(False, CategoricalDtype([1, 2], True), False),
681-
(False, CategoricalDtype(None, True), True),
682-
(True, CategoricalDtype(None, True), True),
683-
(False, CategoricalDtype(None, False), True),
684-
(True, CategoricalDtype(None, False), True),
685-
(True, 'category', True),
686-
(False, 'category', True),
687-
(True, 'not a category', False),
688-
(False, 'not a category', False),
689-
])
690-
def test_categorical_equality(self, ordered, other, expected):
691-
c1 = CategoricalDtype(['a', 'b'], ordered)
644+
@pytest.mark.parametrize('ordered1', [True, False, None])
645+
@pytest.mark.parametrize('ordered2', [True, False, None])
646+
def test_categorical_equality(self, ordered1, ordered2):
647+
# same categories
648+
c1 = CategoricalDtype(list('abc'), ordered1)
649+
c2 = CategoricalDtype(list('abc'), ordered2)
650+
result = c1 == c2
651+
expected = (ordered1 is ordered2) or not any([ordered1, ordered2])
652+
assert result is expected
653+
654+
# different categories
655+
c2 = CategoricalDtype([1, 2, 3], ordered2)
656+
assert c1 != c2
657+
658+
# none categories
659+
c1 = CategoricalDtype(list('abc'), ordered1)
660+
c2 = CategoricalDtype(None, ordered2)
661+
c3 = CategoricalDtype(None, ordered1)
662+
assert c1 == c2
663+
assert c2 == c1
664+
assert c2 == c3
665+
666+
@pytest.mark.parametrize('categories', [list('abc'), None])
667+
@pytest.mark.parametrize('other', ['category', 'not a category'])
668+
def test_categorical_equality_strings(self, categories, ordered, other):
669+
c1 = CategoricalDtype(categories, ordered)
692670
result = c1 == other
693-
assert result == expected
671+
expected = other == 'category'
672+
assert result is expected
694673

695674
def test_invalid_raises(self):
696675
with tm.assert_raises_regex(TypeError, 'ordered'):
@@ -731,12 +710,12 @@ def test_from_categorical_dtype_both(self):
731710
c1, categories=[1, 2], ordered=False)
732711
assert result == CategoricalDtype([1, 2], ordered=False)
733712

734-
def test_str_vs_repr(self):
735-
c1 = CategoricalDtype(['a', 'b'])
713+
def test_str_vs_repr(self, ordered):
714+
c1 = CategoricalDtype(['a', 'b'], ordered=ordered)
736715
assert str(c1) == 'category'
737716
# Py2 will have unicode prefixes
738-
pat = r"CategoricalDtype\(categories=\[.*\], ordered=False\)"
739-
assert re.match(pat, repr(c1))
717+
pat = r"CategoricalDtype\(categories=\[.*\], ordered={ordered}\)"
718+
assert re.match(pat.format(ordered=ordered), repr(c1))
740719

741720
def test_categorical_categories(self):
742721
# GH17884
@@ -745,6 +724,41 @@ def test_categorical_categories(self):
745724
c1 = CategoricalDtype(CategoricalIndex(['a', 'b']))
746725
tm.assert_index_equal(c1.categories, pd.Index(['a', 'b']))
747726

727+
@pytest.mark.parametrize('new_categories', [
728+
list('abc'), list('cba'), list('wxyz'), None])
729+
@pytest.mark.parametrize('new_ordered', [True, False, None])
730+
def test_update_dtype(self, ordered, new_categories, new_ordered):
731+
dtype = CategoricalDtype(list('abc'), ordered)
732+
new_dtype = CategoricalDtype(new_categories, new_ordered)
733+
734+
expected_categories = new_dtype.categories
735+
if expected_categories is None:
736+
expected_categories = dtype.categories
737+
738+
expected_ordered = new_dtype.ordered
739+
if expected_ordered is None:
740+
expected_ordered = dtype.ordered
741+
742+
result = dtype._update_dtype(new_dtype)
743+
tm.assert_index_equal(result.categories, expected_categories)
744+
assert result.ordered is expected_ordered
745+
746+
def test_update_dtype_string(self, ordered):
747+
dtype = CategoricalDtype(list('abc'), ordered)
748+
expected_categories = dtype.categories
749+
expected_ordered = dtype.ordered
750+
result = dtype._update_dtype('category')
751+
tm.assert_index_equal(result.categories, expected_categories)
752+
assert result.ordered is expected_ordered
753+
754+
@pytest.mark.parametrize('bad_dtype', [
755+
'foo', object, np.int64, PeriodDtype('Q')])
756+
def test_update_dtype_errors(self, bad_dtype):
757+
dtype = CategoricalDtype(list('abc'), False)
758+
msg = 'a CategoricalDtype must be passed to perform an update, '
759+
with tm.assert_raises_regex(ValueError, msg):
760+
dtype._update_dtype(bad_dtype)
761+
748762

749763
class DummyArray(ExtensionArray):
750764
pass

0 commit comments

Comments
 (0)