Skip to content

Commit 71f5b2a

Browse files
committed
API: Allow ordered=None in CategoricalDtype
1 parent 6485a36 commit 71f5b2a

File tree

4 files changed

+109
-91
lines changed

4 files changed

+109
-91
lines changed

doc/source/whatsnew/v0.23.0.txt

+1
Original file line numberDiff line numberDiff line change
@@ -507,6 +507,7 @@ Other API Changes
507507
- Set operations (union, difference...) on :class:`IntervalIndex` with incompatible index types will now raise a ``TypeError`` rather than a ``ValueError`` (:issue:`19329`)
508508
- :class:`DateOffset` objects render more simply, e.g. "<DateOffset: days=1>" instead of "<DateOffset: kwds={'days': 1}>" (:issue:`19403`)
509509
- :func:`pandas.merge` provides a more informative error message when trying to merge on timezone-aware and timezone-naive columns (:issue:`15800`)
510+
- The default value of the ``ordered`` parameter for :class:`~pandas.api.types.CategoricalDtype` has changed from ``False`` to ``None`` to allow updating of ``categories`` without impacting ``ordered``. Behavior should remain consistent for downstream objects, such as :class:`Categorical` (:issue:`18790`)
510511

511512
.. _whatsnew_0230.deprecations:
512513

pandas/core/arrays/categorical.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -243,7 +243,7 @@ class Categorical(ExtensionArray, PandasObject):
243243
# For comparisons, so that numpy uses our implementation of the compare
244244
# ops, which raise
245245
__array_priority__ = 1000
246-
_dtype = CategoricalDtype()
246+
_dtype = CategoricalDtype(ordered=False)
247247
_deprecations = frozenset(['labels'])
248248
_typ = 'categorical'
249249

@@ -294,7 +294,7 @@ def __init__(self, values, categories=None, ordered=None, dtype=None,
294294

295295
if fastpath:
296296
self._codes = coerce_indexer_dtype(values, categories)
297-
self._dtype = dtype
297+
self._dtype = self._dtype._update_dtype(dtype)
298298
return
299299

300300
# null_mask indicates missing values we want to exclude from inference.
@@ -358,7 +358,7 @@ def __init__(self, values, categories=None, ordered=None, dtype=None,
358358
full_codes[~null_mask] = codes
359359
codes = full_codes
360360

361-
self._dtype = dtype
361+
self._dtype = self._dtype._update_dtype(dtype)
362362
self._codes = coerce_indexer_dtype(codes, dtype.categories)
363363

364364
@property

pandas/core/dtypes/dtypes.py

+14-11
Original file line numberDiff line numberDiff line change
@@ -159,11 +159,11 @@ class CategoricalDtype(PandasExtensionDtype):
159159
_metadata = ['categories', 'ordered']
160160
_cache = {}
161161

162-
def __init__(self, categories=None, ordered=False):
162+
def __init__(self, categories=None, ordered=None):
163163
self._finalize(categories, ordered, fastpath=False)
164164

165165
@classmethod
166-
def _from_fastpath(cls, categories=None, ordered=False):
166+
def _from_fastpath(cls, categories=None, ordered=None):
167167
self = cls.__new__(cls)
168168
self._finalize(categories, ordered, fastpath=True)
169169
return self
@@ -180,9 +180,7 @@ def _from_categorical_dtype(cls, dtype, categories=None, ordered=None):
180180

181181
def _finalize(self, categories, ordered, fastpath=False):
182182

183-
if ordered is None:
184-
ordered = False
185-
else:
183+
if ordered is not None:
186184
self._validate_ordered(ordered)
187185

188186
if categories is not None:
@@ -220,10 +218,10 @@ def __eq__(self, other):
220218
# CDT(., .) = CDT(None, False) and *all*
221219
# CDT(., .) = CDT(None, True).
222220
return True
223-
elif self.ordered:
224-
return other.ordered and self.categories.equals(other.categories)
225-
elif other.ordered:
226-
return False
221+
elif self.ordered or other.ordered:
222+
# at least one ordered
223+
return ((self.ordered == other.ordered) and
224+
self.categories.equals(other.categories))
227225
else:
228226
# neither is ordered (ordered is False or None on both sides); this could probably be optimized / cached
229227
return hash(self) == hash(other)
@@ -361,11 +359,16 @@ def _update_dtype(self, dtype):
361359
'got {dtype!r}').format(dtype=dtype)
362360
raise ValueError(msg)
363361

364-
# dtype is CDT: keep current categories if None (ordered can't be None)
362+
# dtype is CDT: categories and ordered each fall back to the current value when the new dtype leaves them as None
365363
new_categories = dtype.categories
366364
if new_categories is None:
367365
new_categories = self.categories
368-
return CategoricalDtype(new_categories, dtype.ordered)
366+
367+
new_ordered = dtype.ordered
368+
if new_ordered is None:
369+
new_ordered = self.ordered
370+
371+
return CategoricalDtype(new_categories, new_ordered)
369372

370373
@property
371374
def categories(self):

pandas/tests/dtypes/test_dtypes.py

+91-77
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,11 @@
2424
import pandas.util.testing as tm
2525

2626

27+
@pytest.fixture(params=[True, False, None])
28+
def ordered(request):
29+
return request.param
30+
31+
2732
class Base(object):
2833

2934
def setup_method(self, method):
@@ -124,41 +129,6 @@ def test_tuple_categories(self):
124129
result = CategoricalDtype(categories)
125130
assert all(result.categories == categories)
126131

127-
@pytest.mark.parametrize('dtype', [
128-
CategoricalDtype(list('abc'), False),
129-
CategoricalDtype(list('abc'), True)])
130-
@pytest.mark.parametrize('new_dtype', [
131-
'category',
132-
CategoricalDtype(None, False),
133-
CategoricalDtype(None, True),
134-
CategoricalDtype(list('abc'), False),
135-
CategoricalDtype(list('abc'), True),
136-
CategoricalDtype(list('cba'), False),
137-
CategoricalDtype(list('cba'), True),
138-
CategoricalDtype(list('wxyz'), False),
139-
CategoricalDtype(list('wxyz'), True)])
140-
def test_update_dtype(self, dtype, new_dtype):
141-
if isinstance(new_dtype, string_types) and new_dtype == 'category':
142-
expected_categories = dtype.categories
143-
expected_ordered = dtype.ordered
144-
else:
145-
expected_categories = new_dtype.categories
146-
if expected_categories is None:
147-
expected_categories = dtype.categories
148-
expected_ordered = new_dtype.ordered
149-
150-
result = dtype._update_dtype(new_dtype)
151-
tm.assert_index_equal(result.categories, expected_categories)
152-
assert result.ordered is expected_ordered
153-
154-
@pytest.mark.parametrize('bad_dtype', [
155-
'foo', object, np.int64, PeriodDtype('Q')])
156-
def test_update_dtype_errors(self, bad_dtype):
157-
dtype = CategoricalDtype(list('abc'), False)
158-
msg = 'a CategoricalDtype must be passed to perform an update, '
159-
with tm.assert_raises_regex(ValueError, msg):
160-
dtype._update_dtype(bad_dtype)
161-
162132

163133
class TestDatetimeTZDtype(Base):
164134

@@ -609,39 +579,37 @@ def test_caching(self):
609579

610580
class TestCategoricalDtypeParametrized(object):
611581

612-
@pytest.mark.parametrize('categories, ordered', [
613-
(['a', 'b', 'c', 'd'], False),
614-
(['a', 'b', 'c', 'd'], True),
615-
(np.arange(1000), False),
616-
(np.arange(1000), True),
617-
(['a', 'b', 10, 2, 1.3, True], False),
618-
([True, False], True),
619-
([True, False], False),
620-
(pd.date_range('2017', periods=4), True),
621-
(pd.date_range('2017', periods=4), False),
622-
])
582+
@pytest.mark.parametrize('categories', [
583+
list('abcd'),
584+
np.arange(1000),
585+
['a', 'b', 10, 2, 1.3, True],
586+
[True, False],
587+
pd.date_range('2017', periods=4)])
623588
def test_basic(self, categories, ordered):
624589
c1 = CategoricalDtype(categories, ordered=ordered)
625590
tm.assert_index_equal(c1.categories, pd.Index(categories))
626591
assert c1.ordered is ordered
627592

628593
def test_order_matters(self):
629594
categories = ['a', 'b']
630-
c1 = CategoricalDtype(categories, ordered=False)
631-
c2 = CategoricalDtype(categories, ordered=True)
595+
c1 = CategoricalDtype(categories, ordered=True)
596+
c2 = CategoricalDtype(categories, ordered=False)
597+
c3 = CategoricalDtype(categories, ordered=None)
632598
assert c1 is not c2
599+
assert c1 is not c3
633600

634-
def test_unordered_same(self):
635-
c1 = CategoricalDtype(['a', 'b'])
636-
c2 = CategoricalDtype(['b', 'a'])
601+
@pytest.mark.parametrize('ordered', [False, None])
602+
def test_unordered_same(self, ordered):
603+
c1 = CategoricalDtype(['a', 'b'], ordered=ordered)
604+
c2 = CategoricalDtype(['b', 'a'], ordered=ordered)
637605
assert hash(c1) == hash(c2)
638606

639607
def test_categories(self):
640608
result = CategoricalDtype(['a', 'b', 'c'])
641609
tm.assert_index_equal(result.categories, pd.Index(['a', 'b', 'c']))
642-
assert result.ordered is False
610+
assert result.ordered is None
643611

644-
def test_equal_but_different(self):
612+
def test_equal_but_different(self, ordered):
645613
c1 = CategoricalDtype([1, 2, 3])
646614
c2 = CategoricalDtype([1., 2., 3.])
647615
assert c1 is not c2
@@ -652,9 +620,11 @@ def test_equal_but_different(self):
652620
([1, 2, 3], [3, 2, 1]),
653621
])
654622
def test_order_hashes_different(self, v1, v2):
655-
c1 = CategoricalDtype(v1)
623+
c1 = CategoricalDtype(v1, ordered=False)
656624
c2 = CategoricalDtype(v2, ordered=True)
625+
c3 = CategoricalDtype(v1, ordered=None)
657626
assert c1 is not c2
627+
assert c1 is not c3
658628

659629
def test_nan_invalid(self):
660630
with pytest.raises(ValueError):
@@ -669,26 +639,35 @@ def test_same_categories_different_order(self):
669639
c2 = CategoricalDtype(['b', 'a'], ordered=True)
670640
assert c1 is not c2
671641

672-
@pytest.mark.parametrize('ordered, other, expected', [
673-
(True, CategoricalDtype(['a', 'b'], True), True),
674-
(False, CategoricalDtype(['a', 'b'], False), True),
675-
(True, CategoricalDtype(['a', 'b'], False), False),
676-
(False, CategoricalDtype(['a', 'b'], True), False),
677-
(True, CategoricalDtype([1, 2], False), False),
678-
(False, CategoricalDtype([1, 2], True), False),
679-
(False, CategoricalDtype(None, True), True),
680-
(True, CategoricalDtype(None, True), True),
681-
(False, CategoricalDtype(None, False), True),
682-
(True, CategoricalDtype(None, False), True),
683-
(True, 'category', True),
684-
(False, 'category', True),
685-
(True, 'not a category', False),
686-
(False, 'not a category', False),
687-
])
688-
def test_categorical_equality(self, ordered, other, expected):
689-
c1 = CategoricalDtype(['a', 'b'], ordered)
642+
@pytest.mark.parametrize('ordered1', [True, False, None])
643+
@pytest.mark.parametrize('ordered2', [True, False, None])
644+
def test_categorical_equality(self, ordered1, ordered2):
645+
# same categories
646+
c1 = CategoricalDtype(list('abc'), ordered1)
647+
c2 = CategoricalDtype(list('abc'), ordered2)
648+
result = c1 == c2
649+
expected = (ordered1 is ordered2) or not any([ordered1, ordered2])
650+
assert result is expected
651+
652+
# different categories
653+
c2 = CategoricalDtype([1, 2, 3], ordered2)
654+
assert c1 != c2
655+
656+
# none categories
657+
c1 = CategoricalDtype(list('abc'), ordered1)
658+
c2 = CategoricalDtype(None, ordered2)
659+
c3 = CategoricalDtype(None, ordered1)
660+
assert c1 == c2
661+
assert c2 == c1
662+
assert c2 == c3
663+
664+
@pytest.mark.parametrize('categories', [list('abc'), None])
665+
@pytest.mark.parametrize('other', ['category', 'not a category'])
666+
def test_categorical_equality_strings(self, categories, ordered, other):
667+
c1 = CategoricalDtype(categories, ordered)
690668
result = c1 == other
691-
assert result == expected
669+
expected = other == 'category'
670+
assert result is expected
692671

693672
def test_invalid_raises(self):
694673
with tm.assert_raises_regex(TypeError, 'ordered'):
@@ -729,16 +708,51 @@ def test_from_categorical_dtype_both(self):
729708
c1, categories=[1, 2], ordered=False)
730709
assert result == CategoricalDtype([1, 2], ordered=False)
731710

732-
def test_str_vs_repr(self):
733-
c1 = CategoricalDtype(['a', 'b'])
711+
def test_str_vs_repr(self, ordered):
712+
c1 = CategoricalDtype(['a', 'b'], ordered=ordered)
734713
assert str(c1) == 'category'
735714
# Py2 will have unicode prefixes
736-
pat = r"CategoricalDtype\(categories=\[.*\], ordered=False\)"
737-
assert re.match(pat, repr(c1))
715+
pat = r"CategoricalDtype\(categories=\[.*\], ordered={ordered}\)"
716+
assert re.match(pat.format(ordered=ordered), repr(c1))
738717

739718
def test_categorical_categories(self):
740719
# GH17884
741720
c1 = CategoricalDtype(Categorical(['a', 'b']))
742721
tm.assert_index_equal(c1.categories, pd.Index(['a', 'b']))
743722
c1 = CategoricalDtype(CategoricalIndex(['a', 'b']))
744723
tm.assert_index_equal(c1.categories, pd.Index(['a', 'b']))
724+
725+
@pytest.mark.parametrize('new_categories', [
726+
list('abc'), list('cba'), list('wxyz'), None])
727+
@pytest.mark.parametrize('new_ordered', [True, False, None])
728+
def test_update_dtype(self, ordered, new_categories, new_ordered):
729+
dtype = CategoricalDtype(list('abc'), ordered)
730+
new_dtype = CategoricalDtype(new_categories, new_ordered)
731+
732+
expected_categories = new_dtype.categories
733+
if expected_categories is None:
734+
expected_categories = dtype.categories
735+
736+
expected_ordered = new_dtype.ordered
737+
if expected_ordered is None:
738+
expected_ordered = dtype.ordered
739+
740+
result = dtype._update_dtype(new_dtype)
741+
tm.assert_index_equal(result.categories, expected_categories)
742+
assert result.ordered is expected_ordered
743+
744+
def test_update_dtype_string(self, ordered):
745+
dtype = CategoricalDtype(list('abc'), ordered)
746+
expected_categories = dtype.categories
747+
expected_ordered = dtype.ordered
748+
result = dtype._update_dtype('category')
749+
tm.assert_index_equal(result.categories, expected_categories)
750+
assert result.ordered is expected_ordered
751+
752+
@pytest.mark.parametrize('bad_dtype', [
753+
'foo', object, np.int64, PeriodDtype('Q')])
754+
def test_update_dtype_errors(self, bad_dtype):
755+
dtype = CategoricalDtype(list('abc'), False)
756+
msg = 'a CategoricalDtype must be passed to perform an update, '
757+
with tm.assert_raises_regex(ValueError, msg):
758+
dtype._update_dtype(bad_dtype)

0 commit comments

Comments
 (0)