Skip to content

Commit 4dce349

Browse files
committed
Drop support for NaN categories in Categorical
Deprecated in 0.17.0. xref gh-10748, xref gh-13648
1 parent da92411 commit 4dce349

File tree

5 files changed

+31
-202
lines changed

5 files changed

+31
-202
lines changed

doc/source/categorical.rst

+9
Original file line numberDiff line numberDiff line change
@@ -230,6 +230,15 @@ Categories must be unique or a `ValueError` is raised:
230230
except ValueError as e:
231231
print("ValueError: " + str(e))
232232
233+
Categories also must not contain ``NaN``; otherwise a `ValueError` is raised:
234+
235+
.. ipython:: python
236+
237+
try:
238+
s.cat.categories = [1,2,np.nan]
239+
except ValueError as e:
240+
print("ValueError: " + str(e))
241+
233242
Appending new categories
234243
~~~~~~~~~~~~~~~~~~~~~~~~
235244

doc/source/whatsnew/v0.20.0.txt

+1
Original file line numberDiff line numberDiff line change
@@ -816,6 +816,7 @@ Removal of prior version deprecations/changes
816816
in favor of ``iloc`` and ``iat`` as explained :ref:`here <whatsnew_0170.deprecations>` (:issue:`10711`).
817817
- The deprecated ``DataFrame.iterkv()`` has been removed in favor of ``DataFrame.iteritems()`` (:issue:`10711`)
818818
- The ``Categorical`` constructor has dropped the ``name`` parameter (:issue:`10632`)
819+
- ``Categorical`` has dropped support for ``NaN`` categories (:issue:`10748`)
819820
- The ``take_last`` parameter has been dropped from ``duplicated()``, ``drop_duplicates()``, ``nlargest()``, and ``nsmallest()`` methods (:issue:`10236`, :issue:`10792`, :issue:`10920`)
820821
- ``Series``, ``Index``, and ``DataFrame`` have dropped the ``sort`` and ``order`` methods (:issue:`10726`)
821822
- Where clauses in ``pytables`` are only accepted as strings and expressions types and not other data-types (:issue:`12027`)

pandas/core/categorical.py

+3-10
Original file line numberDiff line numberDiff line change
@@ -545,18 +545,11 @@ def _validate_categories(cls, categories, fastpath=False):
545545

546546
if not fastpath:
547547

548-
# check properties of the categories
549-
# we don't allow NaNs in the categories themselves
550-
548+
# Categories cannot contain NaN.
551549
if categories.hasnans:
552-
# NaNs in cats deprecated in 0.17
553-
# GH 10748
554-
msg = ('\nSetting NaNs in `categories` is deprecated and '
555-
'will be removed in a future version of pandas.')
556-
warn(msg, FutureWarning, stacklevel=3)
557-
558-
# categories must be unique
550+
raise ValueError('Categorial categories cannot be NaN')
559551

552+
# Categories must be unique.
560553
if not categories.is_unique:
561554
raise ValueError('Categorical categories must be unique')
562555

pandas/tests/indexes/test_category.py

-14
Original file line numberDiff line numberDiff line change
@@ -183,11 +183,6 @@ def test_contains(self):
183183
self.assertFalse(0 in ci)
184184
self.assertFalse(1 in ci)
185185

186-
with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
187-
ci = CategoricalIndex(
188-
list('aabbca'), categories=list('cabdef') + [np.nan])
189-
self.assertFalse(np.nan in ci)
190-
191186
ci = CategoricalIndex(
192187
list('aabbca') + [np.nan], categories=list('cabdef'))
193188
self.assertTrue(np.nan in ci)
@@ -541,7 +536,6 @@ def test_ensure_copied_data(self):
541536
self.assertIs(_base(index.values), _base(result.values))
542537

543538
def test_equals_categorical(self):
544-
545539
ci1 = CategoricalIndex(['a', 'b'], categories=['a', 'b'], ordered=True)
546540
ci2 = CategoricalIndex(['a', 'b'], categories=['a', 'b', 'c'],
547541
ordered=True)
@@ -579,14 +573,6 @@ def test_equals_categorical(self):
579573
self.assertFalse(ci.equals(CategoricalIndex(list('aabca'))))
580574
self.assertTrue(ci.equals(ci.copy()))
581575

582-
with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
583-
ci = CategoricalIndex(list('aabca'),
584-
categories=['c', 'a', 'b', np.nan])
585-
self.assertFalse(ci.equals(list('aabca')))
586-
self.assertFalse(ci.equals(CategoricalIndex(list('aabca'))))
587-
with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
588-
self.assertTrue(ci.equals(ci.copy()))
589-
590576
ci = CategoricalIndex(list('aabca') + [np.nan],
591577
categories=['c', 'a', 'b'])
592578
self.assertFalse(ci.equals(list('aabca')))

pandas/tests/test_categorical.py

+18-178
Original file line numberDiff line numberDiff line change
@@ -160,12 +160,6 @@ def f():
160160

161161
self.assertRaises(ValueError, f)
162162

163-
def f():
164-
with tm.assert_produces_warning(FutureWarning):
165-
Categorical([1, 2], [1, 2, np.nan, np.nan])
166-
167-
self.assertRaises(ValueError, f)
168-
169163
# The default should be unordered
170164
c1 = Categorical(["a", "b", "c", "a"])
171165
self.assertFalse(c1.ordered)
@@ -222,29 +216,21 @@ def f():
222216
cat = pd.Categorical([np.nan, 1., 2., 3.])
223217
self.assertTrue(is_float_dtype(cat.categories))
224218

225-
# Deprecating NaNs in categoires (GH #10748)
226-
# preserve int as far as possible by converting to object if NaN is in
227-
# categories
228-
with tm.assert_produces_warning(FutureWarning):
229-
cat = pd.Categorical([np.nan, 1, 2, 3],
230-
categories=[np.nan, 1, 2, 3])
231-
self.assertTrue(is_object_dtype(cat.categories))
232-
233219
# This doesn't work -> this would probably need some kind of "remember
234220
# the original type" feature to try to cast the array interface result
235221
# to...
236222

237223
# vals = np.asarray(cat[cat.notnull()])
238224
# self.assertTrue(is_integer_dtype(vals))
239-
with tm.assert_produces_warning(FutureWarning):
240-
cat = pd.Categorical([np.nan, "a", "b", "c"],
241-
categories=[np.nan, "a", "b", "c"])
242-
self.assertTrue(is_object_dtype(cat.categories))
243-
# but don't do it for floats
244-
with tm.assert_produces_warning(FutureWarning):
245-
cat = pd.Categorical([np.nan, 1., 2., 3.],
246-
categories=[np.nan, 1., 2., 3.])
247-
self.assertTrue(is_float_dtype(cat.categories))
225+
226+
# Cannot have NaN in categories
227+
def f(null_value):
228+
pd.Categorical([null_value, "a", "b", "c"],
229+
categories=[null_value, "a", "b", "c"])
230+
231+
self.assertRaises(ValueError, f, np.nan)
232+
self.assertRaises(ValueError, f, pd.NaT)
233+
self.assertRaises(ValueError, f, None)
248234

249235
# corner cases
250236
cat = pd.Categorical([1])
@@ -418,6 +404,12 @@ def f():
418404

419405
self.assertRaises(ValueError, f)
420406

407+
# NaN categories included
408+
def f():
409+
Categorical.from_codes([0, 1, 2], ["a", "b", np.nan])
410+
411+
self.assertRaises(ValueError, f)
412+
421413
# too negative
422414
def f():
423415
Categorical.from_codes([-2, 1, 2], ["a", "b", "c"])
@@ -649,30 +641,6 @@ def test_describe(self):
649641
name='categories'))
650642
tm.assert_frame_equal(desc, expected)
651643

652-
# NA as a category
653-
with tm.assert_produces_warning(FutureWarning):
654-
cat = pd.Categorical(["a", "c", "c", np.nan],
655-
categories=["b", "a", "c", np.nan])
656-
result = cat.describe()
657-
658-
expected = DataFrame([[0, 0], [1, 0.25], [2, 0.5], [1, 0.25]],
659-
columns=['counts', 'freqs'],
660-
index=pd.CategoricalIndex(['b', 'a', 'c', np.nan],
661-
name='categories'))
662-
tm.assert_frame_equal(result, expected, check_categorical=False)
663-
664-
# NA as an unused category
665-
with tm.assert_produces_warning(FutureWarning):
666-
cat = pd.Categorical(["a", "c", "c"],
667-
categories=["b", "a", "c", np.nan])
668-
result = cat.describe()
669-
670-
exp_idx = pd.CategoricalIndex(
671-
['b', 'a', 'c', np.nan], name='categories')
672-
expected = DataFrame([[0, 0], [1, 1 / 3.], [2, 2 / 3.], [0, 0]],
673-
columns=['counts', 'freqs'], index=exp_idx)
674-
tm.assert_frame_equal(result, expected, check_categorical=False)
675-
676644
def test_print(self):
677645
expected = ["[a, b, b, a, a, c, c, c]",
678646
"Categories (3, object): [a < b < c]"]
@@ -1119,90 +1087,18 @@ def test_nan_handling(self):
11191087
self.assert_numpy_array_equal(c._codes,
11201088
np.array([0, -1, -1, 0], dtype=np.int8))
11211089

1122-
# If categories have nan included, the code should point to that
1123-
# instead
1124-
with tm.assert_produces_warning(FutureWarning):
1125-
c = Categorical(["a", "b", np.nan, "a"],
1126-
categories=["a", "b", np.nan])
1127-
self.assert_index_equal(c.categories, Index(["a", "b", np.nan]))
1128-
self.assert_numpy_array_equal(c._codes,
1129-
np.array([0, 1, 2, 0], dtype=np.int8))
1130-
c[1] = np.nan
1131-
self.assert_index_equal(c.categories, Index(["a", "b", np.nan]))
1132-
self.assert_numpy_array_equal(c._codes,
1133-
np.array([0, 2, 2, 0], dtype=np.int8))
1134-
1135-
# Changing categories should also make the replaced category np.nan
1136-
c = Categorical(["a", "b", "c", "a"])
1137-
with tm.assert_produces_warning(FutureWarning):
1138-
c.categories = ["a", "b", np.nan] # noqa
1139-
1140-
self.assert_index_equal(c.categories, Index(["a", "b", np.nan]))
1141-
self.assert_numpy_array_equal(c._codes,
1142-
np.array([0, 1, 2, 0], dtype=np.int8))
1143-
11441090
# NaN values are never added to the categories; they are represented
11451091
# by the code -1.
11461092
c = Categorical(["a", "b", np.nan, "a"])
11471093
self.assert_index_equal(c.categories, Index(["a", "b"]))
11481094
self.assert_numpy_array_equal(c._codes,
11491095
np.array([0, 1, -1, 0], dtype=np.int8))
1150-
with tm.assert_produces_warning(FutureWarning):
1151-
c.set_categories(["a", "b", np.nan], rename=True, inplace=True)
1152-
1153-
self.assert_index_equal(c.categories, Index(["a", "b", np.nan]))
1154-
self.assert_numpy_array_equal(c._codes,
1155-
np.array([0, 1, -1, 0], dtype=np.int8))
1156-
c[1] = np.nan
1157-
self.assert_index_equal(c.categories, Index(["a", "b", np.nan]))
1158-
self.assert_numpy_array_equal(c._codes,
1159-
np.array([0, 2, -1, 0], dtype=np.int8))
1160-
1161-
# Remove null categories (GH 10156)
1162-
cases = [([1.0, 2.0, np.nan], [1.0, 2.0]),
1163-
(['a', 'b', None], ['a', 'b']),
1164-
([pd.Timestamp('2012-05-01'), pd.NaT],
1165-
[pd.Timestamp('2012-05-01')])]
1166-
1167-
null_values = [np.nan, None, pd.NaT]
1168-
1169-
for with_null, without in cases:
1170-
with tm.assert_produces_warning(FutureWarning):
1171-
base = Categorical([], with_null)
1172-
expected = Categorical([], without)
1173-
1174-
for nullval in null_values:
1175-
result = base.remove_categories(nullval)
1176-
self.assert_categorical_equal(result, expected)
1177-
1178-
# Different null values are indistinguishable
1179-
for i, j in [(0, 1), (0, 2), (1, 2)]:
1180-
nulls = [null_values[i], null_values[j]]
1181-
1182-
def f():
1183-
with tm.assert_produces_warning(FutureWarning):
1184-
Categorical([], categories=nulls)
1185-
1186-
self.assertRaises(ValueError, f)
11871096

11881097
def test_isnull(self):
11891098
exp = np.array([False, False, True])
11901099
c = Categorical(["a", "b", np.nan])
11911100
res = c.isnull()
1192-
self.assert_numpy_array_equal(res, exp)
1193-
1194-
with tm.assert_produces_warning(FutureWarning):
1195-
c = Categorical(["a", "b", np.nan], categories=["a", "b", np.nan])
1196-
res = c.isnull()
1197-
self.assert_numpy_array_equal(res, exp)
11981101

1199-
# test both nan in categories and as -1
1200-
exp = np.array([True, False, True])
1201-
c = Categorical(["a", "b", np.nan])
1202-
with tm.assert_produces_warning(FutureWarning):
1203-
c.set_categories(["a", "b", np.nan], rename=True, inplace=True)
1204-
c[0] = np.nan
1205-
res = c.isnull()
12061102
self.assert_numpy_array_equal(res, exp)
12071103

12081104
def test_codes_immutable(self):
@@ -1487,45 +1383,10 @@ def test_slicing_directly(self):
14871383

14881384
def test_set_item_nan(self):
14891385
cat = pd.Categorical([1, 2, 3])
1490-
exp = pd.Categorical([1, np.nan, 3], categories=[1, 2, 3])
14911386
cat[1] = np.nan
1492-
tm.assert_categorical_equal(cat, exp)
14931387

1494-
# if nan in categories, the proper code should be set!
1495-
cat = pd.Categorical([1, 2, 3, np.nan], categories=[1, 2, 3])
1496-
with tm.assert_produces_warning(FutureWarning):
1497-
cat.set_categories([1, 2, 3, np.nan], rename=True, inplace=True)
1498-
cat[1] = np.nan
1499-
exp = np.array([0, 3, 2, -1], dtype=np.int8)
1500-
self.assert_numpy_array_equal(cat.codes, exp)
1501-
1502-
cat = pd.Categorical([1, 2, 3, np.nan], categories=[1, 2, 3])
1503-
with tm.assert_produces_warning(FutureWarning):
1504-
cat.set_categories([1, 2, 3, np.nan], rename=True, inplace=True)
1505-
cat[1:3] = np.nan
1506-
exp = np.array([0, 3, 3, -1], dtype=np.int8)
1507-
self.assert_numpy_array_equal(cat.codes, exp)
1508-
1509-
cat = pd.Categorical([1, 2, 3, np.nan], categories=[1, 2, 3])
1510-
with tm.assert_produces_warning(FutureWarning):
1511-
cat.set_categories([1, 2, 3, np.nan], rename=True, inplace=True)
1512-
cat[1:3] = [np.nan, 1]
1513-
exp = np.array([0, 3, 0, -1], dtype=np.int8)
1514-
self.assert_numpy_array_equal(cat.codes, exp)
1515-
1516-
cat = pd.Categorical([1, 2, 3, np.nan], categories=[1, 2, 3])
1517-
with tm.assert_produces_warning(FutureWarning):
1518-
cat.set_categories([1, 2, 3, np.nan], rename=True, inplace=True)
1519-
cat[1:3] = [np.nan, np.nan]
1520-
exp = np.array([0, 3, 3, -1], dtype=np.int8)
1521-
self.assert_numpy_array_equal(cat.codes, exp)
1522-
1523-
cat = pd.Categorical([1, 2, np.nan, 3], categories=[1, 2, 3])
1524-
with tm.assert_produces_warning(FutureWarning):
1525-
cat.set_categories([1, 2, 3, np.nan], rename=True, inplace=True)
1526-
cat[pd.isnull(cat)] = np.nan
1527-
exp = np.array([0, 1, 3, 2], dtype=np.int8)
1528-
self.assert_numpy_array_equal(cat.codes, exp)
1388+
exp = pd.Categorical([1, np.nan, 3], categories=[1, 2, 3])
1389+
tm.assert_categorical_equal(cat, exp)
15291390

15301391
def test_shift(self):
15311392
# GH 9416
@@ -2026,33 +1887,12 @@ def test_sideeffects_free(self):
20261887

20271888
def test_nan_handling(self):
20281889

2029-
# Nans are represented as -1 in labels
1890+
# NaNs are represented as -1 in labels
20301891
s = Series(Categorical(["a", "b", np.nan, "a"]))
20311892
self.assert_index_equal(s.cat.categories, Index(["a", "b"]))
20321893
self.assert_numpy_array_equal(s.values.codes,
20331894
np.array([0, 1, -1, 0], dtype=np.int8))
20341895

2035-
# If categories have nan included, the label should point to that
2036-
# instead
2037-
with tm.assert_produces_warning(FutureWarning):
2038-
s2 = Series(Categorical(["a", "b", np.nan, "a"],
2039-
categories=["a", "b", np.nan]))
2040-
2041-
exp_cat = Index(["a", "b", np.nan])
2042-
self.assert_index_equal(s2.cat.categories, exp_cat)
2043-
self.assert_numpy_array_equal(s2.values.codes,
2044-
np.array([0, 1, 2, 0], dtype=np.int8))
2045-
2046-
# Changing categories should also make the replaced category np.nan
2047-
s3 = Series(Categorical(["a", "b", "c", "a"]))
2048-
with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
2049-
s3.cat.categories = ["a", "b", np.nan]
2050-
2051-
exp_cat = Index(["a", "b", np.nan])
2052-
self.assert_index_equal(s3.cat.categories, exp_cat)
2053-
self.assert_numpy_array_equal(s3.values.codes,
2054-
np.array([0, 1, 2, 0], dtype=np.int8))
2055-
20561896
def test_cat_accessor(self):
20571897
s = Series(Categorical(["a", "b", np.nan, "a"]))
20581898
self.assert_index_equal(s.cat.categories, Index(["a", "b"]))

0 commit comments

Comments
 (0)