Skip to content

Commit 8577542

Browse files
committed
DEPR: No NaNs in categories
1 parent a06816e commit 8577542

File tree

4 files changed

+103
-72
lines changed

4 files changed

+103
-72
lines changed

doc/source/categorical.rst

+6-13
Original file line numberDiff line numberDiff line change
@@ -634,21 +634,17 @@ pandas primarily uses the value `np.nan` to represent missing data. It is by
634634
default not included in computations. See the :ref:`Missing Data section
635635
<missing_data>`
636636

637-
There are two ways a `np.nan` can be represented in categorical data: either the value is not
638-
available ("missing value") or `np.nan` is a valid category.
637+
Missing values should **not** be included in the Categorical's ``categories``.
638+
Instead, NaN is handled separately from the categories: any value may be missing.
639+
When working with the Categorical's ``codes``, missing values will always have
640+
a code of ``-1``.
639641

640642
.. ipython:: python
641643
642-
s = pd.Series(["a","b",np.nan,"a"], dtype="category")
644+
s = pd.Series(["a","b",np.nan,"a"], dtype="category"
645+
)
643646
# only two categories
644647
s
645-
s2 = pd.Series(["a","b","c","a"], dtype="category")
646-
s2.cat.categories = [1,2,np.nan]
647-
# three categories, np.nan included
648-
s2
649-
650-
.. note::
651-
As integer `Series` can't include NaN, the categories were converted to `object`.
652648
653649
.. note::
654650
Missing value methods like ``isnull`` and ``fillna`` will take both missing values as well as
@@ -657,9 +653,6 @@ available ("missing value") or `np.nan` is a valid category.
657653
.. ipython:: python
658654
659655
c = pd.Series(["a","b",np.nan], dtype="category")
660-
c.cat.set_categories(["a","b",np.nan], inplace=True)
661-
# will be inserted as a NA category:
662-
c[0] = np.nan
663656
s = pd.Series(c)
664657
s
665658
pd.isnull(s)

doc/source/whatsnew/v0.17.0.txt

+1
Original file line numberDiff line numberDiff line change
@@ -646,6 +646,7 @@ Deprecations
646646
===================== =================================
647647

648648
- ``Categorical.name`` was deprecated to make ``Categorical`` more ``numpy.ndarray`` like. Use ``Series(cat, name="whatever")`` instead (:issue:`10482`).
649+
- Setting missing values (NaN) in a ``Categorical``'s ``categories`` will issue a warning (:issue:`10748`). You can still have missing values in the ``values``.
649650
- ``drop_duplicates`` and ``duplicated``'s ``take_last`` keyword was deprecated in favor of ``keep``. (:issue:`6511`, :issue:`8505`)
650651
- ``Series.nsmallest`` and ``nlargest``'s ``take_last`` keyword was deprecated in favor of ``keep``. (:issue:`10792`)
651652
- ``DataFrame.combineAdd`` and ``DataFrame.combineMult`` are deprecated. They

pandas/core/categorical.py

+16-10
Original file line numberDiff line numberDiff line change
@@ -443,12 +443,18 @@ def _validate_categories(cls, categories):
443443
raise ValueError('Categorical categories must be unique')
444444
return categories
445445

446-
def _set_categories(self, categories):
446+
def _set_categories(self, categories, validate=True):
447447
""" Sets new categories """
448-
categories = self._validate_categories(categories)
449-
if not self._categories is None and len(categories) != len(self._categories):
450-
raise ValueError("new categories need to have the same number of items than the old "
451-
"categories!")
448+
if validate:
449+
categories = self._validate_categories(categories)
450+
if not self._categories is None and len(categories) != len(self._categories):
451+
raise ValueError("new categories need to have the same number of items than the old "
452+
"categories!")
453+
if np.any(isnull(categories)):
454+
# NaNs in cats deprecated in 0.17, remove in 0.18 or 0.19 GH 10748
455+
msg = ('\nSetting NaNs in `categories` is deprecated and '
456+
'will be removed in a future version of pandas.')
457+
warn(msg, FutureWarning, stacklevel=9)
452458
self._categories = categories
453459

454460
def _get_categories(self):
@@ -581,11 +587,11 @@ def set_categories(self, new_categories, ordered=None, rename=False, inplace=Fal
581587
if not cat._categories is None and len(new_categories) < len(cat._categories):
582588
# remove all _codes which are larger and set to -1/NaN
583589
self._codes[self._codes >= len(new_categories)] = -1
584-
cat._categories = new_categories
590+
cat._set_categories(new_categories, validate=False)
585591
else:
586592
values = cat.__array__()
587593
cat._codes = _get_codes_for_values(values, new_categories)
588-
cat._categories = new_categories
594+
cat._set_categories(new_categories, validate=False)
589595

590596
if ordered is None:
591597
ordered = self.ordered
@@ -708,7 +714,7 @@ def add_categories(self, new_categories, inplace=False):
708714
new_categories = list(self._categories) + list(new_categories)
709715
new_categories = self._validate_categories(new_categories)
710716
cat = self if inplace else self.copy()
711-
cat._categories = new_categories
717+
cat._set_categories(new_categories, validate=False)
712718
cat._codes = _coerce_indexer_dtype(cat._codes, new_categories)
713719
if not inplace:
714720
return cat
@@ -791,7 +797,7 @@ def remove_unused_categories(self, inplace=False):
791797
from pandas.core.index import _ensure_index
792798
new_categories = _ensure_index(new_categories)
793799
cat._codes = _get_codes_for_values(cat.__array__(), new_categories)
794-
cat._categories = new_categories
800+
cat._set_categories(new_categories, validate=False)
795801
if not inplace:
796802
return cat
797803

@@ -1171,7 +1177,7 @@ def order(self, inplace=False, ascending=True, na_position='last'):
11711177
Category.sort
11721178
"""
11731179
warn("order is deprecated, use sort_values(...)",
1174-
FutureWarning, stacklevel=2)
1180+
FutureWarning, stacklevel=3)
11751181
return self.sort_values(inplace=inplace, ascending=ascending, na_position=na_position)
11761182

11771183
def sort(self, inplace=True, ascending=True, na_position='last'):

pandas/tests/test_categorical.py

+80-49
Original file line numberDiff line numberDiff line change
@@ -187,17 +187,21 @@ def f():
187187
cat = pd.Categorical([np.nan, 1., 2., 3. ])
188188
self.assertTrue(com.is_float_dtype(cat.categories))
189189

190+
# Deprecating NaNs in categories (GH #10748)
190191
# preserve int as far as possible by converting to object if NaN is in categories
191-
cat = pd.Categorical([np.nan, 1, 2, 3], categories=[np.nan, 1, 2, 3])
192+
with tm.assert_produces_warning(FutureWarning):
193+
cat = pd.Categorical([np.nan, 1, 2, 3], categories=[np.nan, 1, 2, 3])
192194
self.assertTrue(com.is_object_dtype(cat.categories))
193195
# This doesn't work -> this would probably need some kind of "remember the original type"
194196
# feature to try to cast the array interface result to...
195197
#vals = np.asarray(cat[cat.notnull()])
196198
#self.assertTrue(com.is_integer_dtype(vals))
197-
cat = pd.Categorical([np.nan,"a", "b", "c"], categories=[np.nan,"a", "b", "c"])
199+
with tm.assert_produces_warning(FutureWarning):
200+
cat = pd.Categorical([np.nan,"a", "b", "c"], categories=[np.nan,"a", "b", "c"])
198201
self.assertTrue(com.is_object_dtype(cat.categories))
199202
# but don't do it for floats
200-
cat = pd.Categorical([np.nan, 1., 2., 3.], categories=[np.nan, 1., 2., 3.])
203+
with tm.assert_produces_warning(FutureWarning):
204+
cat = pd.Categorical([np.nan, 1., 2., 3.], categories=[np.nan, 1., 2., 3.])
201205
self.assertTrue(com.is_float_dtype(cat.categories))
202206

203207

@@ -465,17 +469,19 @@ def test_describe(self):
465469
tm.assert_frame_equal(desc, expected)
466470

467471
# NA as a category
468-
cat = pd.Categorical(["a","c","c",np.nan], categories=["b","a","c",np.nan])
469-
result = cat.describe()
472+
with tm.assert_produces_warning(FutureWarning):
473+
cat = pd.Categorical(["a","c","c",np.nan], categories=["b","a","c",np.nan])
474+
result = cat.describe()
470475

471476
expected = DataFrame([[0,0],[1,0.25],[2,0.5],[1,0.25]],
472477
columns=['counts','freqs'],
473478
index=Index(['b','a','c',np.nan],name='categories'))
474479
tm.assert_frame_equal(result,expected)
475480

476481
# NA as an unused category
477-
cat = pd.Categorical(["a","c","c"], categories=["b","a","c",np.nan])
478-
result = cat.describe()
482+
with tm.assert_produces_warning(FutureWarning):
483+
cat = pd.Categorical(["a","c","c"], categories=["b","a","c",np.nan])
484+
result = cat.describe()
479485

480486
expected = DataFrame([[0,0],[1,1/3.],[2,2/3.],[0,0]],
481487
columns=['counts','freqs'],
@@ -827,29 +833,37 @@ def test_nan_handling(self):
827833
self.assert_numpy_array_equal(c._codes , np.array([0,-1,-1,0]))
828834

829835
# If categories have nan included, the code should point to that instead
830-
c = Categorical(["a","b",np.nan,"a"], categories=["a","b",np.nan])
831-
self.assert_numpy_array_equal(c.categories , np.array(["a","b",np.nan],dtype=np.object_))
832-
self.assert_numpy_array_equal(c._codes , np.array([0,1,2,0]))
836+
with tm.assert_produces_warning(FutureWarning):
837+
c = Categorical(["a","b",np.nan,"a"], categories=["a","b",np.nan])
838+
self.assert_numpy_array_equal(c.categories, np.array(["a","b",np.nan],
839+
dtype=np.object_))
840+
self.assert_numpy_array_equal(c._codes, np.array([0,1,2,0]))
833841
c[1] = np.nan
834-
self.assert_numpy_array_equal(c.categories , np.array(["a","b",np.nan],dtype=np.object_))
835-
self.assert_numpy_array_equal(c._codes , np.array([0,2,2,0]))
842+
self.assert_numpy_array_equal(c.categories, np.array(["a","b",np.nan],
843+
dtype=np.object_))
844+
self.assert_numpy_array_equal(c._codes, np.array([0,2,2,0]))
836845

837846
# Changing categories should also make the replaced category np.nan
838847
c = Categorical(["a","b","c","a"])
839-
c.categories = ["a","b",np.nan]
840-
self.assert_numpy_array_equal(c.categories , np.array(["a","b",np.nan],dtype=np.object_))
841-
self.assert_numpy_array_equal(c._codes , np.array([0,1,2,0]))
848+
with tm.assert_produces_warning(FutureWarning):
849+
c.categories = ["a","b",np.nan]
850+
self.assert_numpy_array_equal(c.categories, np.array(["a","b",np.nan],
851+
dtype=np.object_))
852+
self.assert_numpy_array_equal(c._codes, np.array([0,1,2,0]))
842853

843854
# Adding nan to categories should make assigned nan point to the category!
844855
c = Categorical(["a","b",np.nan,"a"])
845856
self.assert_numpy_array_equal(c.categories , np.array(["a","b"]))
846857
self.assert_numpy_array_equal(c._codes , np.array([0,1,-1,0]))
847-
c.set_categories(["a","b",np.nan], rename=True, inplace=True)
848-
self.assert_numpy_array_equal(c.categories , np.array(["a","b",np.nan],dtype=np.object_))
849-
self.assert_numpy_array_equal(c._codes , np.array([0,1,-1,0]))
858+
with tm.assert_produces_warning(FutureWarning):
859+
c.set_categories(["a","b",np.nan], rename=True, inplace=True)
860+
self.assert_numpy_array_equal(c.categories, np.array(["a","b",np.nan],
861+
dtype=np.object_))
862+
self.assert_numpy_array_equal(c._codes, np.array([0,1,-1,0]))
850863
c[1] = np.nan
851-
self.assert_numpy_array_equal(c.categories , np.array(["a","b",np.nan],dtype=np.object_))
852-
self.assert_numpy_array_equal(c._codes , np.array([0,2,-1,0]))
864+
self.assert_numpy_array_equal(c.categories , np.array(["a","b",np.nan],
865+
dtype=np.object_))
866+
self.assert_numpy_array_equal(c._codes, np.array([0,2,-1,0]))
853867

854868
# Remove null categories (GH 10156)
855869
cases = [
@@ -861,11 +875,13 @@ def test_nan_handling(self):
861875
null_values = [np.nan, None, pd.NaT]
862876

863877
for with_null, without in cases:
864-
base = Categorical([], with_null)
878+
with tm.assert_produces_warning(FutureWarning):
879+
base = Categorical([], with_null)
865880
expected = Categorical([], without)
866881

867-
for nullval in null_values:
868-
result = base.remove_categories(nullval)
882+
with tm.assert_produces_warning(FutureWarning):
883+
for nullval in null_values:
884+
result = base.remove_categories(nullval)
869885
self.assert_categorical_equal(result, expected)
870886

871887
# Different null values are indistinguishable
@@ -880,14 +896,16 @@ def test_isnull(self):
880896
res = c.isnull()
881897
self.assert_numpy_array_equal(res, exp)
882898

883-
c = Categorical(["a","b",np.nan], categories=["a","b",np.nan])
899+
with tm.assert_produces_warning(FutureWarning):
900+
c = Categorical(["a","b",np.nan], categories=["a","b",np.nan])
884901
res = c.isnull()
885902
self.assert_numpy_array_equal(res, exp)
886903

887904
# test both nan in categories and as -1
888905
exp = np.array([True, False, True])
889906
c = Categorical(["a","b",np.nan])
890-
c.set_categories(["a","b",np.nan], rename=True, inplace=True)
907+
with tm.assert_produces_warning(FutureWarning):
908+
c.set_categories(["a","b",np.nan], rename=True, inplace=True)
891909
c[0] = np.nan
892910
res = c.isnull()
893911
self.assert_numpy_array_equal(res, exp)
@@ -1087,31 +1105,36 @@ def test_set_item_nan(self):
10871105

10881106
# if nan in categories, the proper code should be set!
10891107
cat = pd.Categorical([1,2,3, np.nan], categories=[1,2,3])
1090-
cat.set_categories([1,2,3, np.nan], rename=True, inplace=True)
1108+
with tm.assert_produces_warning(FutureWarning):
1109+
cat.set_categories([1,2,3, np.nan], rename=True, inplace=True)
10911110
cat[1] = np.nan
10921111
exp = np.array([0,3,2,-1])
10931112
self.assert_numpy_array_equal(cat.codes, exp)
10941113

10951114
cat = pd.Categorical([1,2,3, np.nan], categories=[1,2,3])
1096-
cat.set_categories([1,2,3, np.nan], rename=True, inplace=True)
1115+
with tm.assert_produces_warning(FutureWarning):
1116+
cat.set_categories([1,2,3, np.nan], rename=True, inplace=True)
10971117
cat[1:3] = np.nan
10981118
exp = np.array([0,3,3,-1])
10991119
self.assert_numpy_array_equal(cat.codes, exp)
11001120

11011121
cat = pd.Categorical([1,2,3, np.nan], categories=[1,2,3])
1102-
cat.set_categories([1,2,3, np.nan], rename=True, inplace=True)
1122+
with tm.assert_produces_warning(FutureWarning):
1123+
cat.set_categories([1,2,3, np.nan], rename=True, inplace=True)
11031124
cat[1:3] = [np.nan, 1]
11041125
exp = np.array([0,3,0,-1])
11051126
self.assert_numpy_array_equal(cat.codes, exp)
11061127

11071128
cat = pd.Categorical([1,2,3, np.nan], categories=[1,2,3])
1108-
cat.set_categories([1,2,3, np.nan], rename=True, inplace=True)
1129+
with tm.assert_produces_warning(FutureWarning):
1130+
cat.set_categories([1,2,3, np.nan], rename=True, inplace=True)
11091131
cat[1:3] = [np.nan, np.nan]
11101132
exp = np.array([0,3,3,-1])
11111133
self.assert_numpy_array_equal(cat.codes, exp)
11121134

11131135
cat = pd.Categorical([1,2, np.nan, 3], categories=[1,2,3])
1114-
cat.set_categories([1,2,3, np.nan], rename=True, inplace=True)
1136+
with tm.assert_produces_warning(FutureWarning):
1137+
cat.set_categories([1,2,3, np.nan], rename=True, inplace=True)
11151138
cat[pd.isnull(cat)] = np.nan
11161139
exp = np.array([0,1,3,2])
11171140
self.assert_numpy_array_equal(cat.codes, exp)
@@ -1555,14 +1578,16 @@ def test_nan_handling(self):
15551578
self.assert_numpy_array_equal(s.values.codes, np.array([0,1,-1,0]))
15561579

15571580
# If categories have nan included, the label should point to that instead
1558-
s2 = Series(Categorical(["a","b",np.nan,"a"], categories=["a","b",np.nan]))
1581+
with tm.assert_produces_warning(FutureWarning):
1582+
s2 = Series(Categorical(["a","b",np.nan,"a"], categories=["a","b",np.nan]))
15591583
self.assert_numpy_array_equal(s2.cat.categories,
15601584
np.array(["a","b",np.nan], dtype=np.object_))
15611585
self.assert_numpy_array_equal(s2.values.codes, np.array([0,1,2,0]))
15621586

15631587
# Changing categories should also make the replaced category np.nan
15641588
s3 = Series(Categorical(["a","b","c","a"]))
1565-
s3.cat.categories = ["a","b",np.nan]
1589+
with tm.assert_produces_warning(FutureWarning):
1590+
s3.cat.categories = ["a","b",np.nan]
15661591
self.assert_numpy_array_equal(s3.cat.categories,
15671592
np.array(["a","b",np.nan], dtype=np.object_))
15681593
self.assert_numpy_array_equal(s3.values.codes, np.array([0,1,2,0]))
@@ -2415,28 +2440,32 @@ def test_value_counts_with_nan(self):
24152440
s.value_counts(dropna=False, sort=False),
24162441
pd.Series([2, 1, 3], index=["a", "b", np.nan]))
24172442

2418-
s = pd.Series(pd.Categorical(["a", "b", "a"], categories=["a", "b", np.nan]))
2419-
tm.assert_series_equal(
2420-
s.value_counts(dropna=True),
2421-
pd.Series([2, 1], index=["a", "b"]))
2422-
tm.assert_series_equal(
2423-
s.value_counts(dropna=False),
2424-
pd.Series([2, 1, 0], index=["a", "b", np.nan]))
2443+
with tm.assert_produces_warning(FutureWarning):
2444+
s = pd.Series(pd.Categorical(["a", "b", "a"], categories=["a", "b", np.nan]))
2445+
tm.assert_series_equal(
2446+
s.value_counts(dropna=True),
2447+
pd.Series([2, 1], index=["a", "b"]))
2448+
tm.assert_series_equal(
2449+
s.value_counts(dropna=False),
2450+
pd.Series([2, 1, 0], index=["a", "b", np.nan]))
24252451

2426-
s = pd.Series(pd.Categorical(["a", "b", None, "a", None, None], categories=["a", "b", np.nan]))
2427-
tm.assert_series_equal(
2428-
s.value_counts(dropna=True),
2429-
pd.Series([2, 1], index=["a", "b"]))
2430-
tm.assert_series_equal(
2431-
s.value_counts(dropna=False),
2432-
pd.Series([3, 2, 1], index=[np.nan, "a", "b"]))
2452+
with tm.assert_produces_warning(FutureWarning):
2453+
s = pd.Series(pd.Categorical(["a", "b", None, "a", None, None],
2454+
categories=["a", "b", np.nan]))
2455+
tm.assert_series_equal(
2456+
s.value_counts(dropna=True),
2457+
pd.Series([2, 1], index=["a", "b"]))
2458+
tm.assert_series_equal(
2459+
s.value_counts(dropna=False),
2460+
pd.Series([3, 2, 1], index=[np.nan, "a", "b"]))
24332461

24342462
def test_groupby(self):
24352463

24362464
cats = Categorical(["a", "a", "a", "b", "b", "b", "c", "c", "c"], categories=["a","b","c","d"], ordered=True)
24372465
data = DataFrame({"a":[1,1,1,2,2,2,3,4,5], "b":cats})
24382466

2439-
expected = DataFrame({ 'a' : Series([1,2,4,np.nan],index=Index(['a','b','c','d'],name='b')) })
2467+
expected = DataFrame({'a': Series([1, 2, 4, np.nan],
2468+
index=Index(['a', 'b', 'c', 'd'], name='b'))})
24402469
result = data.groupby("b").mean()
24412470
tm.assert_frame_equal(result, expected)
24422471

@@ -3454,11 +3483,13 @@ def f():
34543483

34553484
# make sure that fillna takes both missing values and NA categories into account
34563485
c = Categorical(["a","b",np.nan])
3457-
c.set_categories(["a","b",np.nan], rename=True, inplace=True)
3486+
with tm.assert_produces_warning(FutureWarning):
3487+
c.set_categories(["a","b",np.nan], rename=True, inplace=True)
34583488
c[0] = np.nan
34593489
df = pd.DataFrame({"cats":c, "vals":[1,2,3]})
34603490
df_exp = pd.DataFrame({"cats": Categorical(["a","b","a"]), "vals": [1,2,3]})
3461-
res = df.fillna("a")
3491+
with tm.assert_produces_warning(FutureWarning):
3492+
res = df.fillna("a")
34623493
tm.assert_frame_equal(res, df_exp)
34633494

34643495

0 commit comments

Comments
 (0)