Skip to content

Commit f784e9a

Browse files
author
Tom Augspurger
committed
Merge pull request #10929 from TomAugspurger/depr-categorical-nans
[DEPR]: Deprecate setting nans in categories
2 parents 30f672c + 8d87f3b commit f784e9a

File tree

6 files changed

+162
-81
lines changed

6 files changed

+162
-81
lines changed

asv_bench/benchmarks/categoricals.py

+19-1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
from .pandas_vb_common import *
2-
2+
import string
33

44
class concat_categorical(object):
55
goal_time = 0.2
@@ -25,3 +25,21 @@ def time_value_counts(self):
2525

2626
def time_value_counts_dropna(self):
2727
self.ts.value_counts(dropna=True)
28+
29+
class categorical_constructor(object):
30+
goal_time = 0.2
31+
32+
def setup(self):
33+
n = 5
34+
N = 1e6
35+
self.categories = list(string.ascii_letters[:n])
36+
self.cat_idx = Index(self.categories)
37+
self.values = np.tile(self.categories, N)
38+
self.codes = np.tile(range(n), N)
39+
40+
def time_regular_constructor(self):
41+
Categorical(self.values, self.categories)
42+
43+
def time_fastpath(self):
44+
Categorical(self.codes, self.cat_idx, fastpath=True)
45+

doc/source/categorical.rst

+13-16
Original file line numberDiff line numberDiff line change
@@ -632,41 +632,35 @@ Missing Data
632632

633633
pandas primarily uses the value `np.nan` to represent missing data. It is by
634634
default not included in computations. See the :ref:`Missing Data section
635-
<missing_data>`
635+
<missing_data>`.
636636

637-
There are two ways a `np.nan` can be represented in categorical data: either the value is not
638-
available ("missing value") or `np.nan` is a valid category.
637+
Missing values should **not** be included in the Categorical's ``categories``,
638+
only in the ``values``.
639+
Instead, NaN is treated specially: it is never a category, but it is always allowed as a value.
640+
When working with the Categorical's ``codes``, missing values will always have
641+
a code of ``-1``.
639642

640643
.. ipython:: python
641644
642645
s = pd.Series(["a","b",np.nan,"a"], dtype="category")
643646
# only two categories
644647
s
645-
s2 = pd.Series(["a","b","c","a"], dtype="category")
646-
s2.cat.categories = [1,2,np.nan]
647-
# three categories, np.nan included
648-
s2
648+
s.cat.codes
649649
650-
.. note::
651-
As integer `Series` can't include NaN, the categories were converted to `object`.
652650
653-
.. note::
654-
Missing value methods like ``isnull`` and ``fillna`` will take both missing values as well as
655-
`np.nan` categories into account:
651+
Methods for working with missing data, e.g. :meth:`~Series.isnull`, :meth:`~Series.fillna`,
652+
:meth:`~Series.dropna`, all work normally:
656653

657654
.. ipython:: python
658655
659656
c = pd.Series(["a","b",np.nan], dtype="category")
660-
c.cat.set_categories(["a","b",np.nan], inplace=True)
661-
# will be inserted as a NA category:
662-
c[0] = np.nan
663657
s = pd.Series(c)
664658
s
665659
pd.isnull(s)
666660
s.fillna("a")
667661
668662
Differences to R's `factor`
669-
~~~~~~~~~~~~~~~~~~~~~~~~~~~
663+
---------------------------
670664

671665
The following differences to R's factor functions can be observed:
672666

@@ -677,6 +671,9 @@ The following differences to R's factor functions can be observed:
677671
* In contrast to R's `factor` function, using categorical data as the sole input to create a
678672
new categorical series will *not* remove unused categories but create a new categorical series
679673
which is equal to the passed in one!
674+
* R allows for missing values to be included in its `levels` (pandas' `categories`). Pandas
675+
does not allow `NaN` categories, but missing values can still be in the `values`.
676+
680677

681678
Gotchas
682679
-------

doc/source/whatsnew/v0.17.0.txt

+1
Original file line numberDiff line numberDiff line change
@@ -652,6 +652,7 @@ Deprecations
652652
===================== =================================
653653

654654
- ``Categorical.name`` was deprecated to make ``Categorical`` more ``numpy.ndarray`` like. Use ``Series(cat, name="whatever")`` instead (:issue:`10482`).
655+
- Setting missing values (NaN) in a ``Categorical``'s ``categories`` will issue a warning (:issue:`10748`). You can still have missing values in the ``values``.
655656
- ``drop_duplicates`` and ``duplicated``'s ``take_last`` keyword was deprecated in favor of ``keep``. (:issue:`6511`, :issue:`8505`)
656657
- ``Series.nsmallest`` and ``nlargest``'s ``take_last`` keyword was deprecated in favor of ``keep``. (:issue:`10792`)
657658
- ``DataFrame.combineAdd`` and ``DataFrame.combineMult`` are deprecated. They

pandas/core/base.py

+1
Original file line numberDiff line numberDiff line change
@@ -392,6 +392,7 @@ def argmin(self, axis=None):
392392
"""
393393
return nanops.nanargmin(self.values)
394394

395+
@cache_readonly
395396
def hasnans(self):
396397
""" return whether I have any nans; enables various perf speedups """
397398
return com.isnull(self).any()

pandas/core/categorical.py

+44-15
Original file line numberDiff line numberDiff line change
@@ -207,7 +207,7 @@ def __init__(self, values, categories=None, ordered=False, name=None, fastpath=F
207207
if fastpath:
208208
# fast path
209209
self._codes = _coerce_indexer_dtype(values, categories)
210-
self.categories = categories
210+
self._categories = self._validate_categories(categories, fastpath=isinstance(categories, ABCIndexClass))
211211
self._ordered = ordered
212212
return
213213

@@ -274,6 +274,8 @@ def __init__(self, values, categories=None, ordered=False, name=None, fastpath=F
274274
### FIXME ####
275275
raise NotImplementedError("> 1 ndim Categorical are not supported at this time")
276276

277+
categories = self._validate_categories(categories)
278+
277279
else:
278280
# there were two ways if categories are present
279281
# - the old one, where each value is an int pointer to the levels array -> not anymore
@@ -282,7 +284,6 @@ def __init__(self, values, categories=None, ordered=False, name=None, fastpath=F
282284

283285
# make sure that we always have the same type here, no matter what we get passed in
284286
categories = self._validate_categories(categories)
285-
286287
codes = _get_codes_for_values(values, categories)
287288

288289
# TODO: check for old style usage. These warnings should be removed after 0.18/ in 2016
@@ -295,7 +296,7 @@ def __init__(self, values, categories=None, ordered=False, name=None, fastpath=F
295296
"'Categorical.from_codes(codes, categories)'?", RuntimeWarning, stacklevel=2)
296297

297298
self.set_ordered(ordered or False, inplace=True)
298-
self.categories = categories
299+
self._categories = categories
299300
self._codes = _coerce_indexer_dtype(codes, categories)
300301

301302
def copy(self):
@@ -421,9 +422,15 @@ def _get_labels(self):
421422
_categories = None
422423

423424
@classmethod
424-
def _validate_categories(cls, categories):
425+
def _validate_categories(cls, categories, fastpath=False):
425426
"""
426427
Validates that we have good categories
428+
429+
Parameters
430+
----------
431+
fastpath : boolean (default: False)
432+
Don't perform validation of the categories for uniqueness or nulls
433+
427434
"""
428435
if not isinstance(categories, ABCIndexClass):
429436
dtype = None
@@ -439,16 +446,40 @@ def _validate_categories(cls, categories):
439446

440447
from pandas import Index
441448
categories = Index(categories, dtype=dtype)
442-
if not categories.is_unique:
443-
raise ValueError('Categorical categories must be unique')
449+
450+
if not fastpath:
451+
452+
# check properties of the categories
453+
# we don't allow NaNs in the categories themselves
454+
455+
if categories.hasnans:
456+
# NaNs in cats deprecated in 0.17, remove in 0.18 or 0.19 GH 10748
457+
msg = ('\nSetting NaNs in `categories` is deprecated and '
458+
'will be removed in a future version of pandas.')
459+
warn(msg, FutureWarning, stacklevel=5)
460+
461+
# categories must be unique
462+
463+
if not categories.is_unique:
464+
raise ValueError('Categorical categories must be unique')
465+
444466
return categories
445467

446-
def _set_categories(self, categories):
447-
""" Sets new categories """
448-
categories = self._validate_categories(categories)
449-
if not self._categories is None and len(categories) != len(self._categories):
468+
def _set_categories(self, categories, fastpath=False):
469+
""" Sets new categories
470+
471+
Parameters
472+
----------
473+
fastpath : boolean (default: False)
474+
Don't perform validation of the categories for uniqueness or nulls
475+
476+
"""
477+
478+
categories = self._validate_categories(categories, fastpath=fastpath)
479+
if not fastpath and not self._categories is None and len(categories) != len(self._categories):
450480
raise ValueError("new categories need to have the same number of items than the old "
451481
"categories!")
482+
452483
self._categories = categories
453484

454485
def _get_categories(self):
@@ -581,11 +612,10 @@ def set_categories(self, new_categories, ordered=None, rename=False, inplace=Fal
581612
if not cat._categories is None and len(new_categories) < len(cat._categories):
582613
# remove all _codes which are larger and set to -1/NaN
583614
self._codes[self._codes >= len(new_categories)] = -1
584-
cat._categories = new_categories
585615
else:
586616
values = cat.__array__()
587617
cat._codes = _get_codes_for_values(values, new_categories)
588-
cat._categories = new_categories
618+
cat._categories = new_categories
589619

590620
if ordered is None:
591621
ordered = self.ordered
@@ -706,9 +736,8 @@ def add_categories(self, new_categories, inplace=False):
706736
msg = "new categories must not include old categories: %s" % str(already_included)
707737
raise ValueError(msg)
708738
new_categories = list(self._categories) + list(new_categories)
709-
new_categories = self._validate_categories(new_categories)
710739
cat = self if inplace else self.copy()
711-
cat._categories = new_categories
740+
cat._categories = self._validate_categories(new_categories)
712741
cat._codes = _coerce_indexer_dtype(cat._codes, new_categories)
713742
if not inplace:
714743
return cat
@@ -1171,7 +1200,7 @@ def order(self, inplace=False, ascending=True, na_position='last'):
11711200
Category.sort
11721201
"""
11731202
warn("order is deprecated, use sort_values(...)",
1174-
FutureWarning, stacklevel=2)
1203+
FutureWarning, stacklevel=3)
11751204
return self.sort_values(inplace=inplace, ascending=ascending, na_position=na_position)
11761205

11771206
def sort(self, inplace=True, ascending=True, na_position='last'):

0 commit comments

Comments
 (0)