DEPR: No NaNs in categories

TomAugspurger · TomAugspurger · commit e757e8a817f7 · 2015-09-01T07:50:21.000-05:00
diff --git a/asv_bench/benchmarks/categoricals.py b/asv_bench/benchmarks/categoricals.py
@@ -1,5 +1,5 @@
 from .pandas_vb_common import *
-
+import string
 
 class concat_categorical(object):
     goal_time = 0.2
@@ -25,3 +25,21 @@ def time_value_counts(self):
 
     def time_value_counts_dropna(self):
         self.ts.value_counts(dropna=True)
+
+class categorical_constructor(object):  # asv benchmark: Categorical construction, regular path vs. fastpath
+    goal_time = 0.2
+
+    def setup(self):  # builds 5 categories plus 5 * 10**6 tiled values and integer codes
+        n = 5
+        N = 10 ** 6  # must be an int: np.tile takes integer repetition counts
+        self.categories = list(string.ascii_letters[:n])
+        self.cat_idx = Index(self.categories)
+        self.values = np.tile(self.categories, N)
+        self.codes = np.tile(range(n), N)
+
+    def time_regular_constructor(self):  # construct from raw values and a plain list of categories
+        Categorical(self.values, self.categories)
+
+    def time_fastpath(self):  # construct from precomputed codes and an Index with fastpath=True
+        Categorical(self.codes, self.cat_idx, fastpath=True)
+
diff --git a/doc/source/categorical.rst b/doc/source/categorical.rst
@@ -632,41 +632,35 @@ Missing Data
 
 pandas primarily uses the value `np.nan` to represent missing data. It is by
 default not included in computations. See the :ref:`Missing Data section
-<missing_data>`
+<missing_data>`.
 
-There are two ways a `np.nan` can be represented in categorical data: either the value is not
-available ("missing value") or `np.nan` is a valid category.
+Missing values should **not** be included in the Categorical's ``categories``,
+only in the ``values``.
+Instead, NaN is treated specially: it is never a category, but any value may be missing.
+When working with the Categorical's ``codes``, missing values will always have
+a code of ``-1``.
 
 .. ipython:: python
 
     s = pd.Series(["a","b",np.nan,"a"], dtype="category")
     # only two categories
     s
-    s2 = pd.Series(["a","b","c","a"], dtype="category")
-    s2.cat.categories = [1,2,np.nan]
-    # three categories, np.nan included
-    s2
+    s.cat.codes
 
-.. note::
-    As integer `Series` can't include NaN, the categories were converted to `object`.
 
-.. note::
-    Missing value methods like ``isnull`` and ``fillna`` will take both missing values as well as
-    `np.nan` categories into account:
+Methods for working with missing data, e.g. :meth:`~Series.isnull`, :meth:`~Series.fillna`,
+:meth:`~Series.dropna`, all work normally:
 
 .. ipython:: python
 
     c = pd.Series(["a","b",np.nan], dtype="category")
-    c.cat.set_categories(["a","b",np.nan], inplace=True)
-    # will be inserted as a NA category:
-    c[0] = np.nan
     s = pd.Series(c)
     s
     pd.isnull(s)
     s.fillna("a")
 
 Differences to R's `factor`
-~~~~~~~~~~~~~~~~~~~~~~~~~~~
+---------------------------
 
 The following differences to R's factor functions can be observed:
 
@@ -677,6 +671,9 @@ The following differences to R's factor functions can be observed:
 * In contrast to R's `factor` function, using categorical data as the sole input to create a
   new categorical series will *not* remove unused categories but create a new categorical series
   which is equal to the passed in one!
+* R allows for missing values to be included in its `levels` (pandas' `categories`). Pandas
+  does not allow `NaN` categories, but missing values can still be in the `values`.
+
 
 Gotchas
 -------
diff --git a/doc/source/whatsnew/v0.17.0.txt b/doc/source/whatsnew/v0.17.0.txt
@@ -652,6 +652,7 @@ Deprecations
   =====================  =================================
 
 - ``Categorical.name`` was deprecated to make ``Categorical`` more ``numpy.ndarray`` like. Use ``Series(cat, name="whatever")`` instead (:issue:`10482`).
+- Setting missing values (NaN) in a ``Categorical``'s ``categories`` will issue a warning (:issue:`10748`). You can still have missing values in the ``values``.
 - ``drop_duplicates`` and ``duplicated``'s ``take_last`` keyword was deprecated in favor of ``keep``. (:issue:`6511`, :issue:`8505`)
 - ``Series.nsmallest`` and ``nlargest``'s ``take_last`` keyword was deprecated in favor of ``keep``. (:issue:`10792`)
 - ``DataFrame.combineAdd`` and ``DataFrame.combineMult`` are deprecated. They
diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py
@@ -443,12 +443,18 @@ def _validate_categories(cls, categories):
             raise ValueError('Categorical categories must be unique')
         return categories
 
-    def _set_categories(self, categories):
+    def _set_categories(self, categories, validate=True):  # validate=False skips the validation and length check
         """ Sets new categories """
-        categories = self._validate_categories(categories)
-        if not self._categories is None and len(categories) != len(self._categories):
-            raise ValueError("new categories need to have the same number of items than the old "
-                             "categories!")
+        if validate:  # the in-file callers that pass validate=False may change the number of categories
+            categories = self._validate_categories(categories)
+            if not self._categories is None and len(categories) != len(self._categories):
+                raise ValueError("new categories need to have the same number of items than the old "
+                                 "categories!")
+        if np.any(isnull(categories)):  # this check runs regardless of `validate`
+            # NaNs in cats deprecated in 0.17, remove in 0.18 or 0.19 GH 10748
+            msg = ('\nSetting NaNs in `categories` is deprecated and '
+                   'will be removed in a future version of pandas.')
+            warn(msg, FutureWarning, stacklevel=9)  # NOTE(review): fixed depth; confirm it suits every caller
         self._categories = categories
 
     def _get_categories(self):
@@ -581,11 +587,11 @@ def set_categories(self, new_categories, ordered=None, rename=False, inplace=Fal
             if not cat._categories is None and len(new_categories) < len(cat._categories):
                 # remove all _codes which are larger and set to -1/NaN
                 self._codes[self._codes >= len(new_categories)] = -1
-            cat._categories = new_categories
+            cat._set_categories(new_categories, validate=False)
         else:
             values = cat.__array__()
             cat._codes = _get_codes_for_values(values, new_categories)
-            cat._categories = new_categories
+            cat._set_categories(new_categories, validate=False)
 
         if ordered is None:
             ordered = self.ordered
@@ -708,7 +714,7 @@ def add_categories(self, new_categories, inplace=False):
         new_categories = list(self._categories) + list(new_categories)
         new_categories = self._validate_categories(new_categories)
         cat = self if inplace else self.copy()
-        cat._categories = new_categories
+        cat._set_categories(new_categories, validate=False)
         cat._codes = _coerce_indexer_dtype(cat._codes, new_categories)
         if not inplace:
             return cat
@@ -791,7 +797,7 @@ def remove_unused_categories(self, inplace=False):
         from pandas.core.index import _ensure_index
         new_categories = _ensure_index(new_categories)
         cat._codes = _get_codes_for_values(cat.__array__(), new_categories)
-        cat._categories = new_categories
+        cat._set_categories(new_categories, validate=False)
         if not inplace:
             return cat
 
@@ -1171,7 +1177,7 @@ def order(self, inplace=False, ascending=True, na_position='last'):
         Category.sort
         """
         warn("order is deprecated, use sort_values(...)",
-             FutureWarning, stacklevel=2)
+             FutureWarning, stacklevel=3)
         return self.sort_values(inplace=inplace, ascending=ascending, na_position=na_position)
 
     def sort(self, inplace=True, ascending=True, na_position='last'):
diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py