DEPR: Drop support for NaN categories in Categorical

jreback · jreback · commit 80280ec576ab · 2017-03-27T10:45:39.000-04:00
Deprecated in 0.17.0. xref pandas-dev#10748 xref pandas-dev#13648 Author: Jeff Reback <jeff@reback.net> Author: gfyoung <gfyoung17@gmail.com> Closes pandas-dev#15806 from gfyoung/categories-nan-drop and squashes the following commits: 318175b [Jeff Reback] TST: test pd.NaT with correct dtype 4dce349 [gfyoung] Drop support for NaN categories in Categorical
diff --git a/doc/source/categorical.rst b/doc/source/categorical.rst
@@ -230,6 +230,15 @@ Categories must be unique or a `ValueError` is raised:
     except ValueError as e:
         print("ValueError: " + str(e))
 
+Categories must also not contain ``NaN``, or a `ValueError` is raised:
+
+.. ipython:: python
+
+    try:
+        s.cat.categories = [1,2,np.nan]
+    except ValueError as e:
+        print("ValueError: " + str(e))
+
 Appending new categories
 ~~~~~~~~~~~~~~~~~~~~~~~~
 
diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
@@ -816,6 +816,7 @@ Removal of prior version deprecations/changes
   in favor of ``iloc`` and ``iat`` as explained :ref:`here <whatsnew_0170.deprecations>` (:issue:`10711`).
 - The deprecated ``DataFrame.iterkv()`` has been removed in favor of ``DataFrame.iteritems()`` (:issue:`10711`)
 - The ``Categorical`` constructor has dropped the ``name`` parameter (:issue:`10632`)
+- ``Categorical`` has dropped support for ``NaN`` categories (:issue:`10748`)
 - The ``take_last`` parameter has been dropped from ``duplicated()``, ``drop_duplicates()``, ``nlargest()``, and ``nsmallest()`` methods (:issue:`10236`, :issue:`10792`, :issue:`10920`)
 - ``Series``, ``Index``, and ``DataFrame`` have dropped the ``sort`` and ``order`` methods (:issue:`10726`)
 - Where clauses in ``pytables`` are only accepted as strings and expressions types and not other data-types (:issue:`12027`)
diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py
@@ -545,18 +545,11 @@ def _validate_categories(cls, categories, fastpath=False):
 
         if not fastpath:
 
-            # check properties of the categories
-            # we don't allow NaNs in the categories themselves
-
+            # Categories cannot contain NaN.
             if categories.hasnans:
-                # NaNs in cats deprecated in 0.17
-                # GH 10748
-                msg = ('\nSetting NaNs in `categories` is deprecated and '
-                       'will be removed in a future version of pandas.')
-                warn(msg, FutureWarning, stacklevel=3)
-
-            # categories must be unique
+                raise ValueError('Categorial categories cannot be null')
 
+            # Categories must be unique.
             if not categories.is_unique:
                 raise ValueError('Categorical categories must be unique')
 
diff --git a/pandas/tests/indexes/test_category.py b/pandas/tests/indexes/test_category.py
@@ -183,11 +183,6 @@ def test_contains(self):
         self.assertFalse(0 in ci)
         self.assertFalse(1 in ci)
 
-        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
-            ci = CategoricalIndex(
-                list('aabbca'), categories=list('cabdef') + [np.nan])
-        self.assertFalse(np.nan in ci)
-
         ci = CategoricalIndex(
             list('aabbca') + [np.nan], categories=list('cabdef'))
         self.assertTrue(np.nan in ci)
@@ -541,7 +536,6 @@ def test_ensure_copied_data(self):
             self.assertIs(_base(index.values), _base(result.values))
 
     def test_equals_categorical(self):
-
         ci1 = CategoricalIndex(['a', 'b'], categories=['a', 'b'], ordered=True)
         ci2 = CategoricalIndex(['a', 'b'], categories=['a', 'b', 'c'],
                                ordered=True)
@@ -579,14 +573,6 @@ def test_equals_categorical(self):
         self.assertFalse(ci.equals(CategoricalIndex(list('aabca'))))
         self.assertTrue(ci.equals(ci.copy()))
 
-        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
-            ci = CategoricalIndex(list('aabca'),
-                                  categories=['c', 'a', 'b', np.nan])
-        self.assertFalse(ci.equals(list('aabca')))
-        self.assertFalse(ci.equals(CategoricalIndex(list('aabca'))))
-        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
-            self.assertTrue(ci.equals(ci.copy()))
-
         ci = CategoricalIndex(list('aabca') + [np.nan],
                               categories=['c', 'a', 'b'])
         self.assertFalse(ci.equals(list('aabca')))
diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py
@@ -1,6 +1,7 @@
 # -*- coding: utf-8 -*-
 # pylint: disable=E1101,E1103,W0232
 
+import pytest
 import sys
 from datetime import datetime
 from distutils.version import LooseVersion
@@ -17,7 +18,8 @@
 import pandas.compat as compat
 import pandas.util.testing as tm
 from pandas import (Categorical, Index, Series, DataFrame, PeriodIndex,
-                    Timestamp, CategoricalIndex, isnull)
+                    Timestamp, CategoricalIndex, DatetimeIndex,
+                    isnull, NaT)
 from pandas.compat import range, lrange, u, PY3
 from pandas.core.config import option_context
 
@@ -160,12 +162,6 @@ def f():
 
         self.assertRaises(ValueError, f)
 
-        def f():
-            with tm.assert_produces_warning(FutureWarning):
-                Categorical([1, 2], [1, 2, np.nan, np.nan])
-
-        self.assertRaises(ValueError, f)
-
         # The default should be unordered
         c1 = Categorical(["a", "b", "c", "a"])
         self.assertFalse(c1.ordered)
@@ -222,29 +218,12 @@ def f():
         cat = pd.Categorical([np.nan, 1., 2., 3.])
         self.assertTrue(is_float_dtype(cat.categories))
 
-        # Deprecating NaNs in categoires (GH #10748)
-        # preserve int as far as possible by converting to object if NaN is in
-        # categories
-        with tm.assert_produces_warning(FutureWarning):
-            cat = pd.Categorical([np.nan, 1, 2, 3],
-                                 categories=[np.nan, 1, 2, 3])
-        self.assertTrue(is_object_dtype(cat.categories))
-
         # This doesn't work -> this would probably need some kind of "remember
         # the original type" feature to try to cast the array interface result
         # to...
 
         # vals = np.asarray(cat[cat.notnull()])
         # self.assertTrue(is_integer_dtype(vals))
-        with tm.assert_produces_warning(FutureWarning):
-            cat = pd.Categorical([np.nan, "a", "b", "c"],
-                                 categories=[np.nan, "a", "b", "c"])
-        self.assertTrue(is_object_dtype(cat.categories))
-        # but don't do it for floats
-        with tm.assert_produces_warning(FutureWarning):
-            cat = pd.Categorical([np.nan, 1., 2., 3.],
-                                 categories=[np.nan, 1., 2., 3.])
-        self.assertTrue(is_float_dtype(cat.categories))
 
         # corner cases
         cat = pd.Categorical([1])
@@ -295,6 +274,22 @@ def f():
             c = Categorical(np.array([], dtype='int64'),  # noqa
                             categories=[3, 2, 1], ordered=True)
 
+    def test_constructor_with_null(self):
+
+        # Cannot have NaN in categories
+        with pytest.raises(ValueError):
+            pd.Categorical([np.nan, "a", "b", "c"],
+                           categories=[np.nan, "a", "b", "c"])
+
+        with pytest.raises(ValueError):
+            pd.Categorical([None, "a", "b", "c"],
+                           categories=[None, "a", "b", "c"])
+
+        with pytest.raises(ValueError):
+            pd.Categorical(DatetimeIndex(['nat', '20160101']),
+                           categories=[NaT, Timestamp('20160101')])
+
+
     def test_constructor_with_index(self):
         ci = CategoricalIndex(list('aabbca'), categories=list('cab'))
         tm.assert_categorical_equal(ci.values, Categorical(ci))
@@ -418,6 +413,12 @@ def f():
 
         self.assertRaises(ValueError, f)
 
+        # NaN categories included
+        def f():
+            Categorical.from_codes([0, 1, 2], ["a", "b", np.nan])
+
+        self.assertRaises(ValueError, f)
+
         # too negative
         def f():
             Categorical.from_codes([-2, 1, 2], ["a", "b", "c"])
@@ -649,30 +650,6 @@ def test_describe(self):
                                                        name='categories'))
         tm.assert_frame_equal(desc, expected)
 
-        # NA as a category
-        with tm.assert_produces_warning(FutureWarning):
-            cat = pd.Categorical(["a", "c", "c", np.nan],
-                                 categories=["b", "a", "c", np.nan])
-            result = cat.describe()
-
-        expected = DataFrame([[0, 0], [1, 0.25], [2, 0.5], [1, 0.25]],
-                             columns=['counts', 'freqs'],
-                             index=pd.CategoricalIndex(['b', 'a', 'c', np.nan],
-                                                       name='categories'))
-        tm.assert_frame_equal(result, expected, check_categorical=False)
-
-        # NA as an unused category
-        with tm.assert_produces_warning(FutureWarning):
-            cat = pd.Categorical(["a", "c", "c"],
-                                 categories=["b", "a", "c", np.nan])
-            result = cat.describe()
-
-        exp_idx = pd.CategoricalIndex(
-            ['b', 'a', 'c', np.nan], name='categories')
-        expected = DataFrame([[0, 0], [1, 1 / 3.], [2, 2 / 3.], [0, 0]],
-                             columns=['counts', 'freqs'], index=exp_idx)
-        tm.assert_frame_equal(result, expected, check_categorical=False)
-
     def test_print(self):
         expected = ["[a, b, b, a, a, c, c, c]",
                     "Categories (3, object): [a < b < c]"]
@@ -1119,90 +1096,18 @@ def test_nan_handling(self):
         self.assert_numpy_array_equal(c._codes,
                                       np.array([0, -1, -1, 0], dtype=np.int8))
 
-        # If categories have nan included, the code should point to that
-        # instead
-        with tm.assert_produces_warning(FutureWarning):
-            c = Categorical(["a", "b", np.nan, "a"],
-                            categories=["a", "b", np.nan])
-        self.assert_index_equal(c.categories, Index(["a", "b", np.nan]))
-        self.assert_numpy_array_equal(c._codes,
-                                      np.array([0, 1, 2, 0], dtype=np.int8))
-        c[1] = np.nan
-        self.assert_index_equal(c.categories, Index(["a", "b", np.nan]))
-        self.assert_numpy_array_equal(c._codes,
-                                      np.array([0, 2, 2, 0], dtype=np.int8))
-
-        # Changing categories should also make the replaced category np.nan
-        c = Categorical(["a", "b", "c", "a"])
-        with tm.assert_produces_warning(FutureWarning):
-            c.categories = ["a", "b", np.nan]  # noqa
-
-        self.assert_index_equal(c.categories, Index(["a", "b", np.nan]))
-        self.assert_numpy_array_equal(c._codes,
-                                      np.array([0, 1, 2, 0], dtype=np.int8))
-
         # Adding nan to categories should make assigned nan point to the
         # category!
         c = Categorical(["a", "b", np.nan, "a"])
         self.assert_index_equal(c.categories, Index(["a", "b"]))
         self.assert_numpy_array_equal(c._codes,
                                       np.array([0, 1, -1, 0], dtype=np.int8))
-        with tm.assert_produces_warning(FutureWarning):
-            c.set_categories(["a", "b", np.nan], rename=True, inplace=True)
-
-        self.assert_index_equal(c.categories, Index(["a", "b", np.nan]))
-        self.assert_numpy_array_equal(c._codes,
-                                      np.array([0, 1, -1, 0], dtype=np.int8))
-        c[1] = np.nan
-        self.assert_index_equal(c.categories, Index(["a", "b", np.nan]))
-        self.assert_numpy_array_equal(c._codes,
-                                      np.array([0, 2, -1, 0], dtype=np.int8))
-
-        # Remove null categories (GH 10156)
-        cases = [([1.0, 2.0, np.nan], [1.0, 2.0]),
-                 (['a', 'b', None], ['a', 'b']),
-                 ([pd.Timestamp('2012-05-01'), pd.NaT],
-                  [pd.Timestamp('2012-05-01')])]
-
-        null_values = [np.nan, None, pd.NaT]
-
-        for with_null, without in cases:
-            with tm.assert_produces_warning(FutureWarning):
-                base = Categorical([], with_null)
-            expected = Categorical([], without)
-
-            for nullval in null_values:
-                result = base.remove_categories(nullval)
-            self.assert_categorical_equal(result, expected)
-
-        # Different null values are indistinguishable
-        for i, j in [(0, 1), (0, 2), (1, 2)]:
-            nulls = [null_values[i], null_values[j]]
-
-            def f():
-                with tm.assert_produces_warning(FutureWarning):
-                    Categorical([], categories=nulls)
-
-            self.assertRaises(ValueError, f)
 
     def test_isnull(self):
         exp = np.array([False, False, True])
         c = Categorical(["a", "b", np.nan])
         res = c.isnull()
-        self.assert_numpy_array_equal(res, exp)
 
-        with tm.assert_produces_warning(FutureWarning):
-            c = Categorical(["a", "b", np.nan], categories=["a", "b", np.nan])
-        res = c.isnull()
-        self.assert_numpy_array_equal(res, exp)
-
-        # test both nan in categories and as -1
-        exp = np.array([True, False, True])
-        c = Categorical(["a", "b", np.nan])
-        with tm.assert_produces_warning(FutureWarning):
-            c.set_categories(["a", "b", np.nan], rename=True, inplace=True)
-        c[0] = np.nan
-        res = c.isnull()
         self.assert_numpy_array_equal(res, exp)
 
     def test_codes_immutable(self):
@@ -1487,45 +1392,10 @@ def test_slicing_directly(self):
 
     def test_set_item_nan(self):
         cat = pd.Categorical([1, 2, 3])
-        exp = pd.Categorical([1, np.nan, 3], categories=[1, 2, 3])
         cat[1] = np.nan
-        tm.assert_categorical_equal(cat, exp)
 
-        # if nan in categories, the proper code should be set!
-        cat = pd.Categorical([1, 2, 3, np.nan], categories=[1, 2, 3])
-        with tm.assert_produces_warning(FutureWarning):
-            cat.set_categories([1, 2, 3, np.nan], rename=True, inplace=True)
-        cat[1] = np.nan
-        exp = np.array([0, 3, 2, -1], dtype=np.int8)
-        self.assert_numpy_array_equal(cat.codes, exp)
-
-        cat = pd.Categorical([1, 2, 3, np.nan], categories=[1, 2, 3])
-        with tm.assert_produces_warning(FutureWarning):
-            cat.set_categories([1, 2, 3, np.nan], rename=True, inplace=True)
-        cat[1:3] = np.nan
-        exp = np.array([0, 3, 3, -1], dtype=np.int8)
-        self.assert_numpy_array_equal(cat.codes, exp)
-
-        cat = pd.Categorical([1, 2, 3, np.nan], categories=[1, 2, 3])
-        with tm.assert_produces_warning(FutureWarning):
-            cat.set_categories([1, 2, 3, np.nan], rename=True, inplace=True)
-        cat[1:3] = [np.nan, 1]
-        exp = np.array([0, 3, 0, -1], dtype=np.int8)
-        self.assert_numpy_array_equal(cat.codes, exp)
-
-        cat = pd.Categorical([1, 2, 3, np.nan], categories=[1, 2, 3])
-        with tm.assert_produces_warning(FutureWarning):
-            cat.set_categories([1, 2, 3, np.nan], rename=True, inplace=True)
-        cat[1:3] = [np.nan, np.nan]
-        exp = np.array([0, 3, 3, -1], dtype=np.int8)
-        self.assert_numpy_array_equal(cat.codes, exp)
-
-        cat = pd.Categorical([1, 2, np.nan, 3], categories=[1, 2, 3])
-        with tm.assert_produces_warning(FutureWarning):
-            cat.set_categories([1, 2, 3, np.nan], rename=True, inplace=True)
-        cat[pd.isnull(cat)] = np.nan
-        exp = np.array([0, 1, 3, 2], dtype=np.int8)
-        self.assert_numpy_array_equal(cat.codes, exp)
+        exp = pd.Categorical([1, np.nan, 3], categories=[1, 2, 3])
+        tm.assert_categorical_equal(cat, exp)
 
     def test_shift(self):
         # GH 9416
@@ -2026,33 +1896,12 @@ def test_sideeffects_free(self):
 
     def test_nan_handling(self):
 
-        # Nans are represented as -1 in labels
+        # NaNs are represented as -1 in labels
         s = Series(Categorical(["a", "b", np.nan, "a"]))
         self.assert_index_equal(s.cat.categories, Index(["a", "b"]))
         self.assert_numpy_array_equal(s.values.codes,
                                       np.array([0, 1, -1, 0], dtype=np.int8))
 
-        # If categories have nan included, the label should point to that
-        # instead
-        with tm.assert_produces_warning(FutureWarning):
-            s2 = Series(Categorical(["a", "b", np.nan, "a"],
-                                    categories=["a", "b", np.nan]))
-
-        exp_cat = Index(["a", "b", np.nan])
-        self.assert_index_equal(s2.cat.categories, exp_cat)
-        self.assert_numpy_array_equal(s2.values.codes,
-                                      np.array([0, 1, 2, 0], dtype=np.int8))
-
-        # Changing categories should also make the replaced category np.nan
-        s3 = Series(Categorical(["a", "b", "c", "a"]))
-        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
-            s3.cat.categories = ["a", "b", np.nan]
-
-        exp_cat = Index(["a", "b", np.nan])
-        self.assert_index_equal(s3.cat.categories, exp_cat)
-        self.assert_numpy_array_equal(s3.values.codes,
-                                      np.array([0, 1, 2, 0], dtype=np.int8))
-
     def test_cat_accessor(self):
         s = Series(Categorical(["a", "b", np.nan, "a"]))
         self.assert_index_equal(s.cat.categories, Index(["a", "b"]))