API: Add dtype parameter to Categorical.from_codes (pandas-dev#24398)

topper-123 · Pingviinituutti · commit f3532a4a225a · 2019-02-28T10:26:56.000+02:00
* Add dtype to Categorical.from_codes
diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst
@@ -403,6 +403,7 @@ Other Enhancements
 - :meth:`pandas.api.types.is_list_like` has gained a keyword ``allow_sets`` which is ``True`` by default; if ``False``,
   all instances of ``set`` will not be considered "list-like" anymore (:issue:`23061`)
 - :meth:`Index.to_frame` now supports overriding column name(s) (:issue:`22580`).
+- :meth:`Categorical.from_codes` can now take a ``dtype`` parameter as an alternative to passing ``categories`` and ``ordered`` (:issue:`24398`).
 - New attribute :attr:`__git_version__` will return git commit sha of current build (:issue:`21295`).
 - Compatibility with Matplotlib 3.0 (:issue:`22790`).
 - Added :meth:`Interval.overlaps`, :meth:`IntervalArray.overlaps`, and :meth:`IntervalIndex.overlaps` for determining overlaps between interval-like objects (:issue:`21998`)
diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py
@@ -603,13 +603,13 @@ def _from_inferred_categories(cls, inferred_categories, inferred_codes,
         return cls(codes, dtype=dtype, fastpath=True)
 
     @classmethod
-    def from_codes(cls, codes, categories, ordered=False):
+    def from_codes(cls, codes, categories=None, ordered=None, dtype=None):
         """
-        Make a Categorical type from codes and categories arrays.
+        Make a Categorical type from codes and categories or dtype.
 
-        This constructor is useful if you already have codes and categories and
-        so do not need the (computation intensive) factorization step, which is
-        usually done on the constructor.
+        This constructor is useful if you already have codes and
+        categories/dtype and so do not need the (computation-intensive)
+        factorization step, which is usually done in the constructor.
 
         If your data does not follow this convention, please use the normal
         constructor.
@@ -618,16 +618,38 @@ def from_codes(cls, codes, categories, ordered=False):
         ----------
         codes : array-like, integers
             An integer array, where each integer points to a category in
-            categories or -1 for NaN
-        categories : index-like
+            categories or dtype.categories, or else is -1 for NaN
+        categories : index-like, optional
             The categories for the categorical. Items need to be unique.
-        ordered : boolean, (default False)
-            Whether or not this categorical is treated as a ordered
-            categorical. If not given, the resulting categorical will be
-            unordered.
-        """
-        dtype = CategoricalDtype._from_values_or_dtype(codes, categories,
-                                                       ordered)
+            If the categories are not given here, then they must be provided
+            in `dtype`.
+        ordered : bool, optional
+            Whether or not this categorical is treated as an ordered
+            categorical. If not given here or in `dtype`, the resulting
+            categorical will be unordered.
+        dtype : CategoricalDtype or the string "category", optional
+            If :class:`CategoricalDtype`, cannot be used together with
+            `categories` or `ordered`.
+
+            .. versionadded:: 0.24.0
+
+               When `dtype` is provided, neither `categories` nor `ordered`
+               should be provided.
+
+        Examples
+        --------
+        >>> dtype = pd.CategoricalDtype(['a', 'b'], ordered=True)
+        >>> pd.Categorical.from_codes(codes=[0, 1, 0, 1], dtype=dtype)
+        [a, b, a, b]
+        Categories (2, object): [a < b]
+        """
+        dtype = CategoricalDtype._from_values_or_dtype(categories=categories,
+                                                       ordered=ordered,
+                                                       dtype=dtype)
+        if dtype.categories is None:
+            msg = ("The categories must be provided in 'categories' or "
+                   "'dtype'. Both were None.")
+            raise ValueError(msg)
 
         codes = np.asarray(codes)  # #21767
         if not is_integer_dtype(codes):
@@ -642,12 +664,6 @@ def from_codes(cls, codes, categories, ordered=False):
             if msg:
                 raise ValueError(msg)
 
-        try:
-            codes = coerce_indexer_dtype(codes, categories)
-        except (ValueError, TypeError):
-            raise ValueError(
-                "codes need to be convertible to an arrays of integers")
-
         if len(codes) and (
                 codes.max() >= len(dtype.categories) or codes.min() < -1):
             raise ValueError("codes need to be between -1 and "
@@ -1265,8 +1281,7 @@ def shift(self, periods, fill_value=None):
             else:
                 codes[periods:] = fill_value
 
-        return self.from_codes(codes, categories=self.categories,
-                               ordered=self.ordered)
+        return self.from_codes(codes, dtype=self.dtype)
 
     def __array__(self, dtype=None):
         """
@@ -1887,9 +1902,7 @@ def take_nd(self, indexer, allow_fill=None, fill_value=None):
 
         codes = take(self._codes, indexer, allow_fill=allow_fill,
                      fill_value=fill_value)
-        result = type(self).from_codes(codes,
-                                       categories=dtype.categories,
-                                       ordered=dtype.ordered)
+        result = type(self).from_codes(codes, dtype=dtype)
         return result
 
     take = take_nd
@@ -2078,9 +2091,7 @@ def __setitem__(self, key, value):
                 new_codes = _recode_for_categories(
                     value.codes, value.categories, self.categories
                 )
-                value = Categorical.from_codes(new_codes,
-                                               categories=self.categories,
-                                               ordered=self.ordered)
+                value = Categorical.from_codes(new_codes, dtype=self.dtype)
 
         rvalue = value if is_list_like(value) else [value]
 
diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py
@@ -148,8 +148,7 @@ def _create_from_codes(self, codes, dtype=None, name=None):
             dtype = self.dtype
         if name is None:
             name = self.name
-        cat = Categorical.from_codes(codes, categories=dtype.categories,
-                                     ordered=dtype.ordered)
+        cat = Categorical.from_codes(codes, dtype=dtype)
         return CategoricalIndex(cat, name=name)
 
     @classmethod
diff --git a/pandas/tests/arrays/categorical/test_constructors.py b/pandas/tests/arrays/categorical/test_constructors.py
@@ -77,7 +77,9 @@ def test_constructor_unsortable(self):
         assert not factor.ordered
 
         # this however will raise as cannot be sorted
-        with pytest.raises(TypeError):
+        msg = ("'values' is not ordered, please explicitly specify the "
+               "categories order by passing in a categories argument.")
+        with pytest.raises(TypeError, match=msg):
             Categorical(arr, ordered=True)
 
     def test_constructor_interval(self):
@@ -99,10 +101,11 @@ def test_constructor(self):
         tm.assert_numpy_array_equal(c2.__array__(), exp_arr)
 
         # categories must be unique
-        with pytest.raises(ValueError):
+        msg = "Categorical categories must be unique"
+        with pytest.raises(ValueError, match=msg):
             Categorical([1, 2], [1, 2, 2])
 
-        with pytest.raises(ValueError):
+        with pytest.raises(ValueError, match=msg):
             Categorical(["a", "b"], ["a", "b", "b"])
 
         # The default should be unordered
@@ -211,21 +214,23 @@ def test_constructor(self):
 
     def test_constructor_not_sequence(self):
         # https://github.com/pandas-dev/pandas/issues/16022
-        with pytest.raises(TypeError):
+        msg = r"^Parameter 'categories' must be list-like, was"
+        with pytest.raises(TypeError, match=msg):
             Categorical(['a', 'b'], categories='a')
 
     def test_constructor_with_null(self):
 
         # Cannot have NaN in categories
-        with pytest.raises(ValueError):
+        msg = "Categorial categories cannot be null"
+        with pytest.raises(ValueError, match=msg):
             Categorical([np.nan, "a", "b", "c"],
                         categories=[np.nan, "a", "b", "c"])
 
-        with pytest.raises(ValueError):
+        with pytest.raises(ValueError, match=msg):
             Categorical([None, "a", "b", "c"],
                         categories=[None, "a", "b", "c"])
 
-        with pytest.raises(ValueError):
+        with pytest.raises(ValueError, match=msg):
             Categorical(DatetimeIndex(['nat', '20160101']),
                         categories=[NaT, Timestamp('20160101')])
 
@@ -347,13 +352,14 @@ def test_constructor_with_dtype(self, ordered):
 
     def test_constructor_dtype_and_others_raises(self):
         dtype = CategoricalDtype(['a', 'b'], ordered=True)
-        with pytest.raises(ValueError, match="Cannot"):
+        msg = "Cannot specify `categories` or `ordered` together with `dtype`."
+        with pytest.raises(ValueError, match=msg):
             Categorical(['a', 'b'], categories=['a', 'b'], dtype=dtype)
 
-        with pytest.raises(ValueError, match="Cannot"):
+        with pytest.raises(ValueError, match=msg):
             Categorical(['a', 'b'], ordered=True, dtype=dtype)
 
-        with pytest.raises(ValueError, match="Cannot"):
+        with pytest.raises(ValueError, match=msg):
             Categorical(['a', 'b'], ordered=False, dtype=dtype)
 
     @pytest.mark.parametrize('categories', [
@@ -417,33 +423,44 @@ def test_constructor_with_categorical_categories(self):
     def test_from_codes(self):
 
         # too few categories
-        with pytest.raises(ValueError):
-            Categorical.from_codes([1, 2], [1, 2])
+        dtype = CategoricalDtype(categories=[1, 2])
+        msg = "codes need to be between "
+        with pytest.raises(ValueError, match=msg):
+            Categorical.from_codes([1, 2], categories=dtype.categories)
+        with pytest.raises(ValueError, match=msg):
+            Categorical.from_codes([1, 2], dtype=dtype)
 
         # no int codes
-        with pytest.raises(ValueError):
-            Categorical.from_codes(["a"], [1, 2])
+        msg = "codes need to be array-like integers"
+        with pytest.raises(ValueError, match=msg):
+            Categorical.from_codes(["a"], categories=dtype.categories)
+        with pytest.raises(ValueError, match=msg):
+            Categorical.from_codes(["a"], dtype=dtype)
 
         # no unique categories
-        with pytest.raises(ValueError):
-            Categorical.from_codes([0, 1, 2], ["a", "a", "b"])
+        with pytest.raises(ValueError,
+                           match="Categorical categories must be unique"):
+            Categorical.from_codes([0, 1, 2], categories=["a", "a", "b"])
 
         # NaN categories included
-        with pytest.raises(ValueError):
-            Categorical.from_codes([0, 1, 2], ["a", "b", np.nan])
+        with pytest.raises(ValueError,
+                           match="Categorial categories cannot be null"):
+            Categorical.from_codes([0, 1, 2], categories=["a", "b", np.nan])
 
         # too negative
-        with pytest.raises(ValueError):
-            Categorical.from_codes([-2, 1, 2], ["a", "b", "c"])
+        dtype = CategoricalDtype(categories=["a", "b", "c"])
+        msg = r"codes need to be between -1 and len\(categories\)-1"
+        with pytest.raises(ValueError, match=msg):
+            Categorical.from_codes([-2, 1, 2], categories=dtype.categories)
+        with pytest.raises(ValueError, match=msg):
+            Categorical.from_codes([-2, 1, 2], dtype=dtype)
 
         exp = Categorical(["a", "b", "c"], ordered=False)
-        res = Categorical.from_codes([0, 1, 2], ["a", "b", "c"])
+        res = Categorical.from_codes([0, 1, 2], categories=dtype.categories)
         tm.assert_categorical_equal(exp, res)
 
-        # Not available in earlier numpy versions
-        if hasattr(np.random, "choice"):
-            codes = np.random.choice([0, 1], 5, p=[0.9, 0.1])
-            Categorical.from_codes(codes, categories=["train", "test"])
+        res = Categorical.from_codes([0, 1, 2], dtype=dtype)
+        tm.assert_categorical_equal(exp, res)
 
     def test_from_codes_with_categorical_categories(self):
         # GH17884
@@ -458,28 +475,56 @@ def test_from_codes_with_categorical_categories(self):
         tm.assert_categorical_equal(result, expected)
 
         # non-unique Categorical still raises
-        with pytest.raises(ValueError):
+        with pytest.raises(ValueError,
+                           match="Categorical categories must be unique"):
             Categorical.from_codes([0, 1], Categorical(['a', 'b', 'a']))
 
     def test_from_codes_with_nan_code(self):
         # GH21767
         codes = [1, 2, np.nan]
-        categories = ['a', 'b', 'c']
-        with pytest.raises(ValueError):
-            Categorical.from_codes(codes, categories)
+        dtype = CategoricalDtype(categories=['a', 'b', 'c'])
+        with pytest.raises(ValueError,
+                           match="codes need to be array-like integers"):
+            Categorical.from_codes(codes, categories=dtype.categories)
+        with pytest.raises(ValueError,
+                           match="codes need to be array-like integers"):
+            Categorical.from_codes(codes, dtype=dtype)
 
     def test_from_codes_with_float(self):
         # GH21767
         codes = [1.0, 2.0, 0]  # integer, but in float dtype
-        categories = ['a', 'b', 'c']
+        dtype = CategoricalDtype(categories=['a', 'b', 'c'])
+
+        with tm.assert_produces_warning(FutureWarning):
+            cat = Categorical.from_codes(codes, dtype.categories)
+        tm.assert_numpy_array_equal(cat.codes, np.array([1, 2, 0], dtype='i1'))
 
         with tm.assert_produces_warning(FutureWarning):
-            cat = Categorical.from_codes(codes, categories)
+            cat = Categorical.from_codes(codes, dtype=dtype)
         tm.assert_numpy_array_equal(cat.codes, np.array([1, 2, 0], dtype='i1'))
 
         codes = [1.1, 2.0, 0]  # non-integer
-        with pytest.raises(ValueError):
-            Categorical.from_codes(codes, categories)
+        with pytest.raises(ValueError,
+                           match="codes need to be array-like integers"):
+            Categorical.from_codes(codes, dtype.categories)
+        with pytest.raises(ValueError,
+                           match="codes need to be array-like integers"):
+            Categorical.from_codes(codes, dtype=dtype)
+
+    def test_from_codes_with_dtype_raises(self):
+        msg = 'Cannot specify'
+        with pytest.raises(ValueError, match=msg):
+            Categorical.from_codes([0, 1], categories=['a', 'b'],
+                                   dtype=CategoricalDtype(['a', 'b']))
+
+        with pytest.raises(ValueError, match=msg):
+            Categorical.from_codes([0, 1], ordered=True,
+                                   dtype=CategoricalDtype(['a', 'b']))
+
+    def test_from_codes_neither(self):
+        msg = "Both were None"
+        with pytest.raises(ValueError, match=msg):
+            Categorical.from_codes([0, 1])
 
     @pytest.mark.parametrize('dtype', [None, 'category'])
     def test_from_inferred_categories(self, dtype):
@@ -515,14 +560,11 @@ def test_from_inferred_categories_coerces(self):
         expected = Categorical([1, 1, 2, np.nan])
         tm.assert_categorical_equal(result, expected)
 
-    def test_construction_with_ordered(self):
+    @pytest.mark.parametrize('ordered', [None, True, False])
+    def test_construction_with_ordered(self, ordered):
         # GH 9347, 9190
-        cat = Categorical([0, 1, 2])
-        assert not cat.ordered
-        cat = Categorical([0, 1, 2], ordered=False)
-        assert not cat.ordered
-        cat = Categorical([0, 1, 2], ordered=True)
-        assert cat.ordered
+        cat = Categorical([0, 1, 2], ordered=ordered)
+        assert cat.ordered == bool(ordered)
 
     @pytest.mark.xfail(reason="Imaginary values not supported in Categorical")
     def test_constructor_imaginary(self):
diff --git a/pandas/tests/indexes/test_category.py b/pandas/tests/indexes/test_category.py
@@ -158,7 +158,7 @@ def test_construction_with_categorical_dtype(self):
         tm.assert_index_equal(result, expected, exact=True)
 
         # error when combining categories/ordered and dtype kwargs
-        msg = 'Cannot specify `categories` or `ordered` together with `dtype`.'
+        msg = "Cannot specify `categories` or `ordered` together with `dtype`."
         with pytest.raises(ValueError, match=msg):
             CategoricalIndex(data, categories=cats, dtype=dtype)