Add dtype to Categorical.from_codes

topper-123 · topper-123 · commit e2543dffe3df · 2018-12-22T20:55:28.000Z
diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst
@@ -360,6 +360,7 @@ Other Enhancements
 - :meth:`pandas.api.types.is_list_like` has gained a keyword ``allow_sets`` which is ``True`` by default; if ``False``,
   all instances of ``set`` will not be considered "list-like" anymore (:issue:`23061`)
 - :meth:`Index.to_frame` now supports overriding column name(s) (:issue:`22580`).
+- :meth:`Categorical.from_codes` can now take a ``dtype`` parameter (:issue:`24398`).
 - New attribute :attr:`__git_version__` will return git commit sha of current build (:issue:`21295`).
 - Compatibility with Matplotlib 3.0 (:issue:`22790`).
 - Added :meth:`Interval.overlaps`, :meth:`IntervalArray.overlaps`, and :meth:`IntervalIndex.overlaps` for determining overlaps between interval-like objects (:issue:`21998`)
diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py
@@ -639,7 +639,7 @@ def _from_inferred_categories(cls, inferred_categories, inferred_codes,
         return cls(codes, dtype=dtype, fastpath=True)
 
     @classmethod
-    def from_codes(cls, codes, categories, ordered=False):
+    def from_codes(cls, codes, categories=None, ordered=None, dtype=None):
         """
         Make a Categorical type from codes and categories arrays.
 
@@ -657,11 +657,27 @@ def from_codes(cls, codes, categories, ordered=False):
             categories or -1 for NaN
         categories : index-like
             The categories for the categorical. Items need to be unique.
-        ordered : boolean, (default False)
+        ordered : boolean, optional
             Whether or not this categorical is treated as a ordered
             categorical. If not given, the resulting categorical will be
             unordered.
+
+            .. versionchanged:: 0.24.0
+
+                The default value has been changed to ``None``. Previously
+                the default value was ``False``.
+        dtype : CategoricalDtype, optional
+            An instance of ``CategoricalDtype`` to use for this categorical.
+
+            .. versionadded:: 0.24.0
         """
+        if dtype is not None:
+            if categories is not None or ordered is not None:
+                raise ValueError("Cannot specify both `dtype` and `categories`"
+                                 " or `ordered`.")
+        else:
+            dtype = CategoricalDtype(categories, ordered)
+
         codes = np.asarray(codes)  # #21767
         if not is_integer_dtype(codes):
             msg = "codes need to be array-like integers"
@@ -675,20 +691,12 @@ def from_codes(cls, codes, categories, ordered=False):
             if msg:
                 raise ValueError(msg)
 
-        try:
-            codes = coerce_indexer_dtype(codes, categories)
-        except (ValueError, TypeError):
-            raise ValueError(
-                "codes need to be convertible to an arrays of integers")
-
-        categories = CategoricalDtype.validate_categories(categories)
-
-        if len(codes) and (codes.max() >= len(categories) or codes.min() < -1):
+        if len(codes) and (
+                codes.max() >= len(dtype.categories) or codes.min() < -1):
             raise ValueError("codes need to be between -1 and "
                              "len(categories)-1")
 
-        return cls(codes, categories=categories, ordered=ordered,
-                   fastpath=True)
+        return cls(codes, dtype=dtype, fastpath=True)
 
     _codes = None
 
@@ -1283,8 +1291,7 @@ def shift(self, periods):
             else:
                 codes[periods:] = -1
 
-        return self.from_codes(codes, categories=self.categories,
-                               ordered=self.ordered)
+        return self.from_codes(codes, dtype=self.dtype)
 
     def __array__(self, dtype=None):
         """
@@ -1902,9 +1909,7 @@ def take_nd(self, indexer, allow_fill=None, fill_value=None):
 
         codes = take(self._codes, indexer, allow_fill=allow_fill,
                      fill_value=fill_value)
-        result = type(self).from_codes(codes,
-                                       categories=dtype.categories,
-                                       ordered=dtype.ordered)
+        result = type(self).from_codes(codes, dtype=dtype)
         return result
 
     take = take_nd
@@ -2093,9 +2098,7 @@ def __setitem__(self, key, value):
                 new_codes = _recode_for_categories(
                     value.codes, value.categories, self.categories
                 )
-                value = Categorical.from_codes(new_codes,
-                                               categories=self.categories,
-                                               ordered=self.ordered)
+                value = Categorical.from_codes(new_codes, dtype=self.dtype)
 
         rvalue = value if is_list_like(value) else [value]
 
diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py
@@ -154,8 +154,7 @@ def _create_from_codes(self, codes, dtype=None, name=None):
             dtype = self.dtype
         if name is None:
             name = self.name
-        cat = Categorical.from_codes(codes, categories=dtype.categories,
-                                     ordered=dtype.ordered)
+        cat = Categorical.from_codes(codes, dtype=dtype)
         return CategoricalIndex(cat, name=name)
 
     @classmethod
diff --git a/pandas/tests/arrays/categorical/test_constructors.py b/pandas/tests/arrays/categorical/test_constructors.py
@@ -417,33 +417,44 @@ def test_constructor_with_categorical_categories(self):
     def test_from_codes(self):
 
         # too few categories
+        dtype = CategoricalDtype(categories=[1, 2])
         with pytest.raises(ValueError):
-            Categorical.from_codes([1, 2], [1, 2])
+            Categorical.from_codes([1, 2], categories=dtype.categories)
+        with pytest.raises(ValueError):
+            Categorical.from_codes([1, 2], dtype=dtype)
 
         # no int codes
         with pytest.raises(ValueError):
-            Categorical.from_codes(["a"], [1, 2])
+            Categorical.from_codes(["a"], categories=dtype.categories)
+        with pytest.raises(ValueError):
+            Categorical.from_codes(["a"], dtype=dtype)
 
         # no unique categories
         with pytest.raises(ValueError):
-            Categorical.from_codes([0, 1, 2], ["a", "a", "b"])
+            Categorical.from_codes([0, 1, 2], categories=["a", "a", "b"])
 
         # NaN categories included
         with pytest.raises(ValueError):
-            Categorical.from_codes([0, 1, 2], ["a", "b", np.nan])
+            Categorical.from_codes([0, 1, 2], categories=["a", "b", np.nan])
 
         # too negative
+        dtype = CategoricalDtype(categories=["a", "b", "c"])
+        with pytest.raises(ValueError):
+            Categorical.from_codes([-2, 1, 2], categories=dtype.categories)
         with pytest.raises(ValueError):
-            Categorical.from_codes([-2, 1, 2], ["a", "b", "c"])
+            Categorical.from_codes([-2, 1, 2], dtype=dtype)
 
         exp = Categorical(["a", "b", "c"], ordered=False)
-        res = Categorical.from_codes([0, 1, 2], ["a", "b", "c"])
+        res = Categorical.from_codes([0, 1, 2], categories=dtype.categories)
+        tm.assert_categorical_equal(exp, res)
+
+        res = Categorical.from_codes([0, 1, 2], dtype=dtype)
         tm.assert_categorical_equal(exp, res)
 
-        # Not available in earlier numpy versions
-        if hasattr(np.random, "choice"):
-            codes = np.random.choice([0, 1], 5, p=[0.9, 0.1])
-            Categorical.from_codes(codes, categories=["train", "test"])
+        codes = np.random.choice([0, 1], 5, p=[0.9, 0.1])
+        dtype = CategoricalDtype(categories=["train", "test"])
+        Categorical.from_codes(codes, categories=dtype.categories)
+        Categorical.from_codes(codes, dtype=dtype)
 
     def test_from_codes_with_categorical_categories(self):
         # GH17884
@@ -464,22 +475,30 @@ def test_from_codes_with_categorical_categories(self):
     def test_from_codes_with_nan_code(self):
         # GH21767
         codes = [1, 2, np.nan]
-        categories = ['a', 'b', 'c']
+        dtype = CategoricalDtype(categories=['a', 'b', 'c'])
         with pytest.raises(ValueError):
-            Categorical.from_codes(codes, categories)
+            Categorical.from_codes(codes, categories=dtype.categories)
+        with pytest.raises(ValueError):
+            Categorical.from_codes(codes, dtype=dtype)
 
     def test_from_codes_with_float(self):
         # GH21767
         codes = [1.0, 2.0, 0]  # integer, but in float dtype
-        categories = ['a', 'b', 'c']
+        dtype = CategoricalDtype(categories=['a', 'b', 'c'])
 
         with tm.assert_produces_warning(FutureWarning):
-            cat = Categorical.from_codes(codes, categories)
+            cat = Categorical.from_codes(codes, dtype.categories)
+        tm.assert_numpy_array_equal(cat.codes, np.array([1, 2, 0], dtype='i1'))
+
+        with tm.assert_produces_warning(FutureWarning):
+            cat = Categorical.from_codes(codes, dtype=dtype)
         tm.assert_numpy_array_equal(cat.codes, np.array([1, 2, 0], dtype='i1'))
 
         codes = [1.1, 2.0, 0]  # non-integer
         with pytest.raises(ValueError):
-            Categorical.from_codes(codes, categories)
+            Categorical.from_codes(codes, dtype.categories)
+        with pytest.raises(ValueError):
+            Categorical.from_codes(codes, dtype=dtype)
 
     @pytest.mark.parametrize('dtype', [None, 'category'])
     def test_from_inferred_categories(self, dtype):
@@ -515,14 +534,11 @@ def test_from_inferred_categories_coerces(self):
         expected = Categorical([1, 1, 2, np.nan])
         tm.assert_categorical_equal(result, expected)
 
-    def test_construction_with_ordered(self):
+    @pytest.mark.parametrize('ordered', [None, True, False])
+    def test_construction_with_ordered(self, ordered):
         # GH 9347, 9190
-        cat = Categorical([0, 1, 2])
-        assert not cat.ordered
-        cat = Categorical([0, 1, 2], ordered=False)
-        assert not cat.ordered
-        cat = Categorical([0, 1, 2], ordered=True)
-        assert cat.ordered
+        cat = Categorical([0, 1, 2], ordered=ordered)
+        assert cat.ordered == bool(ordered)
 
     @pytest.mark.xfail(reason="Imaginary values not supported in Categorical")
     def test_constructor_imaginary(self):