pandas-dev · mroeschke · May 9, 2023 · May 6, 2023 · May 7, 2023 · May 7, 2023
diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst
@@ -95,6 +95,7 @@ Other enhancements
 - Improved error message when creating a DataFrame with empty data (0 rows), no index and an incorrect number of columns. (:issue:`52084`)
 - Let :meth:`DataFrame.to_feather` accept a non-default :class:`Index` and non-string column names (:issue:`51787`)
 - Performance improvement in :func:`read_csv` (:issue:`52632`) with ``engine="c"``
+- :meth:`Categorical.from_codes` has gotten a ``validate`` parameter (:issue:`50975`)
 - Performance improvement in :func:`concat` with homogeneous ``np.float64`` or ``np.float32`` dtypes (:issue:`52685`)
 - Performance improvement in :meth:`DataFrame.filter` when ``items`` is given (:issue:`52941`)
 -

diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py
@@ -649,7 +649,12 @@ def _from_inferred_categories(
 
     @classmethod
     def from_codes(
-        cls, codes, categories=None, ordered=None, dtype: Dtype | None = None
+        cls,
+        codes,
+        categories=None,
+        ordered=None,
+        dtype: Dtype | None = None,
+        validate: bool = True,
     ) -> Self:
         """
         Make a Categorical type from codes and categories or dtype.
@@ -677,6 +682,11 @@ def from_codes(
         dtype : CategoricalDtype or "category", optional
             If :class:`CategoricalDtype`, cannot be used together with
             `categories` or `ordered`.
+        validate : bool, default True
+            If True, validate that the codes are valid for the dtype.
+            If False, don't validate that the codes are valid.
+
+            .. versionadded:: 2.1.0
 
         Returns
         -------
@@ -699,18 +709,8 @@ def from_codes(
             )
             raise ValueError(msg)
 
-        if isinstance(codes, ExtensionArray) and is_integer_dtype(codes.dtype):
-            # Avoid the implicit conversion of Int to object
-            if isna(codes).any():
-                raise ValueError("codes cannot contain NA values")
-            codes = codes.to_numpy(dtype=np.int64)
-        else:
-            codes = np.asarray(codes)
-        if len(codes) and codes.dtype.kind not in "iu":
-            raise ValueError("codes need to be array-like integers")
-
-        if len(codes) and (codes.max() >= len(dtype.categories) or codes.min() < -1):
-            raise ValueError("codes need to be between -1 and len(categories)-1")
+        if validate:
+            codes = cls._validate_codes_for_dtype(codes, dtype=dtype)
 
         return cls._simple_new(codes, dtype=dtype)
 
@@ -1325,7 +1325,7 @@ def map(
 
         if new_categories.is_unique and not new_categories.hasnans and na_val is np.nan:
             new_dtype = CategoricalDtype(new_categories, ordered=self.ordered)
-            return self.from_codes(self._codes.copy(), dtype=new_dtype)
+            return self.from_codes(self._codes.copy(), dtype=new_dtype, validate=False)
 
         if has_nans:
             new_categories = new_categories.insert(len(new_categories), na_val)
@@ -1378,6 +1378,22 @@ def _validate_scalar(self, fill_value):
             ) from None
         return fill_value
 
+    @classmethod
+    def _validate_codes_for_dtype(cls, codes, *, dtype: CategoricalDtype) -> np.ndarray:
+        if isinstance(codes, ExtensionArray) and is_integer_dtype(codes.dtype):
+            # Avoid the implicit conversion of Int to object
+            if isna(codes).any():
+                raise ValueError("codes cannot contain NA values")
+            codes = codes.to_numpy(dtype=np.int64)
+        else:
+            codes = np.asarray(codes)
+        if len(codes) and codes.dtype.kind not in "iu":
+            raise ValueError("codes need to be array-like integers")
+
+        if len(codes) and (codes.max() >= len(dtype.categories) or codes.min() < -1):
+            raise ValueError("codes need to be between -1 and len(categories)-1")
+        return codes
+
     # -------------------------------------------------------------
 
     @ravel_compat
@@ -2724,7 +2740,7 @@ def factorize_from_iterable(values) -> tuple[np.ndarray, Index]:
         # The Categorical we want to build has the same categories
         # as values but its codes are by def [0, ..., len(n_categories) - 1]
         cat_codes = np.arange(len(values.categories), dtype=values.codes.dtype)
-        cat = Categorical.from_codes(cat_codes, dtype=values.dtype)
+        cat = Categorical.from_codes(cat_codes, dtype=values.dtype, validate=False)
 
         categories = CategoricalIndex(cat)
         codes = values.codes

@@ -721,7 +721,7 @@ def group_index(self) -> Index:
             if self._sort and (codes == len(uniques)).any():
                 # Add NA value on the end when sorting
                 uniques = Categorical.from_codes(
-                    np.append(uniques.codes, [-1]), uniques.categories
+                    np.append(uniques.codes, [-1]), uniques.categories, validate=False
                 )
             elif len(codes) > 0:
                 # Need to determine proper placement of NA value when not sorting
@@ -730,8 +730,9 @@ def group_index(self) -> Index:
                 if cat.codes[na_idx] < 0:
                     # count number of unique codes that comes before the nan value
                     na_unique_idx = algorithms.nunique_ints(cat.codes[:na_idx])
+                    new_codes = np.insert(uniques.codes, na_unique_idx, -1)
                     uniques = Categorical.from_codes(
-                        np.insert(uniques.codes, na_unique_idx, -1), uniques.categories
+                        new_codes, uniques.categories, validate=False
                     )
         return Index._with_infer(uniques, name=self.name)
 
@@ -754,7 +755,7 @@ def _codes_and_uniques(self) -> tuple[npt.NDArray[np.signedinteger], ArrayLike]:
                 ucodes = np.arange(len(categories))
 
             uniques = Categorical.from_codes(
-                codes=ucodes, categories=categories, ordered=cat.ordered
+                codes=ucodes, categories=categories, ordered=cat.ordered, validate=False
             )
 
             codes = cat.codes
@@ -800,7 +801,8 @@ def _codes_and_uniques(self) -> tuple[npt.NDArray[np.signedinteger], ArrayLike]:
 
     @cache_readonly
     def groups(self) -> dict[Hashable, np.ndarray]:
-        return self._index.groupby(Categorical.from_codes(self.codes, self.group_index))
+        cats = Categorical.from_codes(self.codes, self.group_index, validate=False)
+        return self._index.groupby(cats)
 
 
 def get_grouper(

diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py
@@ -2393,7 +2393,7 @@ def cats(level_codes):
             )
 
         return [
-            Categorical.from_codes(level_codes, cats(level_codes), ordered=True)
+            Categorical.from_codes(level_codes, cats(level_codes), True, validate=False)
             for level_codes in self.codes
         ]
 
@@ -2583,7 +2583,7 @@ def _get_indexer_level_0(self, target) -> npt.NDArray[np.intp]:
         """
         lev = self.levels[0]
         codes = self._codes[0]
-        cat = Categorical.from_codes(codes=codes, categories=lev)
+        cat = Categorical.from_codes(codes=codes, categories=lev, validate=False)
         ci = Index(cat)
         return ci.get_indexer_for(target)
 

diff --git a/pandas/core/reshape/tile.py b/pandas/core/reshape/tile.py
@@ -411,7 +411,8 @@ def _bins_to_cuts(
     if isinstance(bins, IntervalIndex):
         # we have a fast-path here
         ids = bins.get_indexer(x)
-        result = Categorical.from_codes(ids, categories=bins, ordered=True)
+        cat_dtype = CategoricalDtype(bins, ordered=True)
+        result = Categorical.from_codes(ids, dtype=cat_dtype, validate=False)
         return result, bins
 
     unique_bins = algos.unique(bins)

diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py
@@ -2520,7 +2520,7 @@ def convert(self, values: np.ndarray, nan_rep, encoding: str, errors: str):
                     codes[codes != -1] -= mask.astype(int).cumsum()._values
 
             converted = Categorical.from_codes(
-                codes, categories=categories, ordered=ordered
+                codes, categories=categories, ordered=ordered, validate=False
             )
 
         else:

diff --git a/pandas/tests/arrays/categorical/test_constructors.py b/pandas/tests/arrays/categorical/test_constructors.py
@@ -509,12 +509,13 @@ def test_construction_with_null(self, klass, nulls_fixture):
 
         tm.assert_categorical_equal(result, expected)
 
-    def test_from_codes_nullable_int_categories(self, any_numeric_ea_dtype):
+    @pytest.mark.parametrize("validate", [True, False])
+    def test_from_codes_nullable_int_categories(self, any_numeric_ea_dtype, validate):
         # GH#39649
         cats = pd.array(range(5), dtype=any_numeric_ea_dtype)
         codes = np.random.randint(5, size=3)
         dtype = CategoricalDtype(cats)
-        arr = Categorical.from_codes(codes, dtype=dtype)
+        arr = Categorical.from_codes(codes, dtype=dtype, validate=validate)
         assert arr.categories.dtype == cats.dtype
         tm.assert_index_equal(arr.categories, Index(cats))
 
@@ -525,6 +526,17 @@ def test_from_codes_empty(self):
 
         tm.assert_categorical_equal(result, expected)
 
+    @pytest.mark.parametrize("validate", [True, False])
+    def test_from_codes_validate(self, validate):
+        # GH53122
+        dtype = CategoricalDtype(["a", "b"])
+        if validate:
+            with pytest.raises(ValueError, match="codes need to be between "):
+                Categorical.from_codes([4, 5], dtype=dtype, validate=validate)
+        else:
+            # passes, though has incorrect codes, but that's the user responsibility
+            Categorical.from_codes([4, 5], dtype=dtype, validate=validate)
+
     def test_from_codes_too_few_categories(self):
         dtype = CategoricalDtype(categories=[1, 2])
         msg = "codes need to be between "