diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index 36b2aa3c28da5..a6302ca09f90a 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -95,6 +95,7 @@ Other enhancements - Improved error message when creating a DataFrame with empty data (0 rows), no index and an incorrect number of columns. (:issue:`52084`) - Let :meth:`DataFrame.to_feather` accept a non-default :class:`Index` and non-string column names (:issue:`51787`) - Performance improvement in :func:`read_csv` (:issue:`52632`) with ``engine="c"`` +- :meth:`Categorical.from_codes` now accepts a ``validate`` parameter; pass ``validate=False`` to skip validation of the codes (:issue:`50975`) - Performance improvement in :func:`concat` with homogeneous ``np.float64`` or ``np.float32`` dtypes (:issue:`52685`) - Performance improvement in :meth:`DataFrame.filter` when ``items`` is given (:issue:`52941`) - diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 41e05ffc70b2e..6eb21fae29612 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -649,7 +649,12 @@ def _from_inferred_categories( @classmethod def from_codes( - cls, codes, categories=None, ordered=None, dtype: Dtype | None = None + cls, + codes, + categories=None, + ordered=None, + dtype: Dtype | None = None, + validate: bool = True, ) -> Self: """ Make a Categorical type from codes and categories or dtype. @@ -677,6 +682,12 @@ def from_codes( dtype : CategoricalDtype or "category", optional If :class:`CategoricalDtype`, cannot be used together with `categories` or `ordered`. + validate : bool, default True + If True, validate that the codes are valid for the dtype. + If False, don't validate that the codes are valid. Be careful about skipping + validation, as invalid codes can lead to severe problems, such as segfaults. + + .. 
versionadded:: 2.1.0 Returns ------- @@ -699,18 +710,9 @@ def from_codes( ) raise ValueError(msg) - if isinstance(codes, ExtensionArray) and is_integer_dtype(codes.dtype): - # Avoid the implicit conversion of Int to object - if isna(codes).any(): - raise ValueError("codes cannot contain NA values") - codes = codes.to_numpy(dtype=np.int64) - else: - codes = np.asarray(codes) - if len(codes) and codes.dtype.kind not in "iu": - raise ValueError("codes need to be array-like integers") - - if len(codes) and (codes.max() >= len(dtype.categories) or codes.min() < -1): - raise ValueError("codes need to be between -1 and len(categories)-1") + if validate: + # beware: non-valid codes may segfault + codes = cls._validate_codes_for_dtype(codes, dtype=dtype) return cls._simple_new(codes, dtype=dtype) @@ -1325,7 +1327,7 @@ def map( if new_categories.is_unique and not new_categories.hasnans and na_val is np.nan: new_dtype = CategoricalDtype(new_categories, ordered=self.ordered) - return self.from_codes(self._codes.copy(), dtype=new_dtype) + return self.from_codes(self._codes.copy(), dtype=new_dtype, validate=False) if has_nans: new_categories = new_categories.insert(len(new_categories), na_val) @@ -1378,6 +1380,22 @@ def _validate_scalar(self, fill_value): ) from None return fill_value + @classmethod + def _validate_codes_for_dtype(cls, codes, *, dtype: CategoricalDtype) -> np.ndarray: + if isinstance(codes, ExtensionArray) and is_integer_dtype(codes.dtype): + # Avoid the implicit conversion of Int to object + if isna(codes).any(): + raise ValueError("codes cannot contain NA values") + codes = codes.to_numpy(dtype=np.int64) + else: + codes = np.asarray(codes) + if len(codes) and codes.dtype.kind not in "iu": + raise ValueError("codes need to be array-like integers") + + if len(codes) and (codes.max() >= len(dtype.categories) or codes.min() < -1): + raise ValueError("codes need to be between -1 and len(categories)-1") + return codes + # 
------------------------------------------------------------- @ravel_compat @@ -2724,7 +2742,7 @@ def factorize_from_iterable(values) -> tuple[np.ndarray, Index]: # The Categorical we want to build has the same categories # as values but its codes are by def [0, ..., len(n_categories) - 1] cat_codes = np.arange(len(values.categories), dtype=values.codes.dtype) - cat = Categorical.from_codes(cat_codes, dtype=values.dtype) + cat = Categorical.from_codes(cat_codes, dtype=values.dtype, validate=False) categories = CategoricalIndex(cat) codes = values.codes diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index e9833a41e2795..316b896da126f 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -721,7 +721,7 @@ def group_index(self) -> Index: if self._sort and (codes == len(uniques)).any(): # Add NA value on the end when sorting uniques = Categorical.from_codes( - np.append(uniques.codes, [-1]), uniques.categories + np.append(uniques.codes, [-1]), uniques.categories, validate=False ) elif len(codes) > 0: # Need to determine proper placement of NA value when not sorting @@ -730,8 +730,9 @@ def group_index(self) -> Index: if cat.codes[na_idx] < 0: # count number of unique codes that comes before the nan value na_unique_idx = algorithms.nunique_ints(cat.codes[:na_idx]) + new_codes = np.insert(uniques.codes, na_unique_idx, -1) uniques = Categorical.from_codes( - np.insert(uniques.codes, na_unique_idx, -1), uniques.categories + new_codes, uniques.categories, validate=False ) return Index._with_infer(uniques, name=self.name) @@ -754,7 +755,7 @@ def _codes_and_uniques(self) -> tuple[npt.NDArray[np.signedinteger], ArrayLike]: ucodes = np.arange(len(categories)) uniques = Categorical.from_codes( - codes=ucodes, categories=categories, ordered=cat.ordered + codes=ucodes, categories=categories, ordered=cat.ordered, validate=False ) codes = cat.codes @@ -800,7 +801,8 @@ def _codes_and_uniques(self) -> 
tuple[npt.NDArray[np.signedinteger], ArrayLike]: @cache_readonly def groups(self) -> dict[Hashable, np.ndarray]: - return self._index.groupby(Categorical.from_codes(self.codes, self.group_index)) + cats = Categorical.from_codes(self.codes, self.group_index, validate=False) + return self._index.groupby(cats) def get_grouper( diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index deff5129ad64d..7a151cb811cbd 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -2393,7 +2393,7 @@ def cats(level_codes): ) return [ - Categorical.from_codes(level_codes, cats(level_codes), ordered=True) + Categorical.from_codes(level_codes, cats(level_codes), True, validate=False) for level_codes in self.codes ] @@ -2583,7 +2583,7 @@ def _get_indexer_level_0(self, target) -> npt.NDArray[np.intp]: """ lev = self.levels[0] codes = self._codes[0] - cat = Categorical.from_codes(codes=codes, categories=lev) + cat = Categorical.from_codes(codes=codes, categories=lev, validate=False) ci = Index(cat) return ci.get_indexer_for(target) diff --git a/pandas/core/reshape/tile.py b/pandas/core/reshape/tile.py index 357353ed38d46..83d004c8b8e3e 100644 --- a/pandas/core/reshape/tile.py +++ b/pandas/core/reshape/tile.py @@ -411,7 +411,8 @@ def _bins_to_cuts( if isinstance(bins, IntervalIndex): # we have a fast-path here ids = bins.get_indexer(x) - result = Categorical.from_codes(ids, categories=bins, ordered=True) + cat_dtype = CategoricalDtype(bins, ordered=True) + result = Categorical.from_codes(ids, dtype=cat_dtype, validate=False) return result, bins unique_bins = algos.unique(bins) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 33ff24f5fc981..c9dab71805b62 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -2520,7 +2520,7 @@ def convert(self, values: np.ndarray, nan_rep, encoding: str, errors: str): codes[codes != -1] -= mask.astype(int).cumsum()._values converted = Categorical.from_codes( - codes, 
categories=categories, ordered=ordered + codes, categories=categories, ordered=ordered, validate=False ) else: diff --git a/pandas/tests/arrays/categorical/test_constructors.py b/pandas/tests/arrays/categorical/test_constructors.py index e6e829cdaf1c2..5eb7f37a4ae34 100644 --- a/pandas/tests/arrays/categorical/test_constructors.py +++ b/pandas/tests/arrays/categorical/test_constructors.py @@ -509,12 +509,13 @@ def test_construction_with_null(self, klass, nulls_fixture): tm.assert_categorical_equal(result, expected) - def test_from_codes_nullable_int_categories(self, any_numeric_ea_dtype): + @pytest.mark.parametrize("validate", [True, False]) + def test_from_codes_nullable_int_categories(self, any_numeric_ea_dtype, validate): # GH#39649 cats = pd.array(range(5), dtype=any_numeric_ea_dtype) codes = np.random.randint(5, size=3) dtype = CategoricalDtype(cats) - arr = Categorical.from_codes(codes, dtype=dtype) + arr = Categorical.from_codes(codes, dtype=dtype, validate=validate) assert arr.categories.dtype == cats.dtype tm.assert_index_equal(arr.categories, Index(cats)) @@ -525,6 +526,17 @@ def test_from_codes_empty(self): tm.assert_categorical_equal(result, expected) + @pytest.mark.parametrize("validate", [True, False]) + def test_from_codes_validate(self, validate): + # GH53122 + dtype = CategoricalDtype(["a", "b"]) + if validate: + with pytest.raises(ValueError, match="codes need to be between "): + Categorical.from_codes([4, 5], dtype=dtype, validate=validate) + else: + # passes, though has incorrect codes, but that's the user responsibility + Categorical.from_codes([4, 5], dtype=dtype, validate=validate) + def test_from_codes_too_few_categories(self): dtype = CategoricalDtype(categories=[1, 2]) msg = "codes need to be between "