diff --git a/doc/source/user_guide/10min.rst b/doc/source/user_guide/10min.rst index 0adb937de2b8b..c767fb1ebef7f 100644 --- a/doc/source/user_guide/10min.rst +++ b/doc/source/user_guide/10min.rst @@ -680,12 +680,12 @@ Converting the raw grades to a categorical data type: df["grade"] = df["raw_grade"].astype("category") df["grade"] -Rename the categories to more meaningful names (assigning to -:meth:`Series.cat.categories` is in place!): +Rename the categories to more meaningful names: .. ipython:: python - df["grade"].cat.categories = ["very good", "good", "very bad"] + new_categories = ["very good", "good", "very bad"] + df["grade"] = df["grade"].cat.rename_categories(new_categories) Reorder the categories and simultaneously add the missing categories (methods under :meth:`Series.cat` return a new :class:`Series` by default): diff --git a/doc/source/user_guide/categorical.rst b/doc/source/user_guide/categorical.rst index 0105cf99193dd..b5cb1d83a9f52 100644 --- a/doc/source/user_guide/categorical.rst +++ b/doc/source/user_guide/categorical.rst @@ -334,8 +334,7 @@ It's also possible to pass in the categories in a specific order: Renaming categories ~~~~~~~~~~~~~~~~~~~ -Renaming categories is done by assigning new values to the -``Series.cat.categories`` property or by using the +Renaming categories is done by using the :meth:`~pandas.Categorical.rename_categories` method: @@ -343,9 +342,8 @@ Renaming categories is done by assigning new values to the s = pd.Series(["a", "b", "c", "a"], dtype="category") s - s.cat.categories = ["Group %s" % g for g in s.cat.categories] - s - s = s.cat.rename_categories([1, 2, 3]) + new_categories = ["Group %s" % g for g in s.cat.categories] + s = s.cat.rename_categories(new_categories) s # You can also pass a dict-like object to map the renaming s = s.cat.rename_categories({1: "x", 2: "y", 3: "z"}) @@ -365,7 +363,7 @@ Categories must be unique or a ``ValueError`` is raised: .. 
ipython:: python try: - s.cat.categories = [1, 1, 1] + s = s.cat.rename_categories([1, 1, 1]) except ValueError as e: print("ValueError:", str(e)) @@ -374,7 +372,7 @@ Categories must also not be ``NaN`` or a ``ValueError`` is raised: .. ipython:: python try: - s.cat.categories = [1, 2, np.nan] + s = s.cat.rename_categories([1, 2, np.nan]) except ValueError as e: print("ValueError:", str(e)) @@ -702,7 +700,7 @@ of length "1". .. ipython:: python df.iat[0, 0] - df["cats"].cat.categories = ["x", "y", "z"] + df["cats"] = df["cats"].cat.rename_categories(["x", "y", "z"]) df.at["h", "cats"] # returns a string .. note:: @@ -960,7 +958,7 @@ relevant columns back to ``category`` and assign the right categories and catego s = pd.Series(pd.Categorical(["a", "b", "b", "a", "a", "d"])) # rename the categories - s.cat.categories = ["very good", "good", "bad"] + s = s.cat.rename_categories(["very good", "good", "bad"]) # reorder the categories and add missing categories s = s.cat.set_categories(["very bad", "bad", "medium", "good", "very good"]) df = pd.DataFrame({"cats": s, "vals": [1, 2, 3, 4, 5, 6]}) @@ -1164,6 +1162,7 @@ Constructing a ``Series`` from a ``Categorical`` will not copy the input change the original ``Categorical``: .. ipython:: python + :okwarning: cat = pd.Categorical([1, 2, 3, 10], categories=[1, 2, 3, 4, 10]) s = pd.Series(cat, name="cat") diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 25625dba1080f..5bde726791fd1 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -558,7 +558,8 @@ This matches the behavior of :meth:`Categorical.set_categories`. 
df = pd.read_csv(StringIO(data), dtype="category") df.dtypes df["col3"] - df["col3"].cat.categories = pd.to_numeric(df["col3"].cat.categories) + new_categories = pd.to_numeric(df["col3"].cat.categories) + df["col3"] = df["col3"].cat.rename_categories(new_categories) df["col3"] diff --git a/doc/source/whatsnew/v0.19.0.rst b/doc/source/whatsnew/v0.19.0.rst index 113bbcf0a05bc..f2fdd23af1297 100644 --- a/doc/source/whatsnew/v0.19.0.rst +++ b/doc/source/whatsnew/v0.19.0.rst @@ -271,6 +271,7 @@ Individual columns can be parsed as a ``Categorical`` using a dict specification such as :func:`to_datetime`. .. ipython:: python + :okwarning: df = pd.read_csv(StringIO(data), dtype="category") df.dtypes diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 6e38024e02f36..f405e61db1879 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -775,6 +775,8 @@ Other Deprecations - Deprecated :meth:`Series.rank` returning an empty result when the dtype is non-numeric and ``numeric_only=True`` is provided; this will raise a ``TypeError`` in a future version (:issue:`47500`) - Deprecated argument ``errors`` for :meth:`Series.mask`, :meth:`Series.where`, :meth:`DataFrame.mask`, and :meth:`DataFrame.where` as ``errors`` had no effect on this methods (:issue:`47728`) - Deprecated arguments ``*args`` and ``**kwargs`` in :class:`Rolling`, :class:`Expanding`, and :class:`ExponentialMovingWindow` ops. 
(:issue:`47836`) +- Deprecated the ``inplace`` keyword in :meth:`Categorical.set_ordered`, :meth:`Categorical.as_ordered`, and :meth:`Categorical.as_unordered` (:issue:`37643`) +- Deprecated setting a categorical's categories with ``cat.categories = ['a', 'b', 'c']``; use :meth:`Categorical.rename_categories` instead (:issue:`37643`) - Deprecated unused arguments ``encoding`` and ``verbose`` in :meth:`Series.to_excel` and :meth:`DataFrame.to_excel` (:issue:`47912`) - Deprecated producing a single element when iterating over a :class:`DataFrameGroupBy` or a :class:`SeriesGroupBy` that has been grouped by a list of length 1; A tuple of length one will be returned instead (:issue:`42795`) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 2c3b7c2f2589d..127814dc58f4c 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -745,15 +745,14 @@ def categories(self) -> Index: @categories.setter def categories(self, categories) -> None: - new_dtype = CategoricalDtype(categories, ordered=self.ordered) - if self.dtype.categories is not None and len(self.dtype.categories) != len( - new_dtype.categories - ): - raise ValueError( - "new categories need to have the same number of " - "items as the old categories!" - ) - super().__init__(self._ndarray, new_dtype) + warn( + "Setting categories in-place is deprecated and will raise in a " + "future version. Use rename_categories instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) + + self._set_categories(categories) @property def ordered(self) -> Ordered: @@ -814,7 +813,7 @@ def _set_categories(self, categories, fastpath=False): ): raise ValueError( "new categories need to have the same number of " - "items than the old categories!" + "items as the old categories!" 
) super().__init__(self._ndarray, new_dtype) @@ -836,7 +835,9 @@ def _set_dtype(self, dtype: CategoricalDtype) -> Categorical: return type(self)(codes, dtype=dtype, fastpath=True) @overload - def set_ordered(self, value, *, inplace: Literal[False] = ...) -> Categorical: + def set_ordered( + self, value, *, inplace: NoDefault | Literal[False] = ... + ) -> Categorical: ... @overload @@ -848,7 +849,9 @@ def set_ordered(self, value, *, inplace: bool) -> Categorical | None: ... @deprecate_nonkeyword_arguments(version=None, allowed_args=["self", "value"]) - def set_ordered(self, value, inplace: bool = False) -> Categorical | None: + def set_ordered( + self, value, inplace: bool | NoDefault = no_default + ) -> Categorical | None: """ Set the ordered attribute to the boolean value. @@ -859,7 +862,22 @@ def set_ordered(self, value, inplace: bool = False) -> Categorical | None: inplace : bool, default False Whether or not to set the ordered attribute in-place or return a copy of this categorical with ordered set to the value. + + .. deprecated:: 1.5.0 + """ + if inplace is not no_default: + warn( + "The `inplace` parameter in pandas.Categorical." + "set_ordered is deprecated and will be removed in " + "a future version. setting ordered-ness on categories will always " + "return a new Categorical object.", + FutureWarning, + stacklevel=find_stack_level(), + ) + else: + inplace = False + inplace = validate_bool_kwarg(inplace, "inplace") new_dtype = CategoricalDtype(self.categories, ordered=value) cat = self if inplace else self.copy() @@ -869,7 +887,7 @@ def set_ordered(self, value, inplace: bool = False) -> Categorical | None: return None @overload - def as_ordered(self, *, inplace: Literal[False] = ...) -> Categorical: + def as_ordered(self, *, inplace: NoDefault | Literal[False] = ...) -> Categorical: ... @overload @@ -877,7 +895,7 @@ def as_ordered(self, *, inplace: Literal[True]) -> None: ... 
@deprecate_nonkeyword_arguments(version=None, allowed_args=["self"]) - def as_ordered(self, inplace: bool = False) -> Categorical | None: + def as_ordered(self, inplace: bool | NoDefault = no_default) -> Categorical | None: """ Set the Categorical to be ordered. @@ -887,16 +905,19 @@ def as_ordered(self, inplace: bool = False) -> Categorical | None: Whether or not to set the ordered attribute in-place or return a copy of this categorical with ordered set to True. + .. deprecated:: 1.5.0 + Returns ------- Categorical or None Ordered Categorical or None if ``inplace=True``. """ - inplace = validate_bool_kwarg(inplace, "inplace") + if inplace is not no_default: + inplace = validate_bool_kwarg(inplace, "inplace") return self.set_ordered(True, inplace=inplace) @overload - def as_unordered(self, *, inplace: Literal[False] = ...) -> Categorical: + def as_unordered(self, *, inplace: NoDefault | Literal[False] = ...) -> Categorical: ... @overload @@ -904,7 +925,9 @@ def as_unordered(self, *, inplace: Literal[True]) -> None: ... @deprecate_nonkeyword_arguments(version=None, allowed_args=["self"]) - def as_unordered(self, inplace: bool = False) -> Categorical | None: + def as_unordered( + self, inplace: bool | NoDefault = no_default + ) -> Categorical | None: """ Set the Categorical to be unordered. @@ -914,12 +937,15 @@ def as_unordered(self, inplace: bool = False) -> Categorical | None: Whether or not to set the ordered attribute in-place or return a copy of this categorical with ordered set to False. + .. deprecated:: 1.5.0 + Returns ------- Categorical or None Unordered Categorical or None if ``inplace=True``. 
""" - inplace = validate_bool_kwarg(inplace, "inplace") + if inplace is not no_default: + inplace = validate_bool_kwarg(inplace, "inplace") return self.set_ordered(False, inplace=inplace) def set_categories( @@ -1108,11 +1134,11 @@ def rename_categories( cat = self if inplace else self.copy() if is_dict_like(new_categories): - cat.categories = [new_categories.get(item, item) for item in cat.categories] + new_categories = [new_categories.get(item, item) for item in cat.categories] elif callable(new_categories): - cat.categories = [new_categories(item) for item in cat.categories] - else: - cat.categories = new_categories + new_categories = [new_categories(item) for item in cat.categories] + + cat._set_categories(new_categories) if not inplace: return cat return None diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 3daa6d837349e..290f976e319f7 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -1931,9 +1931,8 @@ def _do_convert_categoricals( categories = list(vl.values()) try: # Try to catch duplicate categories - # error: Incompatible types in assignment (expression has - # type "List[str]", variable has type "Index") - cat_data.categories = categories # type: ignore[assignment] + # TODO: if we get a non-copying rename_categories, use that + cat_data = cat_data.rename_categories(categories) except ValueError as err: vc = Series(categories).value_counts() repeated_cats = list(vc.index[vc > 1]) diff --git a/pandas/tests/arrays/categorical/test_analytics.py b/pandas/tests/arrays/categorical/test_analytics.py index 6b16ffffe9328..1a8dbe25c0b75 100644 --- a/pandas/tests/arrays/categorical/test_analytics.py +++ b/pandas/tests/arrays/categorical/test_analytics.py @@ -323,13 +323,22 @@ def test_validate_inplace_raises(self, value): f"received type {type(value).__name__}" ) with pytest.raises(ValueError, match=msg): - cat.set_ordered(value=True, inplace=value) + with tm.assert_produces_warning( + FutureWarning, match="Use rename_categories" + ): + 
cat.set_ordered(value=True, inplace=value) with pytest.raises(ValueError, match=msg): - cat.as_ordered(inplace=value) + with tm.assert_produces_warning( + FutureWarning, match="Use rename_categories" + ): + cat.as_ordered(inplace=value) with pytest.raises(ValueError, match=msg): - cat.as_unordered(inplace=value) + with tm.assert_produces_warning( + FutureWarning, match="Use rename_categories" + ): + cat.as_unordered(inplace=value) with pytest.raises(ValueError, match=msg): with tm.assert_produces_warning(FutureWarning): diff --git a/pandas/tests/arrays/categorical/test_api.py b/pandas/tests/arrays/categorical/test_api.py index 87b1bb88aeac3..3d5f1d3733254 100644 --- a/pandas/tests/arrays/categorical/test_api.py +++ b/pandas/tests/arrays/categorical/test_api.py @@ -34,22 +34,30 @@ def test_ordered_api(self): assert cat4.ordered def test_set_ordered(self): - + msg = ( + "The `inplace` parameter in pandas.Categorical.set_ordered is " + "deprecated and will be removed in a future version. setting " + "ordered-ness on categories will always return a new Categorical object" + ) cat = Categorical(["a", "b", "c", "a"], ordered=True) cat2 = cat.as_unordered() assert not cat2.ordered cat2 = cat.as_ordered() assert cat2.ordered - cat2.as_unordered(inplace=True) + with tm.assert_produces_warning(FutureWarning, match=msg): + cat2.as_unordered(inplace=True) assert not cat2.ordered - cat2.as_ordered(inplace=True) + with tm.assert_produces_warning(FutureWarning, match=msg): + cat2.as_ordered(inplace=True) assert cat2.ordered assert cat2.set_ordered(True).ordered assert not cat2.set_ordered(False).ordered - cat2.set_ordered(True, inplace=True) + with tm.assert_produces_warning(FutureWarning, match=msg): + cat2.set_ordered(True, inplace=True) assert cat2.ordered - cat2.set_ordered(False, inplace=True) + with tm.assert_produces_warning(FutureWarning, match=msg): + cat2.set_ordered(False, inplace=True) assert not cat2.ordered # removed in 0.19.0 diff --git 
a/pandas/tests/arrays/categorical/test_indexing.py b/pandas/tests/arrays/categorical/test_indexing.py index 940aa5ffff040..94e966642b925 100644 --- a/pandas/tests/arrays/categorical/test_indexing.py +++ b/pandas/tests/arrays/categorical/test_indexing.py @@ -194,7 +194,8 @@ def test_periodindex(self): def test_categories_assignments(self): cat = Categorical(["a", "b", "c", "a"]) exp = np.array([1, 2, 3, 1], dtype=np.int64) - cat.categories = [1, 2, 3] + with tm.assert_produces_warning(FutureWarning, match="Use rename_categories"): + cat.categories = [1, 2, 3] tm.assert_numpy_array_equal(cat.__array__(), exp) tm.assert_index_equal(cat.categories, Index([1, 2, 3])) @@ -216,8 +217,9 @@ def test_categories_assignments_wrong_length_raises(self, new_categories): "new categories need to have the same number of items " "as the old categories!" ) - with pytest.raises(ValueError, match=msg): - cat.categories = new_categories + with tm.assert_produces_warning(FutureWarning, match="Use rename_categories"): + with pytest.raises(ValueError, match=msg): + cat.categories = new_categories # Combinations of sorted/unique: @pytest.mark.parametrize( diff --git a/pandas/tests/series/accessors/test_cat_accessor.py b/pandas/tests/series/accessors/test_cat_accessor.py index 53158482fde46..750e84b8cde08 100644 --- a/pandas/tests/series/accessors/test_cat_accessor.py +++ b/pandas/tests/series/accessors/test_cat_accessor.py @@ -110,7 +110,8 @@ def test_categorical_delegations(self): ser = Series(Categorical(["a", "b", "c", "a"], ordered=True)) exp_categories = Index(["a", "b", "c"]) tm.assert_index_equal(ser.cat.categories, exp_categories) - ser.cat.categories = [1, 2, 3] + with tm.assert_produces_warning(FutureWarning, match="Use rename_categories"): + ser.cat.categories = [1, 2, 3] exp_categories = Index([1, 2, 3]) tm.assert_index_equal(ser.cat.categories, exp_categories) @@ -120,7 +121,8 @@ def test_categorical_delegations(self): assert ser.cat.ordered ser = ser.cat.as_unordered() assert 
not ser.cat.ordered - return_value = ser.cat.as_ordered(inplace=True) + with tm.assert_produces_warning(FutureWarning, match="The `inplace`"): + return_value = ser.cat.as_ordered(inplace=True) assert return_value is None assert ser.cat.ordered @@ -267,8 +269,10 @@ def test_set_categories_setitem(self): df = DataFrame({"Survived": [1, 0, 1], "Sex": [0, 1, 1]}, dtype="category") # change the dtype in-place - df["Survived"].cat.categories = ["No", "Yes"] - df["Sex"].cat.categories = ["female", "male"] + with tm.assert_produces_warning(FutureWarning, match="Use rename_categories"): + df["Survived"].cat.categories = ["No", "Yes"] + with tm.assert_produces_warning(FutureWarning, match="Use rename_categories"): + df["Sex"].cat.categories = ["female", "male"] # values should not be coerced to NaN assert list(df["Sex"]) == ["female", "male", "male"] diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index fc17f0b942d09..01fe6a529a86f 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -476,7 +476,8 @@ def test_categorical_sideeffects_free(self): cat = Categorical(["a", "b", "c", "a"]) s = Series(cat, copy=True) assert s.cat is not cat - s.cat.categories = [1, 2, 3] + with tm.assert_produces_warning(FutureWarning, match="Use rename_categories"): + s.cat.categories = [1, 2, 3] exp_s = np.array([1, 2, 3, 1], dtype=np.int64) exp_cat = np.array(["a", "b", "c", "a"], dtype=np.object_) tm.assert_numpy_array_equal(s.__array__(), exp_s) @@ -493,7 +494,8 @@ def test_categorical_sideeffects_free(self): cat = Categorical(["a", "b", "c", "a"]) s = Series(cat) assert s.values is cat - s.cat.categories = [1, 2, 3] + with tm.assert_produces_warning(FutureWarning, match="Use rename_categories"): + s.cat.categories = [1, 2, 3] exp_s = np.array([1, 2, 3, 1], dtype=np.int64) tm.assert_numpy_array_equal(s.__array__(), exp_s) tm.assert_numpy_array_equal(cat.__array__(), exp_s)