diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst
index cd012fe755337..b36d4b230573e 100644
--- a/doc/source/whatsnew/v1.0.0.rst
+++ b/doc/source/whatsnew/v1.0.0.rst
@@ -297,6 +297,7 @@ Categorical
 - Using date accessors on a categorical dtyped :class:`Series` of datetimes was not returning an object of the
   same type as if one used the :meth:`.str.` / :meth:`.dt.` on a :class:`Series` of that type. E.g. when accessing :meth:`Series.dt.tz_localize` on a
   :class:`Categorical` with duplicate entries, the accessor was skipping duplicates (:issue: `27952`)
+- Bug in :meth:`DataFrame.replace` and :meth:`Series.replace` that would give incorrect results on categorical data (:issue:`26988`)
 
 Datetimelike
 ^^^^^^^^^^^^
diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py
index 73d1db9bda8ed..4e3724ea58777 100644
--- a/pandas/core/arrays/categorical.py
+++ b/pandas/core/arrays/categorical.py
@@ -2471,6 +2471,56 @@ def isin(self, values):
         code_values = code_values[null_mask | (code_values >= 0)]
         return algorithms.isin(self.codes, code_values)
 
+    def replace(self, to_replace, value, inplace: bool = False):
+        """
+        Replace all instances of one value with another.
+
+        Parameters
+        ----------
+        to_replace : object
+            The value to be replaced.
+        value : object
+            The value to replace it with. A missing value (as judged by
+            ``isna``) removes the ``to_replace`` category instead.
+        inplace : bool, default False
+            Whether the operation is done in-place.
+
+        Returns
+        -------
+        Categorical or None
+            None if inplace is True, otherwise the new Categorical after
+            replacement.
+
+        Examples
+        --------
+        >>> s = pd.Categorical([1, 2, 1, 3])
+        >>> s.replace(1, 3)
+        [3, 2, 3, 3]
+        Categories (2, int64): [2, 3]
+        """
+        inplace = validate_bool_kwarg(inplace, "inplace")
+        cat = self if inplace else self.copy()
+        # Nothing to do unless ``to_replace`` is one of the categories.
+        if to_replace in cat.categories:
+            if isna(value):
+                cat.remove_categories(to_replace, inplace=True)
+            else:
+                categories = cat.categories.tolist()
+                index = categories.index(to_replace)
+                if value in cat.categories:
+                    # ``value`` is already a category: repoint the codes of
+                    # ``to_replace`` at it, then remove the unused category.
+                    value_index = categories.index(value)
+                    cat._codes[cat._codes == index] = value_index
+                    cat.remove_categories(to_replace, inplace=True)
+                else:
+                    # ``value`` is new: rename the category itself; the codes
+                    # are not modified in this branch.
+                    categories[index] = value
+                    cat.rename_categories(categories, inplace=True)
+        if not inplace:
+            return cat
+
 
 # The Series.cat accessor
 
diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py
index 5508cf3ca522e..38e80525b3d13 100644
--- a/pandas/core/internals/blocks.py
+++ b/pandas/core/internals/blocks.py
@@ -2924,6 +2924,39 @@ def where(
             )
         return result
 
+    def replace(
+        self,
+        to_replace,
+        value,
+        inplace: bool = False,
+        filter=None,
+        regex: bool = False,
+        convert: bool = True,
+    ):
+        """
+        Replace ``to_replace`` with ``value`` in this categorical block.
+
+        When ``filter`` is None the work is delegated to ``replace`` on the
+        block's values; otherwise ``value`` is registered as a category (if
+        it is not one already) and the parent block implementation is used.
+        """
+        inplace = validate_bool_kwarg(inplace, "inplace")
+        result = self if inplace else self.copy()
+        if filter is None:  # replace was called on a series
+            result.values.replace(to_replace, value, inplace=True)
+            if convert:
+                return result.convert(numeric=False, copy=not inplace)
+            else:
+                return result
+        else:  # replace was called on a DataFrame
+            # add_categories raises ValueError for a value that is already a
+            # category, so only register values that are genuinely new.
+            if not isna(value) and value not in result.values.categories:
+                result.values.add_categories(value, inplace=True)
+            return super(CategoricalBlock, result).replace(
+                to_replace, value, inplace, filter, regex, convert
+            )
+
 
 # -----------------------------------------------------------------
 # Constructor Helpers
diff --git a/pandas/tests/arrays/categorical/test_algos.py b/pandas/tests/arrays/categorical/test_algos.py
index 1a48ccf85f947..e076015c5f61d 100644
--- a/pandas/tests/arrays/categorical/test_algos.py
+++ b/pandas/tests/arrays/categorical/test_algos.py
@@ -59,6 +59,24 @@ def test_isin_cats():
     tm.assert_numpy_array_equal(expected, result)
 
 
+@pytest.mark.parametrize(
+    "to_replace, value, result",
+    [("b", "c", ["a", "c"]), ("c", "d", ["a", "b"]), ("b", None, ["a", None])],
+)
+def test_replace(to_replace, value, result):
+    # GH 26988
+    cat = pd.Categorical(["a", "b"])
+    expected = pd.Categorical(result)
+    result = cat.replace(to_replace, value)
+    tm.assert_categorical_equal(result, expected)
+    if to_replace == "b":  # the "c" test is supposed to be unchanged
+        with pytest.raises(AssertionError):
+            # ensure non-inplace call does not affect original
+            tm.assert_categorical_equal(cat, expected)
+    cat.replace(to_replace, value, inplace=True)
+    tm.assert_categorical_equal(cat, expected)
+
+
 @pytest.mark.parametrize("empty", [[], pd.Series(), np.array([])])
 def test_isin_empty(empty):
     s = pd.Categorical(["a", "b"])
diff --git a/pandas/tests/frame/test_replace.py b/pandas/tests/frame/test_replace.py
index c30efa121262f..60b601b57e007 100644
--- a/pandas/tests/frame/test_replace.py
+++ b/pandas/tests/frame/test_replace.py
@@ -1296,6 +1296,24 @@ def test_replace_method(self, to_replace, method, expected):
         expected = DataFrame(expected)
         tm.assert_frame_equal(result, expected)
 
+    @pytest.mark.parametrize(
+        "replace_dict, final_data",
+        [({"a": 1, "b": 1}, [[3, 3], [2, 2]]), ({"a": 1, "b": 2}, [[3, 1], [2, 3]])],
+    )
+    def test_categorical_replace_with_dict(self, replace_dict, final_data):
+        # GH 26988
+        df = DataFrame([[1, 1], [2, 2]], columns=["a", "b"], dtype="category")
+        expected = DataFrame(final_data, columns=["a", "b"], dtype="category")
+        expected["a"] = expected["a"].cat.set_categories([1, 2, 3])
+        expected["b"] = expected["b"].cat.set_categories([1, 2, 3])
+        result = df.replace(replace_dict, 3)
+        tm.assert_frame_equal(result, expected)
+        with pytest.raises(AssertionError):
+            # ensure non-inplace call does not affect original
+            tm.assert_frame_equal(df, expected)
+        df.replace(replace_dict, 3, inplace=True)
+        tm.assert_frame_equal(df, expected)
+
     @pytest.mark.parametrize(
         "df, to_replace, exp",
         [
diff --git a/pandas/tests/series/test_replace.py b/pandas/tests/series/test_replace.py
index ebfd468e034f9..8018ecf03960c 100644
--- a/pandas/tests/series/test_replace.py
+++ b/pandas/tests/series/test_replace.py
@@ -293,6 +293,29 @@ def test_replace_categorical(self, categorical, numeric):
         expected = pd.Series(numeric)
         tm.assert_series_equal(expected, result, check_dtype=False)
 
+    def test_replace_categorical_single(self):
+        # GH 26988
+        dti = pd.date_range("2016-01-01", periods=3, tz="US/Pacific")
+        s = pd.Series(dti)
+        c = s.astype("category")
+
+        expected = c.copy()
+        expected = expected.cat.add_categories("foo")
+        expected[2] = "foo"
+        expected = expected.cat.remove_unused_categories()
+        assert c[2] != "foo"
+
+        result = c.replace(c[2], "foo")
+        tm.assert_series_equal(expected, result)
+        assert c[2] != "foo"  # ensure non-inplace call does not alter original
+
+        c.replace(c[2], "foo", inplace=True)
+        tm.assert_series_equal(expected, c)
+
+        first_value = c[0]
+        c.replace(c[1], c[0], inplace=True)
+        assert c[0] == c[1] == first_value  # test replacing with existing value
+
     def test_replace_with_no_overflowerror(self):
         # GH 25616
         # casts to object without Exception from OverflowError