From 7f3a5453db63a476c16ffe356a588624329c344a Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Wed, 18 Jan 2023 23:36:23 +0100 Subject: [PATCH] Revert "BUG/PERF: Series.replace with dtype="category" (#49404)" This reverts commit a063af0e6d443c4b5826eed2102a6d3c988da9a0. --- doc/source/whatsnew/v2.0.0.rst | 3 -- pandas/core/arrays/categorical.py | 46 +++++++++++++------ pandas/core/internals/blocks.py | 16 ++----- .../tests/arrays/categorical/test_replace.py | 10 ---- 4 files changed, 37 insertions(+), 38 deletions(-) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 614832c5acd1b..7054d93457264 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -844,7 +844,6 @@ Performance improvements - Performance improvement in :func:`merge` and :meth:`DataFrame.join` when joining on a sorted :class:`MultiIndex` (:issue:`48504`) - Performance improvement in :func:`to_datetime` when parsing strings with timezone offsets (:issue:`50107`) - Performance improvement in :meth:`DataFrame.loc` and :meth:`Series.loc` for tuple-based indexing of a :class:`MultiIndex` (:issue:`48384`) -- Performance improvement for :meth:`Series.replace` with categorical dtype (:issue:`49404`) - Performance improvement for :meth:`MultiIndex.unique` (:issue:`48335`) - Performance improvement for :func:`concat` with extension array backed indexes (:issue:`49128`, :issue:`49178`) - Reduce memory usage of :meth:`DataFrame.to_pickle`/:meth:`Series.to_pickle` when using BZ2 or LZMA (:issue:`49068`) @@ -887,8 +886,6 @@ Bug fixes Categorical ^^^^^^^^^^^ - Bug in :meth:`Categorical.set_categories` losing dtype information (:issue:`48812`) -- Bug in :meth:`Series.replace` with categorical dtype when ``to_replace`` values overlap with new values (:issue:`49404`) -- Bug in :meth:`Series.replace` with categorical dtype losing nullable dtypes of underlying categories (:issue:`49404`) - Bug in :meth:`DataFrame.groupby` and :meth:`Series.groupby` would reorder categories when used as a grouper (:issue:`48749`) - Bug in :class:`Categorical` constructor when constructing from a :class:`Categorical` object and ``dtype="category"`` losing ordered-ness (:issue:`49309`) - diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 64fdc7949f96b..14f334d72dbb1 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -2273,24 +2273,42 @@ def isin(self, values) -> npt.NDArray[np.bool_]: return algorithms.isin(self.codes, code_values) def _replace(self, *, to_replace, value, inplace: bool = False): - from pandas import ( - Index, - Series, - ) - inplace = validate_bool_kwarg(inplace, "inplace") cat = self if inplace else self.copy() - ser = Series(cat.categories, copy=True) - ser = ser.replace(to_replace=to_replace, value=value) + # other cases, like if both to_replace and value are list-like or if + # to_replace is a dict, are handled separately in NDFrame + if not is_list_like(to_replace): + to_replace = [to_replace] + + categories = cat.categories.tolist() + removals = set() + for replace_value in to_replace: + if value == replace_value: + continue + if replace_value not in cat.categories: + continue + if isna(value): + removals.add(replace_value) + continue + + index = categories.index(replace_value) + + if value in cat.categories: + value_index = categories.index(value) + cat._codes[cat._codes == index] = value_index + removals.add(replace_value) + else: + categories[index] = value + cat._set_categories(categories) - all_values = Index(ser) - new_categories = Index(ser.dropna().drop_duplicates(keep="first")) - new_codes = recode_for_categories( - cat._codes, all_values, new_categories, copy=False - ) - new_dtype = CategoricalDtype(new_categories, ordered=self.dtype.ordered) - NDArrayBacked.__init__(cat, new_codes, new_dtype) + if len(removals): + new_categories = [c for c in categories if c not in removals] + new_dtype = CategoricalDtype(new_categories, ordered=self.dtype.ordered) + codes = recode_for_categories( + cat.codes, cat.categories, new_dtype.categories + ) + NDArrayBacked.__init__(cat, codes, new_dtype) if not inplace: return cat diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 6056cada27069..4bb4882574228 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -536,10 +536,12 @@ def replace( if isinstance(values, Categorical): # TODO: avoid special-casing - # GH49404 blk = self if inplace else self.copy() - values = cast(Categorical, blk.values) - values._replace(to_replace=to_replace, value=value, inplace=True) + # error: Item "ExtensionArray" of "Union[ndarray[Any, Any], + # ExtensionArray]" has no attribute "_replace" + blk.values._replace( # type: ignore[union-attr] + to_replace=to_replace, value=value, inplace=True + ) return [blk] if not self._can_hold_element(to_replace): @@ -649,14 +651,6 @@ def replace_list( """ values = self.values - if isinstance(values, Categorical): - # TODO: avoid special-casing - # GH49404 - blk = self if inplace else self.copy() - values = cast(Categorical, blk.values) - values._replace(to_replace=src_list, value=dest_list, inplace=True) - return [blk] - # Exclude anything that we know we won't contain pairs = [ (x, y) for x, y in zip(src_list, dest_list) if self._can_hold_element(x) diff --git a/pandas/tests/arrays/categorical/test_replace.py b/pandas/tests/arrays/categorical/test_replace.py index 62a7bf0673a16..a3ba420c84a17 100644 --- a/pandas/tests/arrays/categorical/test_replace.py +++ b/pandas/tests/arrays/categorical/test_replace.py @@ -21,8 +21,6 @@ ((5, 6), 2, [1, 2, 3], False), ([1], [2], [2, 2, 3], False), ([1, 4], [5, 2], [5, 2, 3], False), - # GH49404 - ([1, 2, 3], [2, 3, 4], [2, 3, 4], False), # check_categorical sorts categories, which crashes on mixed dtypes (3, "4", [1, 2, "4"], False), ([1, 2, "3"], "5", ["5", "5", 3], True), @@ -67,11 +65,3 @@ def test_replace_categorical(to_replace, value, result, expected_error_msg): pd.Series(cat).replace(to_replace, value, inplace=True) tm.assert_categorical_equal(cat, expected) - - -def test_replace_categorical_ea_dtype(): - # GH49404 - cat = Categorical(pd.array(["a", "b"], dtype="string")) - result = pd.Series(cat).replace(["a", "b"], ["c", pd.NA])._values - expected = Categorical(pd.array(["c", pd.NA], dtype="string")) - tm.assert_categorical_equal(result, expected)