From eeef9fc0b8949b44949642450a8efdcbcc814da9 Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Sun, 30 Oct 2022 21:12:38 -0400 Subject: [PATCH 1/3] refactor Categorical._replace --- doc/source/whatsnew/v2.0.0.rst | 3 ++ pandas/core/arrays/categorical.py | 46 ++++++------------- pandas/core/internals/blocks.py | 16 ++++--- .../tests/arrays/categorical/test_replace.py | 10 ++++ 4 files changed, 37 insertions(+), 38 deletions(-) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 5614b7a2c0846..19296587011da 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -298,6 +298,7 @@ Performance improvements - Performance improvement for :class:`DatetimeIndex` constructor passing a list (:issue:`48609`) - Performance improvement in :func:`merge` and :meth:`DataFrame.join` when joining on a sorted :class:`MultiIndex` (:issue:`48504`) - Performance improvement in :meth:`DataFrame.loc` and :meth:`Series.loc` for tuple-based indexing of a :class:`MultiIndex` (:issue:`48384`) +- Performance improvement for :meth:`Series.replace` with categorical dtype (:issue:`#####`) - Performance improvement for :meth:`MultiIndex.unique` (:issue:`48335`) - Performance improvement for :func:`concat` with extension array backed indexes (:issue:`49128`, :issue:`49178`) - Reduce memory usage of :meth:`DataFrame.to_pickle`/:meth:`Series.to_pickle` when using BZ2 or LZMA (:issue:`49068`) @@ -319,6 +320,8 @@ Bug fixes Categorical ^^^^^^^^^^^ - Bug in :meth:`Categorical.set_categories` losing dtype information (:issue:`48812`) +- Bug in :meth:`Series.replace` with categorical dtype when ``to_replace`` values overlap with new values (:issue:`#####`) +- Bug in :meth:`Series.replace` with categorical dtype losing nullable dtypes of underlying categories (:issue:`#####`) - Bug in :meth:`DataFrame.groupby` and :meth:`Series.groupby` would reorder categories when used as a grouper (:issue:`48749`) Datetimelike diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index becca2b668290..d3805dea40ae7 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -2286,42 +2286,24 @@ def isin(self, values) -> npt.NDArray[np.bool_]: return algorithms.isin(self.codes, code_values) def _replace(self, *, to_replace, value, inplace: bool = False): + from pandas import ( + Index, + Series, + ) + inplace = validate_bool_kwarg(inplace, "inplace") cat = self if inplace else self.copy() - # other cases, like if both to_replace and value are list-like or if - # to_replace is a dict, are handled separately in NDFrame - if not is_list_like(to_replace): - to_replace = [to_replace] - - categories = cat.categories.tolist() - removals = set() - for replace_value in to_replace: - if value == replace_value: - continue - if replace_value not in cat.categories: - continue - if isna(value): - removals.add(replace_value) - continue - - index = categories.index(replace_value) - - if value in cat.categories: - value_index = categories.index(value) - cat._codes[cat._codes == index] = value_index - removals.add(replace_value) - else: - categories[index] = value - cat._set_categories(categories) + ser = Series(cat.categories, copy=True) + ser.replace(to_replace=to_replace, value=value, inplace=True) - if len(removals): - new_categories = [c for c in categories if c not in removals] - new_dtype = CategoricalDtype(new_categories, ordered=self.dtype.ordered) - codes = recode_for_categories( - cat.codes, cat.categories, new_dtype.categories - ) - NDArrayBacked.__init__(cat, codes, new_dtype) + all_values = Index(ser) + new_categories = Index(ser.dropna().drop_duplicates(keep="first")) + new_codes = recode_for_categories( + cat._codes, all_values, new_categories, copy=False + ) + new_dtype = CategoricalDtype(new_categories, ordered=self.dtype.ordered) + NDArrayBacked.__init__(cat, new_codes, new_dtype) if not inplace: return cat diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index f944c74ac37fd..fec485c9a7993 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -575,13 +575,10 @@ def replace( values = self.values if isinstance(values, Categorical): - # TODO: avoid special-casing + # GH (TODO) blk = self if inplace else self.copy() - # error: Item "ExtensionArray" of "Union[ndarray[Any, Any], - # ExtensionArray]" has no attribute "_replace" - blk.values._replace( # type: ignore[union-attr] - to_replace=to_replace, value=value, inplace=True - ) + values = cast(Categorical, blk.values) + values._replace(to_replace=to_replace, value=value, inplace=True) return [blk] if not self._can_hold_element(to_replace): @@ -688,6 +685,13 @@ def replace_list( """ values = self.values + if isinstance(values, Categorical): + # GH (TODO) + blk = self if inplace else self.copy() + values = cast(Categorical, blk.values) + values._replace(to_replace=src_list, value=dest_list, inplace=True) + return [blk] + # Exclude anything that we know we won't contain pairs = [ (x, y) for x, y in zip(src_list, dest_list) if self._can_hold_element(x) diff --git a/pandas/tests/arrays/categorical/test_replace.py b/pandas/tests/arrays/categorical/test_replace.py index a3ba420c84a17..84db9d89b31cc 100644 --- a/pandas/tests/arrays/categorical/test_replace.py +++ b/pandas/tests/arrays/categorical/test_replace.py @@ -21,6 +21,8 @@ ((5, 6), 2, [1, 2, 3], False), ([1], [2], [2, 2, 3], False), ([1, 4], [5, 2], [5, 2, 3], False), + # GH # (TODO) + ([1, 2, 3], [2, 3, 4], [2, 3, 4], False), # check_categorical sorts categories, which crashes on mixed dtypes (3, "4", [1, 2, "4"], False), ([1, 2, "3"], "5", ["5", "5", 3], True), @@ -65,3 +67,11 @@ def test_replace_categorical(to_replace, value, result, expected_error_msg): pd.Series(cat).replace(to_replace, value, inplace=True) tm.assert_categorical_equal(cat, expected) + + +def test_replace_categorical_ea_dtype(): + # GH# + cat = Categorical(pd.array(["a", "b"], dtype="string")) + result = pd.Series(cat).replace(["a", "b"], ["c", pd.NA])._values + expected = Categorical(pd.array(["c", pd.NA], dtype="string")) + tm.assert_categorical_equal(result, expected) From 05c2025bec021cf7ad3c3843bd1397c8e290ed25 Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Sun, 30 Oct 2022 21:26:25 -0400 Subject: [PATCH 2/3] gh refs --- doc/source/whatsnew/v2.0.0.rst | 6 +++--- pandas/core/internals/blocks.py | 4 ++-- pandas/tests/arrays/categorical/test_replace.py | 4 ++-- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 19296587011da..8156196d33b70 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -298,7 +298,7 @@ Performance improvements - Performance improvement for :class:`DatetimeIndex` constructor passing a list (:issue:`48609`) - Performance improvement in :func:`merge` and :meth:`DataFrame.join` when joining on a sorted :class:`MultiIndex` (:issue:`48504`) - Performance improvement in :meth:`DataFrame.loc` and :meth:`Series.loc` for tuple-based indexing of a :class:`MultiIndex` (:issue:`48384`) -- Performance improvement for :meth:`Series.replace` with categorical dtype (:issue:`#####`) +- Performance improvement for :meth:`Series.replace` with categorical dtype (:issue:`49404`) - Performance improvement for :meth:`MultiIndex.unique` (:issue:`48335`) - Performance improvement for :func:`concat` with extension array backed indexes (:issue:`49128`, :issue:`49178`) - Reduce memory usage of :meth:`DataFrame.to_pickle`/:meth:`Series.to_pickle` when using BZ2 or LZMA (:issue:`49068`) @@ -320,8 +320,8 @@ Bug fixes Categorical ^^^^^^^^^^^ - Bug in :meth:`Categorical.set_categories` losing dtype information (:issue:`48812`) -- Bug in :meth:`Series.replace` with categorical dtype when ``to_replace`` values overlap with new values (:issue:`#####`) -- Bug in :meth:`Series.replace` with categorical dtype losing nullable dtypes of underlying categories (:issue:`#####`) +- Bug in :meth:`Series.replace` with categorical dtype when ``to_replace`` values overlap with new values (:issue:`49404`) +- Bug in :meth:`Series.replace` with categorical dtype losing nullable dtypes of underlying categories (:issue:`49404`) - Bug in :meth:`DataFrame.groupby` and :meth:`Series.groupby` would reorder categories when used as a grouper (:issue:`48749`) Datetimelike diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index fec485c9a7993..bc9b00fe07582 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -575,7 +575,7 @@ def replace( values = self.values if isinstance(values, Categorical): - # GH (TODO) + # GH49404 blk = self if inplace else self.copy() values = cast(Categorical, blk.values) values._replace(to_replace=to_replace, value=value, inplace=True) @@ -686,7 +686,7 @@ def replace_list( values = self.values if isinstance(values, Categorical): - # GH (TODO) + # GH49404 blk = self if inplace else self.copy() values = cast(Categorical, blk.values) values._replace(to_replace=src_list, value=dest_list, inplace=True) diff --git a/pandas/tests/arrays/categorical/test_replace.py b/pandas/tests/arrays/categorical/test_replace.py index 84db9d89b31cc..62a7bf0673a16 100644 --- a/pandas/tests/arrays/categorical/test_replace.py +++ b/pandas/tests/arrays/categorical/test_replace.py @@ -21,7 +21,7 @@ ((5, 6), 2, [1, 2, 3], False), ([1], [2], [2, 2, 3], False), ([1, 4], [5, 2], [5, 2, 3], False), - # GH # (TODO) + # GH49404 ([1, 2, 3], [2, 3, 4], [2, 3, 4], False), # check_categorical sorts categories, which crashes on mixed dtypes (3, "4", [1, 2, "4"], False), @@ -70,7 +70,7 @@ def test_replace_categorical(to_replace, value, result, expected_error_msg): def test_replace_categorical_ea_dtype(): - # GH# + # GH49404 cat = Categorical(pd.array(["a", "b"], dtype="string")) result = pd.Series(cat).replace(["a", "b"], ["c", pd.NA])._values expected = Categorical(pd.array(["c", pd.NA], dtype="string")) From b84fdfeae6cad512f4e8ad39dd2971f5bb1f956d Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Mon, 31 Oct 2022 19:56:39 -0400 Subject: [PATCH 3/3] cleanup --- pandas/core/arrays/categorical.py | 2 +- pandas/core/internals/blocks.py | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index d3805dea40ae7..5f769e5fd8467 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -2295,7 +2295,7 @@ def _replace(self, *, to_replace, value, inplace: bool = False): cat = self if inplace else self.copy() ser = Series(cat.categories, copy=True) - ser.replace(to_replace=to_replace, value=value, inplace=True) + ser = ser.replace(to_replace=to_replace, value=value) all_values = Index(ser) new_categories = Index(ser.dropna().drop_duplicates(keep="first")) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index e0d548fd98c0e..06fc70c0af2dd 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -574,6 +574,7 @@ def replace( values = self.values if isinstance(values, Categorical): + # TODO: avoid special-casing # GH49404 blk = self if inplace else self.copy() values = cast(Categorical, blk.values) @@ -688,6 +689,7 @@ def replace_list( values = self.values if isinstance(values, Categorical): + # TODO: avoid special-casing # GH49404 blk = self if inplace else self.copy() values = cast(Categorical, blk.values)