From a8718efe0a1cb390b5dd8c7f1bbaea6d2022ea35 Mon Sep 17 00:00:00 2001
From: Thomas Li <47963215+lithomas1@users.noreply.github.com>
Date: Fri, 13 Jan 2023 11:59:31 -0800
Subject: [PATCH 1/8] PERF: Use less memory in replace

---
 asv_bench/benchmarks/series_methods.py | 27 ++++++++++++++++++++++++++
 pandas/core/internals/blocks.py        | 21 ++++++++++++--------
 2 files changed, 40 insertions(+), 8 deletions(-)

diff --git a/asv_bench/benchmarks/series_methods.py b/asv_bench/benchmarks/series_methods.py
index a0dd52e9f17e4..e75120207b744 100644
--- a/asv_bench/benchmarks/series_methods.py
+++ b/asv_bench/benchmarks/series_methods.py
@@ -401,4 +401,31 @@ def time_to_numpy_copy(self):
         self.ser.to_numpy(copy=True)
 
 
+class Replace:
+
+    param_names = ["num_to_replace"]
+    params = [100, 10000, 1_000_000]
+
+    def setup(self, num_to_replace):
+        N = 1_000_000
+        self.arr = np.random.randn(N)
+        self.arr1 = self.arr.copy()
+        np.random.shuffle(self.arr1)
+        self.ser = Series(self.arr)
+
+        self.to_replace_list = np.random.choice(self.arr, num_to_replace)
+        self.values_list = np.random.choice(self.arr1, num_to_replace)
+
+        self.replace_dict = dict(zip(self.to_replace_list, self.values_list))
+
+    def time_replace_dict(self):
+        self.ser.replace(self.replace_dict)
+
+    def peakmem_replace_dict(self):
+        self.ser.replace(self.replace_dict)
+
+    def time_replace_list(self):
+        self.ser.replace(self.to_replace_list, self.values_list)
+
+
 from .pandas_vb_common import setup  # noqa: F401 isort:skip
diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py
index f7787aa52623b..54137f888f9b3 100644
--- a/pandas/core/internals/blocks.py
+++ b/pandas/core/internals/blocks.py
@@ -664,21 +664,26 @@ def replace_list(
             # Calculate the mask once, prior to the call of comp
             # in order to avoid repeating the same computations
             mask = ~isna(values)
-            masks = [
-                compare_or_regex_search(values, s[0], regex=regex, mask=mask)
+            masks = (
+                extract_bool_array(
+                    compare_or_regex_search(values, s[0], regex=regex, mask=mask)
+                )
                 for s in pairs
-            ]
+            )
         else:
             # GH#38086 faster if we know we dont need to check for regex
-            masks = [missing.mask_missing(values, s[0]) for s in pairs]
+            # masks = [missing.mask_missing(values, s[0]) for s in pairs]
+            masks = (
+                extract_bool_array(missing.mask_missing(values, s[0])) for s in pairs
+            )
 
         # error: Argument 1 to "extract_bool_array" has incompatible type
         # "Union[ExtensionArray, ndarray, bool]"; expected "Union[ExtensionArray,
         # ndarray]"
-        masks = [extract_bool_array(x) for x in masks]  # type: ignore[arg-type]
+        # masks = [extract_bool_array(x) for x in masks]  # type: ignore[arg-type]
 
         rb = [self if inplace else self.copy()]
-        for i, (src, dest) in enumerate(pairs):
+        for i, ((src, dest), mask) in enumerate(zip(pairs, masks)):
             convert = i == src_len  # only convert once at the end
             new_rb: list[Block] = []
 
@@ -687,9 +692,9 @@ def replace_list(
             # where to index into the mask
             for blk_num, blk in enumerate(rb):
                 if len(rb) == 1:
-                    m = masks[i]
+                    m = mask
                 else:
-                    mib = masks[i]
+                    mib = mask
                     assert not isinstance(mib, bool)
                     m = mib[blk_num : blk_num + 1]
 

From 167d78983bbcc9eac01b6118197af519be28348f Mon Sep 17 00:00:00 2001
From: Thomas Li <47963215+lithomas1@users.noreply.github.com>
Date: Fri, 13 Jan 2023 15:29:27 -0800
Subject: [PATCH 2/8] Fix Replace benchmark signatures; rename mask to na_mask

---
 asv_bench/benchmarks/series_methods.py | 11 +++++++----
 pandas/core/internals/blocks.py        |  6 ++----
 2 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/asv_bench/benchmarks/series_methods.py b/asv_bench/benchmarks/series_methods.py
index e75120207b744..424cba2c339e2 100644
--- a/asv_bench/benchmarks/series_methods.py
+++ b/asv_bench/benchmarks/series_methods.py
@@ -404,7 +404,7 @@ def time_to_numpy_copy(self):
 class Replace:
 
     param_names = ["num_to_replace"]
-    params = [100, 10000, 1_000_000]
+    params = [100, 1000]
 
     def setup(self, num_to_replace):
         N = 1_000_000
@@ -418,13 +418,16 @@ def setup(self, num_to_replace):
 
         self.replace_dict = dict(zip(self.to_replace_list, self.values_list))
 
-    def time_replace_dict(self):
+    def time_replace_dict(self, num_to_replace):
         self.ser.replace(self.replace_dict)
 
-    def peakmem_replace_dict(self):
+    def peakmem_replace_dict(self, num_to_replace):
         self.ser.replace(self.replace_dict)
 
-    def time_replace_list(self):
+    def time_replace_list(self, num_to_replace):
+        self.ser.replace(self.to_replace_list, self.values_list)
+
+    def peakmem_replace_list(self, num_to_replace):
         self.ser.replace(self.to_replace_list, self.values_list)
 
 
diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py
index 54137f888f9b3..a81b53c6f01bd 100644
--- a/pandas/core/internals/blocks.py
+++ b/pandas/core/internals/blocks.py
@@ -663,16 +663,15 @@ def replace_list(
         if is_string_dtype(values.dtype):
             # Calculate the mask once, prior to the call of comp
             # in order to avoid repeating the same computations
-            mask = ~isna(values)
+            na_mask = ~isna(values)
             masks = (
                 extract_bool_array(
-                    compare_or_regex_search(values, s[0], regex=regex, mask=mask)
+                    compare_or_regex_search(values, s[0], regex=regex, mask=na_mask)
                 )
                 for s in pairs
             )
         else:
             # GH#38086 faster if we know we dont need to check for regex
-            # masks = [missing.mask_missing(values, s[0]) for s in pairs]
             masks = (
                 extract_bool_array(missing.mask_missing(values, s[0])) for s in pairs
             )
@@ -681,7 +680,6 @@ def replace_list(
         # "Union[ExtensionArray, ndarray, bool]"; expected "Union[ExtensionArray,
         # ndarray]"
         # masks = [extract_bool_array(x) for x in masks]  # type: ignore[arg-type]
-
         rb = [self if inplace else self.copy()]
         for i, ((src, dest), mask) in enumerate(zip(pairs, masks)):
             convert = i == src_len  # only convert once at the end

From 3902ad1a5c387d90bf48f6522b5e2a090f7937b9 Mon Sep 17 00:00:00 2001
From: Thomas Li <47963215+lithomas1@users.noreply.github.com>
Date: Thu, 2 Feb 2023 16:13:49 -0500
Subject: [PATCH 3/8] fix cascading case

---
 pandas/core/internals/blocks.py             |  9 ++++++++-
 pandas/tests/series/methods/test_replace.py | 12 ++++++++++++
 2 files changed, 20 insertions(+), 1 deletion(-)

diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py
index a81b53c6f01bd..ce1166b2bcc4b 100644
--- a/pandas/core/internals/blocks.py
+++ b/pandas/core/internals/blocks.py
@@ -670,16 +670,23 @@ def replace_list(
                 )
                 for s in pairs
             )
+            # Materialize if inplace = True, since the masks can change
+            # as we replace
+            if inplace:
+                masks = list(masks)
         else:
             # GH#38086 faster if we know we dont need to check for regex
             masks = (
                 extract_bool_array(missing.mask_missing(values, s[0])) for s in pairs
             )
+            # Materialize if inplace = True, since the masks can change
+            # as we replace
+            if inplace:
+                masks = list(masks)
 
         # error: Argument 1 to "extract_bool_array" has incompatible type
         # "Union[ExtensionArray, ndarray, bool]"; expected "Union[ExtensionArray,
         # ndarray]"
-        # masks = [extract_bool_array(x) for x in masks]  # type: ignore[arg-type]
         rb = [self if inplace else self.copy()]
         for i, ((src, dest), mask) in enumerate(zip(pairs, masks)):
             convert = i == src_len  # only convert once at the end
diff --git a/pandas/tests/series/methods/test_replace.py b/pandas/tests/series/methods/test_replace.py
index 1188c9452520c..77f24694f852b 100644
--- a/pandas/tests/series/methods/test_replace.py
+++ b/pandas/tests/series/methods/test_replace.py
@@ -298,6 +298,18 @@ def test_replace2(self):
         assert (ser[6:10] == -1).all()
         assert (ser[20:30] == -1).all()
 
+    @pytest.mark.parametrize("inplace", [True, False])
+    def test_replace_cascade(self, inplace):
+        # Test that replaced values are not replaced again
+        ser = pd.Series([1, 2, 3])
+        expected = pd.Series([2, 3, 4])
+
+        res = ser.replace([1, 2, 3], [2, 3, 4], inplace=inplace)
+        if inplace:
+            tm.assert_series_equal(ser, expected)
+        else:
+            tm.assert_series_equal(res, expected)
+
     def test_replace_with_dictlike_and_string_dtype(self, nullable_string_dtype):
         # GH 32621, GH#44940
         ser = pd.Series(["one", "two", np.nan], dtype=nullable_string_dtype)

From f95d3d2b16e5e93381772670e6033326cbb73d54 Mon Sep 17 00:00:00 2001
From: Thomas Li <47963215+lithomas1@users.noreply.github.com>
Date: Thu, 2 Feb 2023 20:27:21 -0500
Subject: [PATCH 4/8] fix typing

---
 pandas/core/internals/blocks.py | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py
index 56907cc87c3d2..be25bb42eaf49 100644
--- a/pandas/core/internals/blocks.py
+++ b/pandas/core/internals/blocks.py
@@ -669,29 +669,26 @@ def replace_list(
             # Calculate the mask once, prior to the call of comp
             # in order to avoid repeating the same computations
             na_mask = ~isna(values)
-            masks = (
+            masks: Iterable[ndarray[Any, dtype[bool_]] = (
                 extract_bool_array(
                     compare_or_regex_search(values, s[0], regex=regex, mask=na_mask)
                 )
                 for s in pairs
-            )
+            ) # type: ignore[arg-type]
             # Materialize if inplace = True, since the masks can change
             # as we replace
             if inplace:
                 masks = list(masks)
         else:
             # GH#38086 faster if we know we dont need to check for regex
-            masks = (
+            masks: Iterable[ndarray[Any, dtype[bool_]] = (
                 extract_bool_array(missing.mask_missing(values, s[0])) for s in pairs
-            )
+            ) # type: ignore[arg-type]
             # Materialize if inplace = True, since the masks can change
             # as we replace
             if inplace:
                 masks = list(masks)
 
-        # error: Argument 1 to "extract_bool_array" has incompatible type
-        # "Union[ExtensionArray, ndarray, bool]"; expected "Union[ExtensionArray,
-        # ndarray]"
         rb = [self if inplace else self.copy()]
         for i, ((src, dest), mask) in enumerate(zip(pairs, masks)):
             convert = i == src_len  # only convert once at the end

From 67ca2a196e2be5ad0fe46b986987e258590b7adf Mon Sep 17 00:00:00 2001
From: Thomas Li <47963215+lithomas1@users.noreply.github.com>
Date: Thu, 2 Feb 2023 21:34:52 -0500
Subject: [PATCH 5/8] placate mypy

---
 pandas/core/internals/blocks.py | 29 ++++++++++++++---------------
 1 file changed, 14 insertions(+), 15 deletions(-)

diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py
index be25bb42eaf49..d0589cf94d04d 100644
--- a/pandas/core/internals/blocks.py
+++ b/pandas/core/internals/blocks.py
@@ -669,25 +669,24 @@ def replace_list(
             # Calculate the mask once, prior to the call of comp
             # in order to avoid repeating the same computations
             na_mask = ~isna(values)
-            masks: Iterable[ndarray[Any, dtype[bool_]] = (
+            masks: Iterable[npt.NDArray[np.bool_]] = (
                 extract_bool_array(
-                    compare_or_regex_search(values, s[0], regex=regex, mask=na_mask)
+                    cast(
+                        ArrayLike,
+                        compare_or_regex_search(
+                            values, s[0], regex=regex, mask=na_mask
+                        ),
+                    )
                 )
                 for s in pairs
-            ) # type: ignore[arg-type]
-            # Materialize if inplace = True, since the masks can change
-            # as we replace
-            if inplace:
-                masks = list(masks)
+            )
         else:
             # GH#38086 faster if we know we dont need to check for regex
-            masks: Iterable[ndarray[Any, dtype[bool_]] = (
-                extract_bool_array(missing.mask_missing(values, s[0])) for s in pairs
-            ) # type: ignore[arg-type]
-            # Materialize if inplace = True, since the masks can change
-            # as we replace
-            if inplace:
-                masks = list(masks)
+            masks = (missing.mask_missing(values, s[0]) for s in pairs)
+        # Materialize if inplace = True, since the masks can change
+        # as we replace
+        if inplace:
+            masks = list(masks)
 
         rb = [self if inplace else self.copy()]
         for i, ((src, dest), mask) in enumerate(zip(pairs, masks)):
@@ -711,7 +710,7 @@ def replace_list(
                 result = blk._replace_coerce(
                     to_replace=src,
                     value=dest,
-                    mask=m,  # type: ignore[arg-type]
+                    mask=m,
                     inplace=inplace,
                     regex=regex,
                 )

From 1eee094ed62c74e38faf3d3b46e31db9c52b43ed Mon Sep 17 00:00:00 2001
From: Thomas Li <47963215+lithomas1@users.noreply.github.com>
Date: Wed, 22 Feb 2023 12:04:31 -0500
Subject: [PATCH 6/8] add GH number

---
 pandas/tests/series/methods/test_replace.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pandas/tests/series/methods/test_replace.py b/pandas/tests/series/methods/test_replace.py
index 0d5ee5f060f9d..2880e3f3e85db 100644
--- a/pandas/tests/series/methods/test_replace.py
+++ b/pandas/tests/series/methods/test_replace.py
@@ -301,6 +301,7 @@ def test_replace2(self):
     @pytest.mark.parametrize("inplace", [True, False])
     def test_replace_cascade(self, inplace):
         # Test that replaced values are not replaced again
+        # GH #50778
         ser = pd.Series([1, 2, 3])
         expected = pd.Series([2, 3, 4])
 

From a81611b28a9e22144e8a8e955daa32beee8f3a08 Mon Sep 17 00:00:00 2001
From: Thomas Li <47963215+lithomas1@users.noreply.github.com>
Date: Wed, 22 Feb 2023 17:43:11 -0500
Subject: [PATCH 7/8] whatsnew

---
 doc/source/whatsnew/v2.0.0.rst | 1 +
 1 file changed, 1 insertion(+)

diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst
index 45a5d139349e9..d31f8083deb2f 100644
--- a/doc/source/whatsnew/v2.0.0.rst
+++ b/doc/source/whatsnew/v2.0.0.rst
@@ -1161,6 +1161,7 @@ Performance improvements
 - Fixed a reference leak in :func:`read_hdf` (:issue:`37441`)
 - Fixed a memory leak in :meth:`DataFrame.to_json` and :meth:`Series.to_json` when serializing datetimes and timedeltas (:issue:`40443`)
 - Decreased memory usage in many :class:`DataFrameGroupBy` methods (:issue:`51090`)
+- Performance improvement in :meth:`DataFrame.replace` and :meth:`Series.replace` when using a large dict for ``to_replace``
 
 .. ---------------------------------------------------------------------------
 .. _whatsnew_200.bug_fixes:

From b2c3cad216dbc8cdc3297af2a0eac634040ead0c Mon Sep 17 00:00:00 2001
From: Thomas Li <47963215+lithomas1@users.noreply.github.com>
Date: Wed, 22 Feb 2023 17:46:10 -0500
Subject: [PATCH 8/8] Add issue reference to whatsnew entry

---
 doc/source/whatsnew/v2.0.0.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst
index d31f8083deb2f..64c486c2a1e73 100644
--- a/doc/source/whatsnew/v2.0.0.rst
+++ b/doc/source/whatsnew/v2.0.0.rst
@@ -1161,7 +1161,7 @@ Performance improvements
 - Fixed a reference leak in :func:`read_hdf` (:issue:`37441`)
 - Fixed a memory leak in :meth:`DataFrame.to_json` and :meth:`Series.to_json` when serializing datetimes and timedeltas (:issue:`40443`)
 - Decreased memory usage in many :class:`DataFrameGroupBy` methods (:issue:`51090`)
-- Performance improvement in :meth:`DataFrame.replace` and :meth:`Series.replace` when using a large dict for ``to_replace``
+- Performance improvement in :meth:`DataFrame.replace` and :meth:`Series.replace` when using a large dict for ``to_replace`` (:issue:`6697`)
 
 .. ---------------------------------------------------------------------------
 .. _whatsnew_200.bug_fixes: