PERF: Use generator expression for Blocks.replace_list (#50778)

lithomas1 · web-flow · commit 4179a4409239 · 2023-02-28T17:15:02.000-05:00
* PERF: Use less memory in replace

* Fixes

* fix cascading case

* fix typing

* placate mypy

* add GH number

* whatsnew

* fixes
diff --git a/asv_bench/benchmarks/series_methods.py b/asv_bench/benchmarks/series_methods.py
@@ -386,4 +386,33 @@ def time_to_numpy_copy(self):
         self.ser.to_numpy(copy=True)
 
 
+class Replace:
+    param_names = ["num_to_replace"]
+    params = [100, 1000]
+
+    def setup(self, num_to_replace):
+        N = 1_000_000
+        self.arr = np.random.randn(N)
+        self.arr1 = self.arr.copy()
+        np.random.shuffle(self.arr1)
+        self.ser = Series(self.arr)
+
+        self.to_replace_list = np.random.choice(self.arr, num_to_replace)
+        self.values_list = np.random.choice(self.arr1, num_to_replace)
+
+        self.replace_dict = dict(zip(self.to_replace_list, self.values_list))
+
+    def time_replace_dict(self, num_to_replace):
+        self.ser.replace(self.replace_dict)
+
+    def peakmem_replace_dict(self, num_to_replace):
+        self.ser.replace(self.replace_dict)
+
+    def time_replace_list(self, num_to_replace):
+        self.ser.replace(self.to_replace_list, self.values_list)
+
+    def peakmem_replace_list(self, num_to_replace):
+        self.ser.replace(self.to_replace_list, self.values_list)
+
+
 from .pandas_vb_common import setup  # noqa: F401 isort:skip
diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst
@@ -1170,9 +1170,9 @@ Performance improvements
 - Fixed a reference leak in :func:`read_hdf` (:issue:`37441`)
 - Fixed a memory leak in :meth:`DataFrame.to_json` and :meth:`Series.to_json` when serializing datetimes and timedeltas (:issue:`40443`)
 - Decreased memory usage in many :class:`DataFrameGroupBy` methods (:issue:`51090`)
+- Performance improvement in :meth:`DataFrame.replace` and :meth:`Series.replace` when using a large dict for ``to_replace`` (:issue:`6697`)
 - Memory improvement in :class:`StataReader` when reading seekable files (:issue:`48922`)
 
-
 .. ---------------------------------------------------------------------------
 .. _whatsnew_200.bug_fixes:
 
diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py
@@ -724,24 +724,34 @@ def replace_list(
         if is_string_dtype(values.dtype):
             # Calculate the mask once, prior to the call of comp
             # in order to avoid repeating the same computations
-            mask = ~isna(values)
-            masks = [
-                compare_or_regex_search(values, s[0], regex=regex, mask=mask)
+            na_mask = ~isna(values)
+            masks: Iterable[npt.NDArray[np.bool_]] = (
+                extract_bool_array(
+                    cast(
+                        ArrayLike,
+                        compare_or_regex_search(
+                            values, s[0], regex=regex, mask=na_mask
+                        ),
+                    )
+                )
                 for s in pairs
-            ]
+            )
         else:
             # GH#38086 faster if we know we dont need to check for regex
-            masks = [missing.mask_missing(values, s[0]) for s in pairs]
-
-        masks = [extract_bool_array(x) for x in masks]
+            masks = (missing.mask_missing(values, s[0]) for s in pairs)
+        # Materialize the masks up front when inplace=True: replacing in place
+        # mutates the data the lazy masks are computed from (cascading replaces)
+        if inplace:
+            masks = list(masks)
 
         if using_cow and inplace:
             # Don't set up refs here, otherwise we will think that we have
             # references when we check again later
             rb = [self]
         else:
             rb = [self if inplace else self.copy()]
-        for i, (src, dest) in enumerate(pairs):
+
+        for i, ((src, dest), mask) in enumerate(zip(pairs, masks)):
             convert = i == src_len  # only convert once at the end
             new_rb: list[Block] = []
 
@@ -750,9 +760,9 @@ def replace_list(
             # where to index into the mask
             for blk_num, blk in enumerate(rb):
                 if len(rb) == 1:
-                    m = masks[i]
+                    m = mask
                 else:
-                    mib = masks[i]
+                    mib = mask
                     assert not isinstance(mib, bool)
                     m = mib[blk_num : blk_num + 1]
 
@@ -762,7 +772,7 @@ def replace_list(
                 result = blk._replace_coerce(
                     to_replace=src,
                     value=dest,
-                    mask=m,  # type: ignore[arg-type]
+                    mask=m,
                     inplace=inplace,
                     regex=regex,
                     using_cow=using_cow,
diff --git a/pandas/tests/series/methods/test_replace.py b/pandas/tests/series/methods/test_replace.py
@@ -298,6 +298,19 @@ def test_replace2(self):
         assert (ser[6:10] == -1).all()
         assert (ser[20:30] == -1).all()
 
+    @pytest.mark.parametrize("inplace", [True, False])
+    def test_replace_cascade(self, inplace):
+        # Test that replaced values are not replaced again
+        # GH #50778
+        ser = pd.Series([1, 2, 3])
+        expected = pd.Series([2, 3, 4])
+
+        res = ser.replace([1, 2, 3], [2, 3, 4], inplace=inplace)
+        if inplace:
+            tm.assert_series_equal(ser, expected)
+        else:
+            tm.assert_series_equal(res, expected)
+
     def test_replace_with_dictlike_and_string_dtype(self, nullable_string_dtype):
         # GH 32621, GH#44940
         ser = pd.Series(["one", "two", np.nan], dtype=nullable_string_dtype)