diff --git a/asv_bench/benchmarks/series_methods.py b/asv_bench/benchmarks/series_methods.py index 06c4589297cda..204393cbb76f2 100644 --- a/asv_bench/benchmarks/series_methods.py +++ b/asv_bench/benchmarks/series_methods.py @@ -386,4 +386,33 @@ def time_to_numpy_copy(self): self.ser.to_numpy(copy=True) +class Replace: + param_names = ["num_to_replace"] + params = [100, 1000] + + def setup(self, num_to_replace): + N = 1_000_000 + self.arr = np.random.randn(N) + self.arr1 = self.arr.copy() + np.random.shuffle(self.arr1) + self.ser = Series(self.arr) + + self.to_replace_list = np.random.choice(self.arr, num_to_replace) + self.values_list = np.random.choice(self.arr1, num_to_replace) + + self.replace_dict = dict(zip(self.to_replace_list, self.values_list)) + + def time_replace_dict(self, num_to_replace): + self.ser.replace(self.replace_dict) + + def peakmem_replace_dict(self, num_to_replace): + self.ser.replace(self.replace_dict) + + def time_replace_list(self, num_to_replace): + self.ser.replace(self.to_replace_list, self.values_list) + + def peakmem_replace_list(self, num_to_replace): + self.ser.replace(self.to_replace_list, self.values_list) + + from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 224604a8ff8d0..e3a66896343b2 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -1169,9 +1169,9 @@ Performance improvements - Fixed a reference leak in :func:`read_hdf` (:issue:`37441`) - Fixed a memory leak in :meth:`DataFrame.to_json` and :meth:`Series.to_json` when serializing datetimes and timedeltas (:issue:`40443`) - Decreased memory usage in many :class:`DataFrameGroupBy` methods (:issue:`51090`) +- Performance improvement in :meth:`DataFrame.replace` and :meth:`Series.replace` when using a large dict for ``to_replace`` (:issue:`6697`) - Memory improvement in :class:`StataReader` when reading seekable files (:issue:`48922`) - .. --------------------------------------------------------------------------- .. _whatsnew_200.bug_fixes: diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 35a7855b8240f..f13bbef3b51ff 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -722,23 +722,33 @@ def replace_list( if is_string_dtype(values.dtype): # Calculate the mask once, prior to the call of comp # in order to avoid repeating the same computations - mask = ~isna(values) - masks = [ - compare_or_regex_search(values, s[0], regex=regex, mask=mask) + na_mask = ~isna(values) + masks: Iterable[npt.NDArray[np.bool_]] = ( + extract_bool_array( + cast( + ArrayLike, + compare_or_regex_search( + values, s[0], regex=regex, mask=na_mask + ), + ) + ) for s in pairs - ] + ) else: # GH#38086 faster if we know we dont need to check for regex - masks = [missing.mask_missing(values, s[0]) for s in pairs] - - masks = [extract_bool_array(x) for x in masks] + masks = (missing.mask_missing(values, s[0]) for s in pairs) + # Materialize if inplace = True, since the masks can change + # as we replace + if inplace: + masks = list(masks) if using_cow and inplace: # TODO(CoW): Optimize rb = [self.copy()] else: rb = [self if inplace else self.copy()] - for i, (src, dest) in enumerate(pairs): + + for i, ((src, dest), mask) in enumerate(zip(pairs, masks)): convert = i == src_len # only convert once at the end new_rb: list[Block] = [] @@ -747,9 +757,9 @@ def replace_list( # where to index into the mask for blk_num, blk in enumerate(rb): if len(rb) == 1: - m = masks[i] + m = mask else: - mib = masks[i] + mib = mask assert not isinstance(mib, bool) m = mib[blk_num : blk_num + 1] @@ -759,7 +769,7 @@ def replace_list( result = blk._replace_coerce( to_replace=src, value=dest, - mask=m, # type: ignore[arg-type] + mask=m, inplace=inplace, regex=regex, ) diff --git a/pandas/tests/series/methods/test_replace.py b/pandas/tests/series/methods/test_replace.py index 44c224c120ea6..2880e3f3e85db 100644 --- a/pandas/tests/series/methods/test_replace.py +++ b/pandas/tests/series/methods/test_replace.py @@ -298,6 +298,19 @@ def test_replace2(self): assert (ser[6:10] == -1).all() assert (ser[20:30] == -1).all() + @pytest.mark.parametrize("inplace", [True, False]) + def test_replace_cascade(self, inplace): + # Test that replaced values are not replaced again + # GH #50778 + ser = pd.Series([1, 2, 3]) + expected = pd.Series([2, 3, 4]) + + res = ser.replace([1, 2, 3], [2, 3, 4], inplace=inplace) + if inplace: + tm.assert_series_equal(ser, expected) + else: + tm.assert_series_equal(res, expected) + def test_replace_with_dictlike_and_string_dtype(self, nullable_string_dtype): # GH 32621, GH#44940 ser = pd.Series(["one", "two", np.nan], dtype=nullable_string_dtype)