From a8718efe0a1cb390b5dd8c7f1bbaea6d2022ea35 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Fri, 13 Jan 2023 11:59:31 -0800 Subject: [PATCH 1/8] PERF: Use less memory in replace --- asv_bench/benchmarks/series_methods.py | 27 ++++++++++++++++++++++++++ pandas/core/internals/blocks.py | 21 ++++++++++++-------- 2 files changed, 40 insertions(+), 8 deletions(-) diff --git a/asv_bench/benchmarks/series_methods.py b/asv_bench/benchmarks/series_methods.py index a0dd52e9f17e4..e75120207b744 100644 --- a/asv_bench/benchmarks/series_methods.py +++ b/asv_bench/benchmarks/series_methods.py @@ -401,4 +401,31 @@ def time_to_numpy_copy(self): self.ser.to_numpy(copy=True) +class Replace: + + param_names = ["num_to_replace"] + params = [100, 10000, 1_000_000] + + def setup(self, num_to_replace): + N = 1_000_000 + self.arr = np.random.randn(N) + self.arr1 = self.arr.copy() + np.random.shuffle(self.arr1) + self.ser = Series(self.arr) + + self.to_replace_list = np.random.choice(self.arr, num_to_replace) + self.values_list = np.random.choice(self.arr1, num_to_replace) + + self.replace_dict = dict(zip(self.to_replace_list, self.values_list)) + + def time_replace_dict(self): + self.ser.replace(self.replace_dict) + + def peakmem_replace_dict(self): + self.ser.replace(self.replace_dict) + + def time_replace_list(self): + self.ser.replace(self.to_replace_list, self.values_list) + + from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index f7787aa52623b..54137f888f9b3 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -664,21 +664,26 @@ def replace_list( # Calculate the mask once, prior to the call of comp # in order to avoid repeating the same computations mask = ~isna(values) - masks = [ - compare_or_regex_search(values, s[0], regex=regex, mask=mask) + masks = ( + extract_bool_array( + compare_or_regex_search(values, s[0], regex=regex, mask=mask) + ) for s in pairs - ] + ) else: # GH#38086 faster if we know we dont need to check for regex - masks = [missing.mask_missing(values, s[0]) for s in pairs] + # masks = [missing.mask_missing(values, s[0]) for s in pairs] + masks = ( + extract_bool_array(missing.mask_missing(values, s[0])) for s in pairs + ) # error: Argument 1 to "extract_bool_array" has incompatible type # "Union[ExtensionArray, ndarray, bool]"; expected "Union[ExtensionArray, # ndarray]" - masks = [extract_bool_array(x) for x in masks] # type: ignore[arg-type] + # masks = [extract_bool_array(x) for x in masks] # type: ignore[arg-type] rb = [self if inplace else self.copy()] - for i, (src, dest) in enumerate(pairs): + for i, ((src, dest), mask) in enumerate(zip(pairs, masks)): convert = i == src_len # only convert once at the end new_rb: list[Block] = [] @@ -687,9 +692,9 @@ def replace_list( # where to index into the mask for blk_num, blk in enumerate(rb): if len(rb) == 1: - m = masks[i] + m = mask else: - mib = masks[i] + mib = mask assert not isinstance(mib, bool) m = mib[blk_num : blk_num + 1] From 167d78983bbcc9eac01b6118197af519be28348f Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Fri, 13 Jan 2023 15:29:27 -0800 Subject: [PATCH 2/8] Fixes --- asv_bench/benchmarks/series_methods.py | 11 +++++++---- pandas/core/internals/blocks.py | 6 ++---- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/asv_bench/benchmarks/series_methods.py b/asv_bench/benchmarks/series_methods.py index e75120207b744..424cba2c339e2 100644 --- a/asv_bench/benchmarks/series_methods.py +++ b/asv_bench/benchmarks/series_methods.py @@ -404,7 +404,7 @@ def time_to_numpy_copy(self): class Replace: param_names = ["num_to_replace"] - params = [100, 10000, 1_000_000] + params = [100, 1000] def setup(self, num_to_replace): N = 1_000_000 @@ -418,13 +418,16 @@ def setup(self, num_to_replace): self.replace_dict = dict(zip(self.to_replace_list, self.values_list)) - def time_replace_dict(self): + def time_replace_dict(self, num_to_replace): self.ser.replace(self.replace_dict) - def peakmem_replace_dict(self): + def peakmem_replace_dict(self, num_to_replace): self.ser.replace(self.replace_dict) - def time_replace_list(self): + def time_replace_list(self, num_to_replace): + self.ser.replace(self.to_replace_list, self.values_list) + + def peakmem_replace_list(self, num_to_replace): self.ser.replace(self.to_replace_list, self.values_list) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 54137f888f9b3..a81b53c6f01bd 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -663,16 +663,15 @@ def replace_list( if is_string_dtype(values.dtype): # Calculate the mask once, prior to the call of comp # in order to avoid repeating the same computations - mask = ~isna(values) + na_mask = ~isna(values) masks = ( extract_bool_array( - compare_or_regex_search(values, s[0], regex=regex, mask=mask) + compare_or_regex_search(values, s[0], regex=regex, mask=na_mask) ) for s in pairs ) else: # GH#38086 faster if we know we dont need to check for regex - # masks = [missing.mask_missing(values, s[0]) for s in pairs] masks = ( extract_bool_array(missing.mask_missing(values, s[0])) for s in pairs ) @@ -681,7 +680,6 @@ def replace_list( # "Union[ExtensionArray, ndarray, bool]"; expected "Union[ExtensionArray, # ndarray]" # masks = [extract_bool_array(x) for x in masks] # type: ignore[arg-type] - rb = [self if inplace else self.copy()] for i, ((src, dest), mask) in enumerate(zip(pairs, masks)): convert = i == src_len # only convert once at the end From 3902ad1a5c387d90bf48f6522b5e2a090f7937b9 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Thu, 2 Feb 2023 16:13:49 -0500 Subject: [PATCH 3/8] fix cascading case --- pandas/core/internals/blocks.py | 9 ++++++++- pandas/tests/series/methods/test_replace.py | 12 ++++++++++++ 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index a81b53c6f01bd..ce1166b2bcc4b 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -670,16 +670,23 @@ def replace_list( ) for s in pairs ) + # Materialize if inplace = True, since the masks can change + # as we replace + if inplace: + masks = list(masks) else: # GH#38086 faster if we know we dont need to check for regex masks = ( extract_bool_array(missing.mask_missing(values, s[0])) for s in pairs ) + # Materialize if inplace = True, since the masks can change + # as we replace + if inplace: + masks = list(masks) # error: Argument 1 to "extract_bool_array" has incompatible type # "Union[ExtensionArray, ndarray, bool]"; expected "Union[ExtensionArray, # ndarray]" - # masks = [extract_bool_array(x) for x in masks] # type: ignore[arg-type] rb = [self if inplace else self.copy()] for i, ((src, dest), mask) in enumerate(zip(pairs, masks)): convert = i == src_len # only convert once at the end diff --git a/pandas/tests/series/methods/test_replace.py b/pandas/tests/series/methods/test_replace.py index 1188c9452520c..77f24694f852b 100644 --- a/pandas/tests/series/methods/test_replace.py +++ b/pandas/tests/series/methods/test_replace.py @@ -298,6 +298,18 @@ def test_replace2(self): assert (ser[6:10] == -1).all() assert (ser[20:30] == -1).all() + @pytest.mark.parametrize("inplace", [True, False]) + def test_replace_cascade(self, inplace): + # Test that replaced values are not replaced again + ser = pd.Series([1, 2, 3]) + expected = pd.Series([2, 3, 4]) + + res = ser.replace([1, 2, 3], [2, 3, 4], inplace=inplace) + if inplace: + tm.assert_series_equal(ser, expected) + else: + tm.assert_series_equal(res, expected) + def test_replace_with_dictlike_and_string_dtype(self, nullable_string_dtype): # GH 32621, GH#44940 ser = pd.Series(["one", "two", np.nan], dtype=nullable_string_dtype) From f95d3d2b16e5e93381772670e6033326cbb73d54 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Thu, 2 Feb 2023 20:27:21 -0500 Subject: [PATCH 4/8] fix typing --- pandas/core/internals/blocks.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 56907cc87c3d2..be25bb42eaf49 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -669,29 +669,26 @@ def replace_list( # Calculate the mask once, prior to the call of comp # in order to avoid repeating the same computations na_mask = ~isna(values) - masks = ( + masks: Iterable[ndarray[Any, dtype[bool_]] = ( extract_bool_array( compare_or_regex_search(values, s[0], regex=regex, mask=na_mask) ) for s in pairs - ) + ) # type: ignore[arg-type] # Materialize if inplace = True, since the masks can change # as we replace if inplace: masks = list(masks) else: # GH#38086 faster if we know we dont need to check for regex - masks = ( + masks: Iterable[ndarray[Any, dtype[bool_]] = ( extract_bool_array(missing.mask_missing(values, s[0])) for s in pairs - ) + ) # type: ignore[arg-type] # Materialize if inplace = True, since the masks can change # as we replace if inplace: masks = list(masks) - # error: Argument 1 to "extract_bool_array" has incompatible type - # "Union[ExtensionArray, ndarray, bool]"; expected "Union[ExtensionArray, - # ndarray]" rb = [self if inplace else self.copy()] for i, ((src, dest), mask) in enumerate(zip(pairs, masks)): convert = i == src_len # only convert once at the end From 67ca2a196e2be5ad0fe46b986987e258590b7adf Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Thu, 2 Feb 2023 21:34:52 -0500 Subject: [PATCH 5/8] placate mypy --- pandas/core/internals/blocks.py | 29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index be25bb42eaf49..d0589cf94d04d 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -669,25 +669,24 @@ def replace_list( # Calculate the mask once, prior to the call of comp # in order to avoid repeating the same computations na_mask = ~isna(values) - masks: Iterable[ndarray[Any, dtype[bool_]] = ( + masks: Iterable[npt.NDArray[np.bool_]] = ( extract_bool_array( - compare_or_regex_search(values, s[0], regex=regex, mask=na_mask) + cast( + ArrayLike, + compare_or_regex_search( + values, s[0], regex=regex, mask=na_mask + ), + ) ) for s in pairs - ) # type: ignore[arg-type] - # Materialize if inplace = True, since the masks can change - # as we replace - if inplace: - masks = list(masks) + ) else: # GH#38086 faster if we know we dont need to check for regex - masks: Iterable[ndarray[Any, dtype[bool_]] = ( - extract_bool_array(missing.mask_missing(values, s[0])) for s in pairs - ) # type: ignore[arg-type] - # Materialize if inplace = True, since the masks can change - # as we replace - if inplace: - masks = list(masks) + masks = (missing.mask_missing(values, s[0]) for s in pairs) + # Materialize if inplace = True, since the masks can change + # as we replace + if inplace: + masks = list(masks) rb = [self if inplace else self.copy()] for i, ((src, dest), mask) in enumerate(zip(pairs, masks)): @@ -711,7 +710,7 @@ def replace_list( result = blk._replace_coerce( to_replace=src, value=dest, - mask=m, # type: ignore[arg-type] + mask=m, inplace=inplace, regex=regex, ) From 1eee094ed62c74e38faf3d3b46e31db9c52b43ed Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Wed, 22 Feb 2023 12:04:31 -0500 Subject: [PATCH 6/8] add GH number --- pandas/tests/series/methods/test_replace.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/series/methods/test_replace.py b/pandas/tests/series/methods/test_replace.py index 0d5ee5f060f9d..2880e3f3e85db 100644 --- a/pandas/tests/series/methods/test_replace.py +++ b/pandas/tests/series/methods/test_replace.py @@ -301,6 +301,7 @@ def test_replace2(self): @pytest.mark.parametrize("inplace", [True, False]) def test_replace_cascade(self, inplace): # Test that replaced values are not replaced again + # GH #50778 ser = pd.Series([1, 2, 3]) expected = pd.Series([2, 3, 4]) From a81611b28a9e22144e8a8e955daa32beee8f3a08 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Wed, 22 Feb 2023 17:43:11 -0500 Subject: [PATCH 7/8] whatsnew --- doc/source/whatsnew/v2.0.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 45a5d139349e9..d31f8083deb2f 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -1161,6 +1161,7 @@ Performance improvements - Fixed a reference leak in :func:`read_hdf` (:issue:`37441`) - Fixed a memory leak in :meth:`DataFrame.to_json` and :meth:`Series.to_json` when serializing datetimes and timedeltas (:issue:`40443`) - Decreased memory usage in many :class:`DataFrameGroupBy` methods (:issue:`51090`) +- Performance improvement in :meth:`DataFrame.replace` and :meth:`Series.replace` when using a large dict for ``to_replace`` .. --------------------------------------------------------------------------- .. _whatsnew_200.bug_fixes: From b2c3cad216dbc8cdc3297af2a0eac634040ead0c Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Wed, 22 Feb 2023 17:46:10 -0500 Subject: [PATCH 8/8] fixes --- doc/source/whatsnew/v2.0.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index d31f8083deb2f..64c486c2a1e73 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -1161,7 +1161,7 @@ Performance improvements - Fixed a reference leak in :func:`read_hdf` (:issue:`37441`) - Fixed a memory leak in :meth:`DataFrame.to_json` and :meth:`Series.to_json` when serializing datetimes and timedeltas (:issue:`40443`) - Decreased memory usage in many :class:`DataFrameGroupBy` methods (:issue:`51090`) -- Performance improvement in :meth:`DataFrame.replace` and :meth:`Series.replace` when using a large dict for ``to_replace`` +- Performance improvement in :meth:`DataFrame.replace` and :meth:`Series.replace` when using a large dict for ``to_replace`` (:issue:`6697`) .. --------------------------------------------------------------------------- .. _whatsnew_200.bug_fixes: