Skip to content

Commit 4179a44

Browse files
authored
PERF: Use generator expression for Blocks.replace_list (#50778)
* PERF: Use less memory in replace
* Fixes
* fix cascading case
* fix typing
* placate mypy
* add GH number
* whatsnew
* fixes
1 parent 3b632d9 commit 4179a44

File tree

4 files changed

+64
-12
lines changed

4 files changed

+64
-12
lines changed

asv_bench/benchmarks/series_methods.py

+29
Original file line numberDiff line numberDiff line change
@@ -386,4 +386,33 @@ def time_to_numpy_copy(self):
386386
self.ser.to_numpy(copy=True)
387387

388388

389+
class Replace:
390+
param_names = ["num_to_replace"]
391+
params = [100, 1000]
392+
393+
def setup(self, num_to_replace):
394+
N = 1_000_000
395+
self.arr = np.random.randn(N)
396+
self.arr1 = self.arr.copy()
397+
np.random.shuffle(self.arr1)
398+
self.ser = Series(self.arr)
399+
400+
self.to_replace_list = np.random.choice(self.arr, num_to_replace)
401+
self.values_list = np.random.choice(self.arr1, num_to_replace)
402+
403+
self.replace_dict = dict(zip(self.to_replace_list, self.values_list))
404+
405+
def time_replace_dict(self, num_to_replace):
406+
self.ser.replace(self.replace_dict)
407+
408+
def peakmem_replace_dict(self, num_to_replace):
409+
self.ser.replace(self.replace_dict)
410+
411+
def time_replace_list(self, num_to_replace):
412+
self.ser.replace(self.to_replace_list, self.values_list)
413+
414+
def peakmem_replace_list(self, num_to_replace):
415+
self.ser.replace(self.to_replace_list, self.values_list)
416+
417+
389418
from .pandas_vb_common import setup # noqa: F401 isort:skip

doc/source/whatsnew/v2.0.0.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -1170,9 +1170,9 @@ Performance improvements
11701170
- Fixed a reference leak in :func:`read_hdf` (:issue:`37441`)
11711171
- Fixed a memory leak in :meth:`DataFrame.to_json` and :meth:`Series.to_json` when serializing datetimes and timedeltas (:issue:`40443`)
11721172
- Decreased memory usage in many :class:`DataFrameGroupBy` methods (:issue:`51090`)
1173+
- Performance improvement in :meth:`DataFrame.replace` and :meth:`Series.replace` when using a large dict for ``to_replace`` (:issue:`6697`)
11731174
- Memory improvement in :class:`StataReader` when reading seekable files (:issue:`48922`)
11741175

1175-
11761176
.. ---------------------------------------------------------------------------
11771177
.. _whatsnew_200.bug_fixes:
11781178

pandas/core/internals/blocks.py

+21-11
Original file line numberDiff line numberDiff line change
@@ -724,24 +724,34 @@ def replace_list(
724724
if is_string_dtype(values.dtype):
725725
# Calculate the mask once, prior to the call of comp
726726
# in order to avoid repeating the same computations
727-
mask = ~isna(values)
728-
masks = [
729-
compare_or_regex_search(values, s[0], regex=regex, mask=mask)
727+
na_mask = ~isna(values)
728+
masks: Iterable[npt.NDArray[np.bool_]] = (
729+
extract_bool_array(
730+
cast(
731+
ArrayLike,
732+
compare_or_regex_search(
733+
values, s[0], regex=regex, mask=na_mask
734+
),
735+
)
736+
)
730737
for s in pairs
731-
]
738+
)
732739
else:
733740
# GH#38086 faster if we know we don't need to check for regex
734-
masks = [missing.mask_missing(values, s[0]) for s in pairs]
735-
736-
masks = [extract_bool_array(x) for x in masks]
741+
masks = (missing.mask_missing(values, s[0]) for s in pairs)
742+
# Materialize the masks up front if inplace=True: the generator is lazy, so a
743+
# mask computed later would see values already changed by earlier replacements
744+
if inplace:
745+
masks = list(masks)
737746

738747
if using_cow and inplace:
739748
# Don't set up refs here, otherwise we will think that we have
740749
# references when we check again later
741750
rb = [self]
742751
else:
743752
rb = [self if inplace else self.copy()]
744-
for i, (src, dest) in enumerate(pairs):
753+
754+
for i, ((src, dest), mask) in enumerate(zip(pairs, masks)):
745755
convert = i == src_len # only convert once at the end
746756
new_rb: list[Block] = []
747757

@@ -750,9 +760,9 @@ def replace_list(
750760
# where to index into the mask
751761
for blk_num, blk in enumerate(rb):
752762
if len(rb) == 1:
753-
m = masks[i]
763+
m = mask
754764
else:
755-
mib = masks[i]
765+
mib = mask
756766
assert not isinstance(mib, bool)
757767
m = mib[blk_num : blk_num + 1]
758768

@@ -762,7 +772,7 @@ def replace_list(
762772
result = blk._replace_coerce(
763773
to_replace=src,
764774
value=dest,
765-
mask=m, # type: ignore[arg-type]
775+
mask=m,
766776
inplace=inplace,
767777
regex=regex,
768778
using_cow=using_cow,

pandas/tests/series/methods/test_replace.py

+13
Original file line numberDiff line numberDiff line change
@@ -298,6 +298,19 @@ def test_replace2(self):
298298
assert (ser[6:10] == -1).all()
299299
assert (ser[20:30] == -1).all()
300300

301+
@pytest.mark.parametrize("inplace", [True, False])
302+
def test_replace_cascade(self, inplace):
303+
# Test that replaced values are not replaced again
304+
# GH #50778
305+
ser = pd.Series([1, 2, 3])
306+
expected = pd.Series([2, 3, 4])
307+
308+
res = ser.replace([1, 2, 3], [2, 3, 4], inplace=inplace)
309+
if inplace:
310+
tm.assert_series_equal(ser, expected)
311+
else:
312+
tm.assert_series_equal(res, expected)
313+
301314
def test_replace_with_dictlike_and_string_dtype(self, nullable_string_dtype):
302315
# GH 32621, GH#44940
303316
ser = pd.Series(["one", "two", np.nan], dtype=nullable_string_dtype)

0 commit comments

Comments
 (0)