Skip to content

Commit f69f970

Browse files
lithomas1 and meeseeksmachine
authored and committed
Backport PR pandas-dev#50778: PERF: Use generator expression for Blocks.replace_list
1 parent 3c5fb2e commit f69f970

File tree

4 files changed

+64
-12
lines changed

4 files changed

+64
-12
lines changed

asv_bench/benchmarks/series_methods.py

+29
Original file line numberDiff line numberDiff line change
@@ -386,4 +386,33 @@ def time_to_numpy_copy(self):
386386
self.ser.to_numpy(copy=True)
387387

388388

389+
class Replace:
    """Benchmarks for ``Series.replace`` with dict and list-like arguments.

    Each benchmark is parametrized by ``num_to_replace``, the number of
    distinct values that are looked up and replaced in a float Series of
    one million elements.
    """

    param_names = ["num_to_replace"]
    params = [100, 1000]

    def setup(self, num_to_replace):
        """Build the Series and the replacement inputs shared by all benchmarks.

        Parameters
        ----------
        num_to_replace : int
            Number of distinct values to replace.
        """
        N = 1_000_000
        self.arr = np.random.randn(N)
        self.arr1 = self.arr.copy()
        np.random.shuffle(self.arr1)
        self.ser = Series(self.arr)

        # Sample the keys WITHOUT replacement: drawing with replacement can
        # pick the same element twice, which would leave ``replace_dict``
        # with fewer than ``num_to_replace`` entries and make the dict and
        # list benchmarks operate on different workloads.
        self.to_replace_list = np.random.choice(
            self.arr, num_to_replace, replace=False
        )
        # Replacement values may repeat; only the keys need to be unique.
        self.values_list = np.random.choice(self.arr1, num_to_replace)

        self.replace_dict = dict(zip(self.to_replace_list, self.values_list))

    def time_replace_dict(self, num_to_replace):
        """Time ``Series.replace`` called with a dict mapping."""
        self.ser.replace(self.replace_dict)

    def peakmem_replace_dict(self, num_to_replace):
        """Peak-memory counterpart of ``time_replace_dict``."""
        self.ser.replace(self.replace_dict)

    def time_replace_list(self, num_to_replace):
        """Time ``Series.replace`` called with parallel key/value arrays."""
        self.ser.replace(self.to_replace_list, self.values_list)

    def peakmem_replace_list(self, num_to_replace):
        """Peak-memory counterpart of ``time_replace_list``."""
        self.ser.replace(self.to_replace_list, self.values_list)
417+
389418
from .pandas_vb_common import setup # noqa: F401 isort:skip

doc/source/whatsnew/v2.0.0.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -1169,9 +1169,9 @@ Performance improvements
11691169
- Fixed a reference leak in :func:`read_hdf` (:issue:`37441`)
11701170
- Fixed a memory leak in :meth:`DataFrame.to_json` and :meth:`Series.to_json` when serializing datetimes and timedeltas (:issue:`40443`)
11711171
- Decreased memory usage in many :class:`DataFrameGroupBy` methods (:issue:`51090`)
1172+
- Performance improvement in :meth:`DataFrame.replace` and :meth:`Series.replace` when using a large dict for ``to_replace`` (:issue:`6697`)
11721173
- Memory improvement in :class:`StataReader` when reading seekable files (:issue:`48922`)
11731174

1174-
11751175
.. ---------------------------------------------------------------------------
11761176
.. _whatsnew_200.bug_fixes:
11771177

pandas/core/internals/blocks.py

+21-11
Original file line numberDiff line numberDiff line change
@@ -723,24 +723,34 @@ def replace_list(
723723
if is_string_dtype(values.dtype):
724724
# Calculate the mask once, prior to the call of comp
725725
# in order to avoid repeating the same computations
726-
mask = ~isna(values)
727-
masks = [
728-
compare_or_regex_search(values, s[0], regex=regex, mask=mask)
726+
na_mask = ~isna(values)
727+
masks: Iterable[npt.NDArray[np.bool_]] = (
728+
extract_bool_array(
729+
cast(
730+
ArrayLike,
731+
compare_or_regex_search(
732+
values, s[0], regex=regex, mask=na_mask
733+
),
734+
)
735+
)
729736
for s in pairs
730-
]
737+
)
731738
else:
732739
# GH#38086 faster if we know we dont need to check for regex
733-
masks = [missing.mask_missing(values, s[0]) for s in pairs]
734-
735-
masks = [extract_bool_array(x) for x in masks]
740+
masks = (missing.mask_missing(values, s[0]) for s in pairs)
741+
# Materialize if inplace = True, since the masks can change
742+
# as we replace
743+
if inplace:
744+
masks = list(masks)
736745

737746
if using_cow and inplace:
738747
# Don't set up refs here, otherwise we will think that we have
739748
# references when we check again later
740749
rb = [self]
741750
else:
742751
rb = [self if inplace else self.copy()]
743-
for i, (src, dest) in enumerate(pairs):
752+
753+
for i, ((src, dest), mask) in enumerate(zip(pairs, masks)):
744754
convert = i == src_len # only convert once at the end
745755
new_rb: list[Block] = []
746756

@@ -749,9 +759,9 @@ def replace_list(
749759
# where to index into the mask
750760
for blk_num, blk in enumerate(rb):
751761
if len(rb) == 1:
752-
m = masks[i]
762+
m = mask
753763
else:
754-
mib = masks[i]
764+
mib = mask
755765
assert not isinstance(mib, bool)
756766
m = mib[blk_num : blk_num + 1]
757767

@@ -761,7 +771,7 @@ def replace_list(
761771
result = blk._replace_coerce(
762772
to_replace=src,
763773
value=dest,
764-
mask=m, # type: ignore[arg-type]
774+
mask=m,
765775
inplace=inplace,
766776
regex=regex,
767777
using_cow=using_cow,

pandas/tests/series/methods/test_replace.py

+13
Original file line numberDiff line numberDiff line change
@@ -298,6 +298,19 @@ def test_replace2(self):
298298
assert (ser[6:10] == -1).all()
299299
assert (ser[20:30] == -1).all()
300300

301+
@pytest.mark.parametrize("inplace", [True, False])
def test_replace_cascade(self, inplace):
    # GH #50778
    # A value produced by one replacement must not be picked up by a later
    # (src, dest) pair: 1 -> 2 must stay 2 rather than cascading to 3 or 4.
    original = pd.Series([1, 2, 3])
    expected = pd.Series([2, 3, 4])

    returned = original.replace([1, 2, 3], [2, 3, 4], inplace=inplace)

    # With inplace=True the check is made on the object replace() was called
    # on; otherwise it is made on the returned Series.
    actual = original if inplace else returned
    tm.assert_series_equal(actual, expected)
313+
301314
def test_replace_with_dictlike_and_string_dtype(self, nullable_string_dtype):
302315
# GH 32621, GH#44940
303316
ser = pd.Series(["one", "two", np.nan], dtype=nullable_string_dtype)

0 commit comments

Comments
 (0)