Skip to content

Commit 7479d41

Browse files
sinhrks authored and jorisvandenbossche committed
[Backport #12745] PERF: Improve replace perf
When `.replace` is called with a `dict`, replacements are done per value. The current implementation tries to soft-convert the dtype on every replacement, but it is sufficient to do this only on the final replacement. Author: sinhrks <[email protected]> Closes #12745 from sinhrks/replace_perf and squashes the following commits: ffc59b0 [sinhrks] PERF: Improve replace perf (cherry picked from commit e299560)
1 parent 560aded commit 7479d41

File tree

4 files changed

+52
-13
lines changed

4 files changed

+52
-13
lines changed

asv_bench/benchmarks/replace.py

+24
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,30 @@ def time_replace_large_dict(self):
3232
self.s.replace(self.to_rep, inplace=True)
3333

3434

35+
class replace_convert(object):
36+
goal_time = 0.5
37+
38+
def setup(self):
39+
self.n = (10 ** 3)
40+
self.to_ts = dict(((i, pd.Timestamp(i)) for i in range(self.n)))
41+
self.to_td = dict(((i, pd.Timedelta(i)) for i in range(self.n)))
42+
self.s = Series(np.random.randint(self.n, size=(10 ** 3)))
43+
self.df = DataFrame({'A': np.random.randint(self.n, size=(10 ** 3)),
44+
'B': np.random.randint(self.n, size=(10 ** 3))})
45+
46+
def time_replace_series_timestamp(self):
47+
self.s.replace(self.to_ts)
48+
49+
def time_replace_series_timedelta(self):
50+
self.s.replace(self.to_td)
51+
52+
def time_replace_frame_timestamp(self):
53+
self.df.replace(self.to_ts)
54+
55+
def time_replace_frame_timedelta(self):
56+
self.df.replace(self.to_td)
57+
58+
3559
class replace_replacena(object):
3660
goal_time = 0.2
3761

doc/source/whatsnew/v0.19.2.txt

+1
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ Highlights include:
2121
Performance Improvements
2222
~~~~~~~~~~~~~~~~~~~~~~~~
2323

24+
- Improved performance of ``.replace()`` (:issue:`12745`)
2425

2526
.. _whatsnew_0192.bug_fixes:
2627

pandas/core/generic.py

+15-8
Original file line numberDiff line numberDiff line change
@@ -3477,20 +3477,27 @@ def replace(self, to_replace=None, value=None, inplace=False, limit=None,
34773477
res = self if inplace else self.copy()
34783478
for c, src in compat.iteritems(to_replace):
34793479
if c in value and c in self:
3480+
# object conversion is handled in
3481+
# series.replace which is called recursively
34803482
res[c] = res[c].replace(to_replace=src,
34813483
value=value[c],
3482-
inplace=False, regex=regex)
3484+
inplace=False,
3485+
regex=regex)
34833486
return None if inplace else res
34843487

34853488
# {'A': NA} -> 0
34863489
elif not is_list_like(value):
3487-
for k, src in compat.iteritems(to_replace):
3488-
if k in self:
3489-
new_data = new_data.replace(to_replace=src,
3490-
value=value,
3491-
filter=[k],
3492-
inplace=inplace,
3493-
regex=regex)
3490+
keys = [(k, src) for k, src in compat.iteritems(to_replace)
3491+
if k in self]
3492+
keys_len = len(keys) - 1
3493+
for i, (k, src) in enumerate(keys):
3494+
convert = i == keys_len
3495+
new_data = new_data.replace(to_replace=src,
3496+
value=value,
3497+
filter=[k],
3498+
inplace=inplace,
3499+
regex=regex,
3500+
convert=convert)
34943501
else:
34953502
raise TypeError('value argument must be scalar, dict, or '
34963503
'Series')

pandas/core/internals.py

+12-5
Original file line numberDiff line numberDiff line change
@@ -622,7 +622,6 @@ def replace(self, to_replace, value, inplace=False, filter=None,
622622

623623
original_to_replace = to_replace
624624
mask = isnull(self.values)
625-
626625
# try to replace, if we raise an error, convert to ObjectBlock and
627626
# retry
628627
try:
@@ -1794,13 +1793,14 @@ def should_store(self, value):
17941793
return issubclass(value.dtype.type, np.bool_)
17951794

17961795
def replace(self, to_replace, value, inplace=False, filter=None,
1797-
regex=False, mgr=None):
1796+
regex=False, convert=True, mgr=None):
17981797
to_replace_values = np.atleast_1d(to_replace)
17991798
if not np.can_cast(to_replace_values, bool):
18001799
return self
18011800
return super(BoolBlock, self).replace(to_replace, value,
18021801
inplace=inplace, filter=filter,
1803-
regex=regex, mgr=mgr)
1802+
regex=regex, convert=convert,
1803+
mgr=mgr)
18041804

18051805

18061806
class ObjectBlock(Block):
@@ -3213,6 +3213,7 @@ def comp(s):
32133213
masks = [comp(s) for i, s in enumerate(src_list)]
32143214

32153215
result_blocks = []
3216+
src_len = len(src_list) - 1
32163217
for blk in self.blocks:
32173218

32183219
# it's possible to get multiple result blocks here
@@ -3222,8 +3223,9 @@ def comp(s):
32223223
new_rb = []
32233224
for b in rb:
32243225
if b.dtype == np.object_:
3226+
convert = i == src_len
32253227
result = b.replace(s, d, inplace=inplace, regex=regex,
3226-
mgr=mgr)
3228+
mgr=mgr, convert=convert)
32273229
new_rb = _extend_blocks(result, new_rb)
32283230
else:
32293231
# get our mask for this element, sized to this
@@ -4787,7 +4789,12 @@ def _putmask_smart(v, m, n):
47874789

47884790
# change the dtype
47894791
dtype, _ = _maybe_promote(n.dtype)
4790-
nv = v.astype(dtype)
4792+
4793+
if is_extension_type(v.dtype) and is_object_dtype(dtype):
4794+
nv = v.get_values(dtype)
4795+
else:
4796+
nv = v.astype(dtype)
4797+
47914798
try:
47924799
nv[m] = n[m]
47934800
except ValueError:

0 commit comments

Comments
 (0)