Skip to content

Commit 8239067

Browse files
committed
PERF: Improve replace perf
1 parent 096d886 commit 8239067

File tree

4 files changed

+55
-13
lines changed

4 files changed

+55
-13
lines changed

asv_bench/benchmarks/replace.py

+24
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,30 @@ def time_replace_large_dict(self):
3232
self.s.replace(self.to_rep, inplace=True)
3333

3434

35+
class replace_convert(object):
36+
goal_time = 0.5
37+
38+
def setup(self):
39+
self.n = (10 ** 3)
40+
self.to_ts = dict(((i, pd.Timestamp(i)) for i in range(self.n)))
41+
self.to_td = dict(((i, pd.Timedelta(i)) for i in range(self.n)))
42+
self.s = Series(np.random.randint(self.n, size=(10 ** 3)))
43+
self.df = DataFrame({'A': np.random.randint(self.n, size=(10 ** 3)),
44+
'B': np.random.randint(self.n, size=(10 ** 3))})
45+
46+
def time_replace_series_timestamp(self):
47+
self.s.replace(self.to_ts)
48+
49+
def time_replace_series_timedelta(self):
50+
self.s.replace(self.to_td)
51+
52+
def time_replace_frame_timestamp(self):
53+
self.df.replace(self.to_ts)
54+
55+
def time_replace_frame_timedelta(self):
56+
self.df.replace(self.to_td)
57+
58+
3559
class replace_replacena(object):
3660
goal_time = 0.2
3761

doc/source/whatsnew/v0.19.0.txt

+4
Original file line numberDiff line numberDiff line change
@@ -1404,6 +1404,7 @@ Performance Improvements
14041404
- Improved performance of ``DataFrame.quantile()`` as it now operates per-block (:issue:`11623`)
14051405
- Improved performance of float64 hash table operations, fixing some very slow indexing and groupby operations in python 3 (:issue:`13166`, :issue:`13334`)
14061406
- Improved performance of ``DataFrameGroupBy.transform`` (:issue:`12737`)
1407+
<<<<<<< 6dcc23862b6b60ce2a67436b4a278fbe4c05490f
14071408
- Improved performance of ``Index`` and ``Series`` ``.duplicated`` (:issue:`10235`)
14081409
- Improved performance of ``Index.difference`` (:issue:`12044`)
14091410
- Improved performance of ``RangeIndex.is_monotonic_increasing`` and ``is_monotonic_decreasing`` (:issue:`13749`)
@@ -1413,6 +1414,9 @@ Performance Improvements
14131414
- Improved performance by lazily creating indexing hashtables on larger Indexes (:issue:`14266`)
14141415
- Improved performance of ``groupby.groups`` (:issue:`14293`)
14151416
- Avoided unnecessary materializing of a MultiIndex when introspecting for memory usage (:issue:`14308`)
1417+
=======
1418+
- Improved performance of ``.replace()`` (:issue:`12745`)
1419+
>>>>>>> PERF: Improve replace perf
14161420

14171421
.. _whatsnew_0190.bug_fixes:
14181422

pandas/core/generic.py

+15-8
Original file line numberDiff line numberDiff line change
@@ -3477,20 +3477,27 @@ def replace(self, to_replace=None, value=None, inplace=False, limit=None,
34773477
res = self if inplace else self.copy()
34783478
for c, src in compat.iteritems(to_replace):
34793479
if c in value and c in self:
3480+
# object conversion is handled in
3481+
# series.replace which is called recursively
34803482
res[c] = res[c].replace(to_replace=src,
34813483
value=value[c],
3482-
inplace=False, regex=regex)
3484+
inplace=False,
3485+
regex=regex)
34833486
return None if inplace else res
34843487

34853488
# {'A': NA} -> 0
34863489
elif not is_list_like(value):
3487-
for k, src in compat.iteritems(to_replace):
3488-
if k in self:
3489-
new_data = new_data.replace(to_replace=src,
3490-
value=value,
3491-
filter=[k],
3492-
inplace=inplace,
3493-
regex=regex)
3490+
keys = [(k, src) for k, src in compat.iteritems(to_replace)
3491+
if k in self]
3492+
keys_len = len(keys) - 1
3493+
for i, (k, src) in enumerate(keys):
3494+
convert = i == keys_len
3495+
new_data = new_data.replace(to_replace=src,
3496+
value=value,
3497+
filter=[k],
3498+
inplace=inplace,
3499+
regex=regex,
3500+
convert=convert)
34943501
else:
34953502
raise TypeError('value argument must be scalar, dict, or '
34963503
'Series')

pandas/core/internals.py

+12-5
Original file line numberDiff line numberDiff line change
@@ -623,7 +623,6 @@ def replace(self, to_replace, value, inplace=False, filter=None,
623623

624624
original_to_replace = to_replace
625625
mask = isnull(self.values)
626-
627626
# try to replace, if we raise an error, convert to ObjectBlock and
628627
# retry
629628
try:
@@ -1774,13 +1773,14 @@ def should_store(self, value):
17741773
return issubclass(value.dtype.type, np.bool_)
17751774

17761775
def replace(self, to_replace, value, inplace=False, filter=None,
1777-
regex=False, mgr=None):
1776+
regex=False, convert=True, mgr=None):
17781777
to_replace_values = np.atleast_1d(to_replace)
17791778
if not np.can_cast(to_replace_values, bool):
17801779
return self
17811780
return super(BoolBlock, self).replace(to_replace, value,
17821781
inplace=inplace, filter=filter,
1783-
regex=regex, mgr=mgr)
1782+
regex=regex, convert=convert,
1783+
mgr=mgr)
17841784

17851785

17861786
class ObjectBlock(Block):
@@ -3193,6 +3193,7 @@ def comp(s):
31933193
masks = [comp(s) for i, s in enumerate(src_list)]
31943194

31953195
result_blocks = []
3196+
src_len = len(src_list) - 1
31963197
for blk in self.blocks:
31973198

31983199
# it's possible to get multiple result blocks here
@@ -3202,8 +3203,9 @@ def comp(s):
32023203
new_rb = []
32033204
for b in rb:
32043205
if b.dtype == np.object_:
3206+
convert = i == src_len
32053207
result = b.replace(s, d, inplace=inplace, regex=regex,
3206-
mgr=mgr)
3208+
mgr=mgr, convert=convert)
32073209
new_rb = _extend_blocks(result, new_rb)
32083210
else:
32093211
# get our mask for this element, sized to this
@@ -4767,7 +4769,12 @@ def _putmask_smart(v, m, n):
47674769

47684770
# change the dtype
47694771
dtype, _ = _maybe_promote(n.dtype)
4770-
nv = v.astype(dtype)
4772+
4773+
if needs_i8_conversion(v.dtype) and is_object_dtype(dtype):
4774+
nv = v.get_values(dtype)
4775+
else:
4776+
nv = v.astype(dtype)
4777+
47714778
try:
47724779
nv[m] = n[m]
47734780
except ValueError:

0 commit comments

Comments
 (0)