Skip to content

Commit 720d263

Browse files
peterpanmjj authored and jreback committed
BUG: inconsistency between replace dict using integers and using strings (#20656) (#21477)
1 parent 9b18811 commit 720d263

File tree

4 files changed

+156
-22
lines changed

4 files changed

+156
-22
lines changed

doc/source/whatsnew/v0.24.0.txt

+1
Original file line numberDiff line numberDiff line change
@@ -669,6 +669,7 @@ Reshaping
669669
- :func:`pandas.core.groupby.GroupBy.rank` now raises a ``ValueError`` when an invalid value is passed for argument ``na_option`` (:issue:`22124`)
670670
- Bug in :func:`get_dummies` with Unicode attributes in Python 2 (:issue:`22084`)
671671
- Bug in :meth:`DataFrame.replace` raises ``RecursionError`` when replacing empty lists (:issue:`22083`)
672+
- Bug in :meth:`Series.replace` and :meth:`DataFrame.replace` when a dict is used as the ``to_replace`` value and one key in the dict is another key's value; the results were inconsistent between using integer keys and using string keys (:issue:`20656`)
672673
-
673674

674675
Build Changes

pandas/core/internals/blocks.py

+103-3
Original file line numberDiff line numberDiff line change
@@ -1695,6 +1695,45 @@ def _nanpercentile(values, q, axis, **kw):
16951695
placement=np.arange(len(result)),
16961696
ndim=ndim)
16971697

1698+
def _replace_coerce(self, to_replace, value, inplace=True, regex=False,
1699+
convert=False, mgr=None, mask=None):
1700+
"""
1701+
Replace value corresponding to the given boolean array with another
1702+
value.
1703+
1704+
Parameters
1705+
----------
1706+
to_replace : object or pattern
1707+
Scalar to replace or regular expression to match.
1708+
value : object
1709+
Replacement object.
1710+
inplace : bool, default True
1711+
Perform inplace modification.
1712+
regex : bool, default False
1713+
If true, perform regular expression substitution.
1714+
convert : bool, default False
1715+
If true, try to coerce any object types to better types.
1716+
mgr : BlockManager, optional
1717+
mask : array-like of bool, optional
1718+
True indicates the corresponding element is to be replaced.
1719+
1720+
Returns
1721+
-------
1722+
A new block if there is anything to replace or the original block.
1723+
"""
1724+
1725+
if mask.any():
1726+
if not regex:
1727+
self = self.coerce_to_target_dtype(value)
1728+
return self.putmask(mask, value, inplace=inplace)
1729+
else:
1730+
return self._replace_single(to_replace, value, inplace=inplace,
1731+
regex=regex,
1732+
convert=convert,
1733+
mask=mask,
1734+
mgr=mgr)
1735+
return self
1736+
16981737

16991738
class ScalarBlock(Block):
17001739
"""
@@ -2470,8 +2509,31 @@ def replace(self, to_replace, value, inplace=False, filter=None,
24702509
regex=regex, mgr=mgr)
24712510

24722511
def _replace_single(self, to_replace, value, inplace=False, filter=None,
2473-
regex=False, convert=True, mgr=None):
2512+
regex=False, convert=True, mgr=None, mask=None):
2513+
"""
2514+
Replace elements by the given value.
24742515
2516+
Parameters
2517+
----------
2518+
to_replace : object or pattern
2519+
Scalar to replace or regular expression to match.
2520+
value : object
2521+
Replacement object.
2522+
inplace : bool, default False
2523+
Perform inplace modification.
2524+
filter : list, optional
2525+
regex : bool, default False
2526+
If true, perform regular expression substitution.
2527+
convert : bool, default True
2528+
If true, try to coerce any object types to better types.
2529+
mgr : BlockManager, optional
2530+
mask : array-like of bool, optional
2531+
True indicates the corresponding element is to be replaced.
2532+
2533+
Returns
2534+
-------
2535+
a new block, the result after replacing
2536+
"""
24752537
inplace = validate_bool_kwarg(inplace, 'inplace')
24762538

24772539
# to_replace is regex compilable
@@ -2537,15 +2599,53 @@ def re_replacer(s):
25372599
else:
25382600
filt = self.mgr_locs.isin(filter).nonzero()[0]
25392601

2540-
new_values[filt] = f(new_values[filt])
2602+
if mask is None:
2603+
new_values[filt] = f(new_values[filt])
2604+
else:
2605+
new_values[filt][mask] = f(new_values[filt][mask])
25412606

25422607
# convert
25432608
block = self.make_block(new_values)
25442609
if convert:
25452610
block = block.convert(by_item=True, numeric=False)
2546-
25472611
return block
25482612

2613+
def _replace_coerce(self, to_replace, value, inplace=True, regex=False,
2614+
convert=False, mgr=None, mask=None):
2615+
"""
2616+
Replace value corresponding to the given boolean array with another
2617+
value.
2618+
2619+
Parameters
2620+
----------
2621+
to_replace : object or pattern
2622+
Scalar to replace or regular expression to match.
2623+
value : object
2624+
Replacement object.
2625+
inplace : bool, default True
2626+
Perform inplace modification.
2627+
regex : bool, default False
2628+
If true, perform regular expression substitution.
2629+
convert : bool, default False
2630+
If true, try to coerce any object types to better types.
2631+
mgr : BlockManager, optional
2632+
mask : array-like of bool, optional
2633+
True indicates the corresponding element is to be replaced.
2634+
2635+
Returns
2636+
-------
2637+
A new block if there is anything to replace or the original block.
2638+
"""
2639+
if mask.any():
2640+
block = super(ObjectBlock, self)._replace_coerce(
2641+
to_replace=to_replace, value=value, inplace=inplace,
2642+
regex=regex, convert=convert, mgr=mgr, mask=mask)
2643+
if convert:
2644+
block = [b.convert(by_item=True, numeric=False, copy=True)
2645+
for b in block]
2646+
return block
2647+
return self
2648+
25492649

25502650
class CategoricalBlock(ExtensionBlock):
25512651
__slots__ = ()

pandas/core/internals/managers.py

+44-19
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
from functools import partial
44
import itertools
55
import operator
6+
import re
67

78
import numpy as np
89

@@ -23,7 +24,8 @@
2324
from pandas.core.dtypes.cast import (
2425
maybe_promote,
2526
infer_dtype_from_scalar,
26-
find_common_type)
27+
find_common_type,
28+
maybe_convert_objects)
2729
from pandas.core.dtypes.missing import isna
2830
import pandas.core.dtypes.concat as _concat
2931
from pandas.core.dtypes.generic import ABCSeries, ABCExtensionArray
@@ -571,12 +573,19 @@ def replace_list(self, src_list, dest_list, inplace=False, regex=False,
571573
# figure out our mask a-priori to avoid repeated replacements
572574
values = self.as_array()
573575

574-
def comp(s):
576+
def comp(s, regex=False):
577+
"""
578+
Generate a bool array by performing an equality check, or by performing
579+
element-wise regular expression matching.
580+
"""
575581
if isna(s):
576582
return isna(values)
577-
return _maybe_compare(values, getattr(s, 'asm8', s), operator.eq)
583+
if hasattr(s, 'asm8'):
584+
return _compare_or_regex_match(maybe_convert_objects(values),
585+
getattr(s, 'asm8'), regex)
586+
return _compare_or_regex_match(values, s, regex)
578587

579-
masks = [comp(s) for i, s in enumerate(src_list)]
588+
masks = [comp(s, regex) for i, s in enumerate(src_list)]
580589

581590
result_blocks = []
582591
src_len = len(src_list) - 1
@@ -588,20 +597,16 @@ def comp(s):
588597
for i, (s, d) in enumerate(zip(src_list, dest_list)):
589598
new_rb = []
590599
for b in rb:
591-
if b.dtype == np.object_:
592-
convert = i == src_len
593-
result = b.replace(s, d, inplace=inplace, regex=regex,
594-
mgr=mgr, convert=convert)
600+
m = masks[i][b.mgr_locs.indexer]
601+
convert = i == src_len
602+
result = b._replace_coerce(mask=m, to_replace=s, value=d,
603+
inplace=inplace,
604+
convert=convert, regex=regex,
605+
mgr=mgr)
606+
if m.any():
595607
new_rb = _extend_blocks(result, new_rb)
596608
else:
597-
# get our mask for this element, sized to this
598-
# particular block
599-
m = masks[i][b.mgr_locs.indexer]
600-
if m.any():
601-
b = b.coerce_to_target_dtype(d)
602-
new_rb.extend(b.putmask(m, d, inplace=True))
603-
else:
604-
new_rb.append(b)
609+
new_rb.append(b)
605610
rb = new_rb
606611
result_blocks.extend(rb)
607612

@@ -1890,7 +1895,28 @@ def _consolidate(blocks):
18901895
return new_blocks
18911896

18921897

1893-
def _maybe_compare(a, b, op):
1898+
def _compare_or_regex_match(a, b, regex=False):
1899+
"""
1900+
Compare two array_like inputs of the same shape or two scalar values
1901+
1902+
Calls operator.eq or re.match, depending on regex argument. If regex is
1903+
True, perform an element-wise regex matching.
1904+
1905+
Parameters
1906+
----------
1907+
a : array_like or scalar
1908+
b : array_like or scalar
1909+
regex : bool, default False
1910+
1911+
Returns
1912+
-------
1913+
mask : array_like of bool
1914+
"""
1915+
if not regex:
1916+
op = lambda x: operator.eq(x, b)
1917+
else:
1918+
op = np.vectorize(lambda x: bool(re.match(b, x)) if isinstance(x, str)
1919+
else False)
18941920

18951921
is_a_array = isinstance(a, np.ndarray)
18961922
is_b_array = isinstance(b, np.ndarray)
@@ -1902,9 +1928,8 @@ def _maybe_compare(a, b, op):
19021928
# numpy deprecation warning if comparing numeric vs string-like
19031929
elif is_numeric_v_string_like(a, b):
19041930
result = False
1905-
19061931
else:
1907-
result = op(a, b)
1932+
result = op(a)
19081933

19091934
if is_scalar(result) and (is_a_array or is_b_array):
19101935
type_names = [type(a).__name__, type(b).__name__]

pandas/tests/series/test_replace.py

+8
Original file line numberDiff line numberDiff line change
@@ -256,6 +256,14 @@ def test_replace_string_with_number(self):
256256
expected = pd.Series([1, 2, 3])
257257
tm.assert_series_equal(expected, result)
258258

259+
def test_replace_replacer_equals_replacement(self):
260+
# GH 20656
261+
# make sure all replacers are matching against original values
262+
s = pd.Series(['a', 'b'])
263+
expected = pd.Series(['b', 'a'])
264+
result = s.replace({'a': 'b', 'b': 'a'})
265+
tm.assert_series_equal(expected, result)
266+
259267
def test_replace_unicode_with_number(self):
260268
# GH 15743
261269
s = pd.Series([1, 2, 3])

0 commit comments

Comments
 (0)