Skip to content

BUG: inconsistency between replace dict using integers and using strings (#20656) #21477

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 8 commits into from
Aug 9, 2018
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.24.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -668,6 +668,7 @@ Reshaping
- :func:`pandas.core.groupby.GroupBy.rank` now raises a ``ValueError`` when an invalid value is passed for argument ``na_option`` (:issue:`22124`)
- Bug in :func:`get_dummies` with Unicode attributes in Python 2 (:issue:`22084`)
- Bug in :meth:`DataFrame.replace` raises ``RecursionError`` when replacing empty lists (:issue:`22083`)
- Bug in :meth:`Series.replace` and meth:`DataFrame.replace` when dict is used as the `to_replace` value and one key in the dict is is another key's value, the results were inconsistent between using integer key and using string key (:issue:`20656`)
-

Build Changes
Expand Down
106 changes: 103 additions & 3 deletions pandas/core/internals/blocks.py
Original file line number Diff line number Diff line change
Expand Up @@ -1695,6 +1695,45 @@ def _nanpercentile(values, q, axis, **kw):
placement=np.arange(len(result)),
ndim=ndim)

def _replace_coerce(self, to_replace, value, inplace=True, regex=False,
convert=False, mgr=None, mask=None):
"""
Replace value corresponding to the given boolean array with another
value.

Parameters
----------
to_replace : object or pattern
Scalar to replace or regular expression to match.
value : object
Replacement object.
inplace : bool, default False
Perform inplace modification.
regex : bool, default False
If true, perform regular expression substitution.
convert : bool, default True
If true, try to coerce any object types to better types.
mgr : BlockManager, optional
mask : array-like of bool, optional
True indicate corresponding element is ignored.

Returns
-------
A new block if there is anything to replace or the original block.
"""

if mask.any():
if not regex:
self = self.coerce_to_target_dtype(value)
return self.putmask(mask, value, inplace=inplace)
else:
return self._replace_single(to_replace, value, inplace=inplace,
regex=regex,
convert=convert,
mask=mask,
mgr=mgr)
return self


class ScalarBlock(Block):
"""
Expand Down Expand Up @@ -2470,8 +2509,31 @@ def replace(self, to_replace, value, inplace=False, filter=None,
regex=regex, mgr=mgr)

def _replace_single(self, to_replace, value, inplace=False, filter=None,
regex=False, convert=True, mgr=None):
regex=False, convert=True, mgr=None, mask=None):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you add a doc-string here

"""
Replace elements by the given value.

Parameters
----------
to_replace : object or pattern
Scalar to replace or regular expression to match.
value : object
Replacement object.
inplace : bool, default False
Perform inplace modification.
filter : list, optional
regex : bool, default False
If true, perform regular expression substitution.
convert : bool, default True
If true, try to coerce any object types to better types.
mgr : BlockManager, optional
mask : array-like of bool, optional
True indicate corresponding element is ignored.

Returns
-------
a new block, the result after replacing
"""
inplace = validate_bool_kwarg(inplace, 'inplace')

# to_replace is regex compilable
Expand Down Expand Up @@ -2537,15 +2599,53 @@ def re_replacer(s):
else:
filt = self.mgr_locs.isin(filter).nonzero()[0]

new_values[filt] = f(new_values[filt])
if mask is None:
new_values[filt] = f(new_values[filt])
else:
new_values[filt][mask] = f(new_values[filt][mask])

# convert
block = self.make_block(new_values)
if convert:
block = block.convert(by_item=True, numeric=False)

return block

def _replace_coerce(self, to_replace, value, inplace=True, regex=False,
convert=False, mgr=None, mask=None):
"""
Replace value corresponding to the given boolean array with another
value.

Parameters
----------
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

same as above (you can make a shared doc-string if you want here)

to_replace : object or pattern
Scalar to replace or regular expression to match.
value : object
Replacement object.
inplace : bool, default False
Perform inplace modification.
regex : bool, default False
If true, perform regular expression substitution.
convert : bool, default True
If true, try to coerce any object types to better types.
mgr : BlockManager, optional
mask : array-like of bool, optional
True indicate corresponding element is ignored.

Returns
-------
A new block if there is anything to replace or the original block.
"""
if mask.any():
block = super(ObjectBlock, self)._replace_coerce(
to_replace=to_replace, value=value, inplace=inplace,
regex=regex, convert=convert, mgr=mgr, mask=mask)
if convert:
block = [b.convert(by_item=True, numeric=False, copy=True)
for b in block]
return block
return self


class CategoricalBlock(ExtensionBlock):
__slots__ = ()
Expand Down
63 changes: 44 additions & 19 deletions pandas/core/internals/managers.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from functools import partial
import itertools
import operator
import re

import numpy as np

Expand All @@ -23,7 +24,8 @@
from pandas.core.dtypes.cast import (
maybe_promote,
infer_dtype_from_scalar,
find_common_type)
find_common_type,
maybe_convert_objects)
from pandas.core.dtypes.missing import isna
import pandas.core.dtypes.concat as _concat
from pandas.core.dtypes.generic import ABCSeries, ABCExtensionArray
Expand Down Expand Up @@ -571,12 +573,19 @@ def replace_list(self, src_list, dest_list, inplace=False, regex=False,
# figure out our mask a-priori to avoid repeated replacements
values = self.as_array()

def comp(s):
def comp(s, regex=False):
"""
Generate a bool array by perform an equality check, or perform
an element-wise regular expression matching
"""
if isna(s):
return isna(values)
return _maybe_compare(values, getattr(s, 'asm8', s), operator.eq)
if hasattr(s, 'asm8'):
return _compare_or_regex_match(maybe_convert_objects(values),
getattr(s, 'asm8'), regex)
return _compare_or_regex_match(values, s, regex)

masks = [comp(s) for i, s in enumerate(src_list)]
masks = [comp(s, regex) for i, s in enumerate(src_list)]

result_blocks = []
src_len = len(src_list) - 1
Expand All @@ -588,20 +597,16 @@ def comp(s):
for i, (s, d) in enumerate(zip(src_list, dest_list)):
new_rb = []
for b in rb:
if b.dtype == np.object_:
convert = i == src_len
result = b.replace(s, d, inplace=inplace, regex=regex,
mgr=mgr, convert=convert)
m = masks[i][b.mgr_locs.indexer]
convert = i == src_len
result = b._replace_coerce(mask=m, to_replace=s, value=d,
inplace=inplace,
convert=convert, regex=regex,
mgr=mgr)
if m.any():
new_rb = _extend_blocks(result, new_rb)
else:
# get our mask for this element, sized to this
# particular block
m = masks[i][b.mgr_locs.indexer]
if m.any():
b = b.coerce_to_target_dtype(d)
new_rb.extend(b.putmask(m, d, inplace=True))
else:
new_rb.append(b)
new_rb.append(b)
rb = new_rb
result_blocks.extend(rb)

Expand Down Expand Up @@ -1890,7 +1895,28 @@ def _consolidate(blocks):
return new_blocks


def _maybe_compare(a, b, op):
def _compare_or_regex_match(a, b, regex=False):
"""
Compare two array_like inputs of the same shape or two scalar values

Calls operator.eq or re.match, depending on regex argument. If regex is
True, perform an element-wise regex matching.

Parameters
----------
a : array_like or scalar
b : array_like or scalar
regex : bool, default False

Returns
-------
mask : array_like of bool
"""
if not regex:
op = lambda x: operator.eq(x, b)
else:
op = np.vectorize(lambda x: bool(re.match(b, x)) if isinstance(x, str)
else False)

is_a_array = isinstance(a, np.ndarray)
is_b_array = isinstance(b, np.ndarray)
Expand All @@ -1902,9 +1928,8 @@ def _maybe_compare(a, b, op):
# numpy deprecation warning if comparing numeric vs string-like
elif is_numeric_v_string_like(a, b):
result = False

else:
result = op(a, b)
result = op(a)

if is_scalar(result) and (is_a_array or is_b_array):
type_names = [type(a).__name__, type(b).__name__]
Expand Down
8 changes: 8 additions & 0 deletions pandas/tests/series/test_replace.py
Original file line number Diff line number Diff line change
Expand Up @@ -256,6 +256,14 @@ def test_replace_string_with_number(self):
expected = pd.Series([1, 2, 3])
tm.assert_series_equal(expected, result)

def test_replace_replacer_equals_replacement(self):
# GH 20656
# make sure all replacers are matching against original values
s = pd.Series(['a', 'b'])
expected = pd.Series(['b', 'a'])
result = s.replace({'a': 'b', 'b': 'a'})
tm.assert_series_equal(expected, result)

def test_replace_unicode_with_number(self):
# GH 15743
s = pd.Series([1, 2, 3])
Expand Down