diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index e28b15fcf621f..c340486c78a59 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -668,6 +668,7 @@ Reshaping - :func:`pandas.core.groupby.GroupBy.rank` now raises a ``ValueError`` when an invalid value is passed for argument ``na_option`` (:issue:`22124`) - Bug in :func:`get_dummies` with Unicode attributes in Python 2 (:issue:`22084`) - Bug in :meth:`DataFrame.replace` raises ``RecursionError`` when replacing empty lists (:issue:`22083`) +- Bug in :meth:`Series.replace` and meth:`DataFrame.replace` when dict is used as the `to_replace` value and one key in the dict is is another key's value, the results were inconsistent between using integer key and using string key (:issue:`20656`) - Build Changes diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index be80a605f08fd..f0635014b166b 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -1695,6 +1695,45 @@ def _nanpercentile(values, q, axis, **kw): placement=np.arange(len(result)), ndim=ndim) + def _replace_coerce(self, to_replace, value, inplace=True, regex=False, + convert=False, mgr=None, mask=None): + """ + Replace value corresponding to the given boolean array with another + value. + + Parameters + ---------- + to_replace : object or pattern + Scalar to replace or regular expression to match. + value : object + Replacement object. + inplace : bool, default False + Perform inplace modification. + regex : bool, default False + If true, perform regular expression substitution. + convert : bool, default True + If true, try to coerce any object types to better types. + mgr : BlockManager, optional + mask : array-like of bool, optional + True indicate corresponding element is ignored. + + Returns + ------- + A new block if there is anything to replace or the original block. + """ + + if mask.any(): + if not regex: + self = self.coerce_to_target_dtype(value) + return self.putmask(mask, value, inplace=inplace) + else: + return self._replace_single(to_replace, value, inplace=inplace, + regex=regex, + convert=convert, + mask=mask, + mgr=mgr) + return self + class ScalarBlock(Block): """ @@ -2470,8 +2509,31 @@ def replace(self, to_replace, value, inplace=False, filter=None, regex=regex, mgr=mgr) def _replace_single(self, to_replace, value, inplace=False, filter=None, - regex=False, convert=True, mgr=None): + regex=False, convert=True, mgr=None, mask=None): + """ + Replace elements by the given value. + Parameters + ---------- + to_replace : object or pattern + Scalar to replace or regular expression to match. + value : object + Replacement object. + inplace : bool, default False + Perform inplace modification. + filter : list, optional + regex : bool, default False + If true, perform regular expression substitution. + convert : bool, default True + If true, try to coerce any object types to better types. + mgr : BlockManager, optional + mask : array-like of bool, optional + True indicate corresponding element is ignored. + + Returns + ------- + a new block, the result after replacing + """ inplace = validate_bool_kwarg(inplace, 'inplace') # to_replace is regex compilable @@ -2537,15 +2599,53 @@ def re_replacer(s): else: filt = self.mgr_locs.isin(filter).nonzero()[0] - new_values[filt] = f(new_values[filt]) + if mask is None: + new_values[filt] = f(new_values[filt]) + else: + new_values[filt][mask] = f(new_values[filt][mask]) # convert block = self.make_block(new_values) if convert: block = block.convert(by_item=True, numeric=False) - return block + def _replace_coerce(self, to_replace, value, inplace=True, regex=False, + convert=False, mgr=None, mask=None): + """ + Replace value corresponding to the given boolean array with another + value. + + Parameters + ---------- + to_replace : object or pattern + Scalar to replace or regular expression to match. + value : object + Replacement object. + inplace : bool, default False + Perform inplace modification. + regex : bool, default False + If true, perform regular expression substitution. + convert : bool, default True + If true, try to coerce any object types to better types. + mgr : BlockManager, optional + mask : array-like of bool, optional + True indicate corresponding element is ignored. + + Returns + ------- + A new block if there is anything to replace or the original block. + """ + if mask.any(): + block = super(ObjectBlock, self)._replace_coerce( + to_replace=to_replace, value=value, inplace=inplace, + regex=regex, convert=convert, mgr=mgr, mask=mask) + if convert: + block = [b.convert(by_item=True, numeric=False, copy=True) + for b in block] + return block + return self + class CategoricalBlock(ExtensionBlock): __slots__ = () diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 32e8372d5c6c9..e64ba44bb8a92 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -3,6 +3,7 @@ from functools import partial import itertools import operator +import re import numpy as np @@ -23,7 +24,8 @@ from pandas.core.dtypes.cast import ( maybe_promote, infer_dtype_from_scalar, - find_common_type) + find_common_type, + maybe_convert_objects) from pandas.core.dtypes.missing import isna import pandas.core.dtypes.concat as _concat from pandas.core.dtypes.generic import ABCSeries, ABCExtensionArray @@ -571,12 +573,19 @@ def replace_list(self, src_list, dest_list, inplace=False, regex=False, # figure out our mask a-priori to avoid repeated replacements values = self.as_array() - def comp(s): + def comp(s, regex=False): + """ + Generate a bool array by perform an equality check, or perform + an element-wise regular expression matching + """ if isna(s): return isna(values) - return _maybe_compare(values, getattr(s, 'asm8', s), operator.eq) + if hasattr(s, 'asm8'): + return _compare_or_regex_match(maybe_convert_objects(values), + getattr(s, 'asm8'), regex) + return _compare_or_regex_match(values, s, regex) - masks = [comp(s) for i, s in enumerate(src_list)] + masks = [comp(s, regex) for i, s in enumerate(src_list)] result_blocks = [] src_len = len(src_list) - 1 @@ -588,20 +597,16 @@ def comp(s): for i, (s, d) in enumerate(zip(src_list, dest_list)): new_rb = [] for b in rb: - if b.dtype == np.object_: - convert = i == src_len - result = b.replace(s, d, inplace=inplace, regex=regex, - mgr=mgr, convert=convert) + m = masks[i][b.mgr_locs.indexer] + convert = i == src_len + result = b._replace_coerce(mask=m, to_replace=s, value=d, + inplace=inplace, + convert=convert, regex=regex, + mgr=mgr) + if m.any(): new_rb = _extend_blocks(result, new_rb) else: - # get our mask for this element, sized to this - # particular block - m = masks[i][b.mgr_locs.indexer] - if m.any(): - b = b.coerce_to_target_dtype(d) - new_rb.extend(b.putmask(m, d, inplace=True)) - else: - new_rb.append(b) + new_rb.append(b) rb = new_rb result_blocks.extend(rb) @@ -1890,7 +1895,28 @@ def _consolidate(blocks): return new_blocks -def _maybe_compare(a, b, op): +def _compare_or_regex_match(a, b, regex=False): + """ + Compare two array_like inputs of the same shape or two scalar values + + Calls operator.eq or re.match, depending on regex argument. If regex is + True, perform an element-wise regex matching. + + Parameters + ---------- + a : array_like or scalar + b : array_like or scalar + regex : bool, default False + + Returns + ------- + mask : array_like of bool + """ + if not regex: + op = lambda x: operator.eq(x, b) + else: + op = np.vectorize(lambda x: bool(re.match(b, x)) if isinstance(x, str) + else False) is_a_array = isinstance(a, np.ndarray) is_b_array = isinstance(b, np.ndarray) @@ -1902,9 +1928,8 @@ def _maybe_compare(a, b, op): # numpy deprecation warning if comparing numeric vs string-like elif is_numeric_v_string_like(a, b): result = False - else: - result = op(a, b) + result = op(a) if is_scalar(result) and (is_a_array or is_b_array): type_names = [type(a).__name__, type(b).__name__] diff --git a/pandas/tests/series/test_replace.py b/pandas/tests/series/test_replace.py index d495fd9c83c24..9e198d2854f24 100644 --- a/pandas/tests/series/test_replace.py +++ b/pandas/tests/series/test_replace.py @@ -256,6 +256,14 @@ def test_replace_string_with_number(self): expected = pd.Series([1, 2, 3]) tm.assert_series_equal(expected, result) + def test_replace_replacer_equals_replacement(self): + # GH 20656 + # make sure all replacers are matching against original values + s = pd.Series(['a', 'b']) + expected = pd.Series(['b', 'a']) + result = s.replace({'a': 'b', 'b': 'a'}) + tm.assert_series_equal(expected, result) + def test_replace_unicode_with_number(self): # GH 15743 s = pd.Series([1, 2, 3])