From 8c8b36fcb79998754cdf13bc9abddcfd711f1944 Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 3 Sep 2020 08:53:04 -0700 Subject: [PATCH 1/4] BUG: df.replace with numeric values and string to_replace --- doc/source/whatsnew/v1.2.0.rst | 2 +- pandas/core/array_algos/replace.py | 94 ++++++++++++++++++ pandas/core/internals/blocks.py | 28 +++++- pandas/core/internals/managers.py | 104 +------------------- pandas/tests/frame/methods/test_replace.py | 15 ++- pandas/tests/series/methods/test_replace.py | 5 +- 6 files changed, 136 insertions(+), 112 deletions(-) create mode 100644 pandas/core/array_algos/replace.py diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 407e8ba029ada..526e1a4fb0f89 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -281,7 +281,7 @@ ExtensionArray Other ^^^^^ -- +- Bug in :meth:`DataFrame.replace` and :meth:`Series.replace` with numeric values and string ``to_replace`` (:issue:`34789`) - .. --------------------------------------------------------------------------- diff --git a/pandas/core/array_algos/replace.py b/pandas/core/array_algos/replace.py new file mode 100644 index 0000000000000..341472e764e7f --- /dev/null +++ b/pandas/core/array_algos/replace.py @@ -0,0 +1,94 @@ +""" +Methods used by Block.replace and related methods. +""" +import operator +import re +from typing import Optional, Pattern, Union + +import numpy as np + +from pandas._typing import ArrayLike, Scalar + +from pandas.core.dtypes.common import ( + is_datetimelike_v_numeric, + is_numeric_v_string_like, + is_scalar, +) + + +def compare_or_regex_search( + a: ArrayLike, + b: Union[Scalar, Pattern], + regex: bool = False, + mask: Optional[ArrayLike] = None, +) -> Union[ArrayLike, bool]: + """ + Compare two array_like inputs of the same shape or two scalar values + + Calls operator.eq or re.search, depending on regex argument. If regex is + True, perform an element-wise regex matching. + + Parameters + ---------- + a : array_like + b : scalar or regex pattern + regex : bool, default False + mask : array_like or None (default) + + Returns + ------- + mask : array_like of bool + """ + + def _check_comparison_types( + result: Union[ArrayLike, bool], a: ArrayLike, b: Union[Scalar, Pattern] + ): + """ + Raises an error if the two arrays (a,b) cannot be compared. + Otherwise, returns the comparison result as expected. + """ + if is_scalar(result) and isinstance(a, np.ndarray): + type_names = [type(a).__name__, type(b).__name__] + + if isinstance(a, np.ndarray): + type_names[0] = f"ndarray(dtype={a.dtype})" + + raise TypeError( + f"Cannot compare types {repr(type_names[0])} and {repr(type_names[1])}" + ) + + if not regex: + op = lambda x: operator.eq(x, b) + else: + op = np.vectorize( + lambda x: bool(re.search(b, x)) + if isinstance(x, str) and isinstance(b, (str, Pattern)) + else False + ) + + # GH#32621 use mask to avoid comparing to NAs + if mask is None and isinstance(a, np.ndarray) and not isinstance(b, np.ndarray): + mask = np.reshape(~(isna(a)), a.shape) + if isinstance(a, np.ndarray): + a = a[mask] + + if is_numeric_v_string_like(a, b): + # GH#29553 avoid deprecation warnings from numpy + return np.zeros(a.shape, dtype=bool) + + elif is_datetimelike_v_numeric(a, b) or is_numeric_v_string_like(a, b): + # GH#29553 avoid deprecation warnings from numpy + _check_comparison_types(False, a, b) + return False + + result = op(a) + + if isinstance(result, np.ndarray) and mask is not None: + # The shape of the mask can differ to that of the result + # since we may compare only a subset of a's or b's elements + tmp = np.zeros(mask.shape, dtype=np.bool_) + tmp[mask] = result + result = tmp + + _check_comparison_types(result, a, b) + return result diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index ad388ef3f53b0..66448cac63106 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -11,7 +11,7 @@ from pandas._libs.internals import BlockPlacement from pandas._libs.tslibs import conversion from pandas._libs.tslibs.timezones import tz_compare -from pandas._typing import ArrayLike +from pandas._typing import ArrayLike, Scalar from pandas.util._validators import validate_bool_kwarg from pandas.core.dtypes.cast import ( @@ -59,8 +59,10 @@ from pandas.core.dtypes.missing import _isna_compat, is_valid_nat_for_dtype, isna import pandas.core.algorithms as algos +from pandas.core.array_algos.replace import compare_or_regex_search from pandas.core.array_algos.transforms import shift from pandas.core.arrays import ( + BooleanArray, Categorical, DatetimeArray, ExtensionArray, @@ -792,7 +794,6 @@ def _replace_list( self, src_list: List[Any], dest_list: List[Any], - masks: List[np.ndarray], inplace: bool = False, regex: bool = False, ) -> List["Block"]: @@ -801,11 +802,28 @@ def _replace_list( """ src_len = len(src_list) - 1 + def comp(s: Scalar, mask: np.ndarray, regex: bool = False) -> np.ndarray: + """ + Generate a bool array by perform an equality check, or perform + an element-wise regular expression matching + """ + if isna(s): + return ~mask + + s = com.maybe_box_datetimelike(s) + return compare_or_regex_search(self.values, s, regex, mask) + + # Calculate the mask once, prior to the call of comp + # in order to avoid repeating the same computations + mask = ~isna(self.values) + + masks = [comp(s, mask, regex) for s in src_list] + rb = [self if inplace else self.copy()] for i, (src, dest) in enumerate(zip(src_list, dest_list)): new_rb: List["Block"] = [] for blk in rb: - m = masks[i][blk.mgr_locs.indexer] + m = masks[i] convert = i == src_len # only convert once at the end result = blk._replace_coerce( mask=m, @@ -2906,7 +2924,9 @@ def _extract_bool_array(mask: ArrayLike) -> np.ndarray: """ If we have a SparseArray or BooleanArray, convert it to ndarray[bool]. """ - if isinstance(mask, ExtensionArray): + if isinstance(mask, BooleanArray): + mask = mask.to_numpy(dtype=bool, na_value=False) + elif isinstance(mask, ExtensionArray): # We could have BooleanArray, Sparse[bool], ... mask = np.asarray(mask, dtype=np.bool_) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 389252e7ef0f2..fe53acdb1880d 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1,14 +1,11 @@ from collections import defaultdict import itertools -import operator -import re from typing import ( Any, DefaultDict, Dict, List, Optional, - Pattern, Sequence, Tuple, TypeVar, @@ -19,7 +16,7 @@ import numpy as np from pandas._libs import internals as libinternals, lib -from pandas._typing import ArrayLike, DtypeObj, Label, Scalar +from pandas._typing import ArrayLike, DtypeObj, Label from pandas.util._validators import validate_bool_kwarg from pandas.core.dtypes.cast import ( @@ -29,12 +26,9 @@ ) from pandas.core.dtypes.common import ( DT64NS_DTYPE, - is_datetimelike_v_numeric, is_dtype_equal, is_extension_array_dtype, is_list_like, - is_numeric_v_string_like, - is_scalar, ) from pandas.core.dtypes.concat import concat_compat from pandas.core.dtypes.dtypes import ExtensionDtype @@ -44,7 +38,6 @@ import pandas.core.algorithms as algos from pandas.core.arrays.sparse import SparseDtype from pandas.core.base import PandasObject -import pandas.core.common as com from pandas.core.construction import extract_array from pandas.core.indexers import maybe_convert_indices from pandas.core.indexes.api import Index, ensure_index @@ -610,31 +603,10 @@ def replace_list( """ do a list replace """ inplace = validate_bool_kwarg(inplace, "inplace") - # figure out our mask apriori to avoid repeated replacements - values = self.as_array() - - def comp(s: Scalar, mask: np.ndarray, regex: bool = False): - """ - Generate a bool array by perform an equality check, or perform - an element-wise regular expression matching - """ - if isna(s): - return ~mask - - s = com.maybe_box_datetimelike(s) - return _compare_or_regex_search(values, s, regex, mask) - - # Calculate the mask once, prior to the call of comp - # in order to avoid repeating the same computations - mask = ~isna(values) - - masks = [comp(s, mask, regex) for s in src_list] - bm = self.apply( "_replace_list", src_list=src_list, dest_list=dest_list, - masks=masks, inplace=inplace, regex=regex, ) @@ -1882,80 +1854,6 @@ def _merge_blocks( return blocks -def _compare_or_regex_search( - a: ArrayLike, - b: Union[Scalar, Pattern], - regex: bool = False, - mask: Optional[ArrayLike] = None, -) -> Union[ArrayLike, bool]: - """ - Compare two array_like inputs of the same shape or two scalar values - - Calls operator.eq or re.search, depending on regex argument. If regex is - True, perform an element-wise regex matching. - - Parameters - ---------- - a : array_like - b : scalar or regex pattern - regex : bool, default False - mask : array_like or None (default) - - Returns - ------- - mask : array_like of bool - """ - - def _check_comparison_types( - result: Union[ArrayLike, bool], a: ArrayLike, b: Union[Scalar, Pattern] - ): - """ - Raises an error if the two arrays (a,b) cannot be compared. - Otherwise, returns the comparison result as expected. - """ - if is_scalar(result) and isinstance(a, np.ndarray): - type_names = [type(a).__name__, type(b).__name__] - - if isinstance(a, np.ndarray): - type_names[0] = f"ndarray(dtype={a.dtype})" - - raise TypeError( - f"Cannot compare types {repr(type_names[0])} and {repr(type_names[1])}" - ) - - if not regex: - op = lambda x: operator.eq(x, b) - else: - op = np.vectorize( - lambda x: bool(re.search(b, x)) - if isinstance(x, str) and isinstance(b, (str, Pattern)) - else False - ) - - # GH#32621 use mask to avoid comparing to NAs - if mask is None and isinstance(a, np.ndarray) and not isinstance(b, np.ndarray): - mask = np.reshape(~(isna(a)), a.shape) - if isinstance(a, np.ndarray): - a = a[mask] - - if is_datetimelike_v_numeric(a, b) or is_numeric_v_string_like(a, b): - # GH#29553 avoid deprecation warnings from numpy - _check_comparison_types(False, a, b) - return False - - result = op(a) - - if isinstance(result, np.ndarray) and mask is not None: - # The shape of the mask can differ to that of the result - # since we may compare only a subset of a's or b's elements - tmp = np.zeros(mask.shape, dtype=np.bool_) - tmp[mask] = result - result = tmp - - _check_comparison_types(result, a, b) - return result - - def _fast_count_smallints(arr: np.ndarray) -> np.ndarray: """Faster version of set(arr) for sequences of small numbers.""" counts = np.bincount(arr.astype(np.int_)) diff --git a/pandas/tests/frame/methods/test_replace.py b/pandas/tests/frame/methods/test_replace.py index 83dfd42ae2a6e..ea2488dfc0877 100644 --- a/pandas/tests/frame/methods/test_replace.py +++ b/pandas/tests/frame/methods/test_replace.py @@ -1131,8 +1131,19 @@ def test_replace_bool_with_bool(self): def test_replace_with_dict_with_bool_keys(self): df = DataFrame({0: [True, False], 1: [False, True]}) - with pytest.raises(TypeError, match="Cannot compare types .+"): - df.replace({"asdf": "asdb", True: "yes"}) + result = df.replace({"asdf": "asdb", True: "yes"}) + expected = DataFrame({0: ["yes", False], 1: [False, "yes"]}) + tm.assert_frame_equal(result, expected) + + def test_replace_dict_strings_vs_ints(self): + # GH#34789 + df = pd.DataFrame({"Y0": [1, 2], "Y1": [3, 4]}) + result = df.replace({"replace_string": "test"}) + + tm.assert_frame_equal(result, df) + + result = df["Y0"].replace({"replace_string": "test"}) + tm.assert_series_equal(result, df["Y0"]) def test_replace_truthy(self): df = DataFrame({"a": [True, True]}) diff --git a/pandas/tests/series/methods/test_replace.py b/pandas/tests/series/methods/test_replace.py index f78a28c66e946..02c2c0dca1873 100644 --- a/pandas/tests/series/methods/test_replace.py +++ b/pandas/tests/series/methods/test_replace.py @@ -218,8 +218,9 @@ def test_replace_bool_with_bool(self): def test_replace_with_dict_with_bool_keys(self): s = pd.Series([True, False, True]) - with pytest.raises(TypeError, match="Cannot compare types .+"): - s.replace({"asdf": "asdb", True: "yes"}) + result = s.replace({"asdf": "asdb", True: "yes"}) + expected = pd.Series(["yes", False, "yes"]) + tm.assert_series_equal(result, expected) def test_replace2(self): N = 100 From 839b757218bc998be901149460f79d797c0e9838 Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 3 Sep 2020 10:21:27 -0700 Subject: [PATCH 2/4] missing import --- pandas/core/array_algos/replace.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/core/array_algos/replace.py b/pandas/core/array_algos/replace.py index 341472e764e7f..32351a42df092 100644 --- a/pandas/core/array_algos/replace.py +++ b/pandas/core/array_algos/replace.py @@ -14,6 +14,7 @@ is_numeric_v_string_like, is_scalar, ) +from pandas.core.dtypes.missing import isna def compare_or_regex_search( From b683707bfb88af8daae24e77c444d915b1db3bb3 Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 3 Sep 2020 10:51:58 -0700 Subject: [PATCH 3/4] avoid specific BooleanArray special casing --- pandas/core/array_algos/replace.py | 2 +- pandas/core/internals/blocks.py | 9 ++++----- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/pandas/core/array_algos/replace.py b/pandas/core/array_algos/replace.py index 32351a42df092..6ac3cc1f9f2fe 100644 --- a/pandas/core/array_algos/replace.py +++ b/pandas/core/array_algos/replace.py @@ -77,7 +77,7 @@ def _check_comparison_types( # GH#29553 avoid deprecation warnings from numpy return np.zeros(a.shape, dtype=bool) - elif is_datetimelike_v_numeric(a, b) or is_numeric_v_string_like(a, b): + elif is_datetimelike_v_numeric(a, b): # GH#29553 avoid deprecation warnings from numpy _check_comparison_types(False, a, b) return False diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 66448cac63106..30ea2766e5133 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -62,7 +62,6 @@ from pandas.core.array_algos.replace import compare_or_regex_search from pandas.core.array_algos.transforms import shift from pandas.core.arrays import ( - BooleanArray, Categorical, DatetimeArray, ExtensionArray, @@ -2924,11 +2923,11 @@ def _extract_bool_array(mask: ArrayLike) -> np.ndarray: """ If we have a SparseArray or BooleanArray, convert it to ndarray[bool]. """ - if isinstance(mask, BooleanArray): - mask = mask.to_numpy(dtype=bool, na_value=False) - elif isinstance(mask, ExtensionArray): + if isinstance(mask, ExtensionArray): # We could have BooleanArray, Sparse[bool], ... - mask = np.asarray(mask, dtype=np.bool_) + # Except for BooleanArray, this is equivalent to just + # np.asarray(mask, dtype=bool) + mask = mask.to_numpy(dtype=bool, na_value=False) assert isinstance(mask, np.ndarray), type(mask) assert mask.dtype == bool, mask.dtype From 30be372ccdac418933d851e3f253412a06868418 Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 4 Sep 2020 09:55:20 -0700 Subject: [PATCH 4/4] update docs --- doc/source/user_guide/missing_data.rst | 26 -------------------------- pandas/core/generic.py | 14 -------------- 2 files changed, 40 deletions(-) diff --git a/doc/source/user_guide/missing_data.rst b/doc/source/user_guide/missing_data.rst index 2e68a0598bb71..28206192dd161 100644 --- a/doc/source/user_guide/missing_data.rst +++ b/doc/source/user_guide/missing_data.rst @@ -689,32 +689,6 @@ You can also operate on the DataFrame in place: df.replace(1.5, np.nan, inplace=True) -.. warning:: - - When replacing multiple ``bool`` or ``datetime64`` objects, the first - argument to ``replace`` (``to_replace``) must match the type of the value - being replaced. For example, - - .. code-block:: python - - >>> s = pd.Series([True, False, True]) - >>> s.replace({'a string': 'new value', True: False}) # raises - TypeError: Cannot compare types 'ndarray(dtype=bool)' and 'str' - - will raise a ``TypeError`` because one of the ``dict`` keys is not of the - correct type for replacement. - - However, when replacing a *single* object such as, - - .. ipython:: python - - s = pd.Series([True, False, True]) - s.replace('a string', 'another string') - - the original ``NDFrame`` object will be returned untouched. We're working on - unifying this API, but for backwards compatibility reasons we cannot break - the latter behavior. See :issue:`6354` for more details. - Missing data casting rules and indexing --------------------------------------- diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 6c8780a0fc186..7b8072279ce69 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -6559,20 +6559,6 @@ def replace( 1 new new 2 bait xyz - Note that when replacing multiple ``bool`` or ``datetime64`` objects, - the data types in the `to_replace` parameter must match the data - type of the value being replaced: - - >>> df = pd.DataFrame({{'A': [True, False, True], - ... 'B': [False, True, False]}}) - >>> df.replace({{'a string': 'new value', True: False}}) # raises - Traceback (most recent call last): - ... - TypeError: Cannot compare types 'ndarray(dtype=bool)' and 'str' - - This raises a ``TypeError`` because one of the ``dict`` keys is not of - the correct type for replacement. - Compare the behavior of ``s.replace({{'a': None}})`` and ``s.replace('a', None)`` to understand the peculiarities of the `to_replace` parameter: