Skip to content

BUG: df.replace with numeric values and str to_replace #36093

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 6 commits into from
Sep 5, 2020
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.2.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -335,6 +335,7 @@ ExtensionArray
Other
^^^^^
- Bug in :meth:`DataFrame.replace` and :meth:`Series.replace` incorrectly raising ``AssertionError`` instead of ``ValueError`` when invalid parameter combinations are passed (:issue:`36045`)
- Bug in :meth:`DataFrame.replace` and :meth:`Series.replace` with numeric values and string ``to_replace`` (:issue:`34789`)
-

.. ---------------------------------------------------------------------------
Expand Down
94 changes: 94 additions & 0 deletions pandas/core/array_algos/replace.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
"""
Methods used by Block.replace and related methods.
"""
import operator
import re
from typing import Optional, Pattern, Union

import numpy as np

from pandas._typing import ArrayLike, Scalar

from pandas.core.dtypes.common import (
is_datetimelike_v_numeric,
is_numeric_v_string_like,
is_scalar,
)


def compare_or_regex_search(
a: ArrayLike,
b: Union[Scalar, Pattern],
regex: bool = False,
mask: Optional[ArrayLike] = None,
) -> Union[ArrayLike, bool]:
"""
Compare two array_like inputs of the same shape or two scalar values

Calls operator.eq or re.search, depending on regex argument. If regex is
True, perform an element-wise regex matching.

Parameters
----------
a : array_like
b : scalar or regex pattern
regex : bool, default False
mask : array_like or None (default)

Returns
-------
mask : array_like of bool
"""

def _check_comparison_types(
result: Union[ArrayLike, bool], a: ArrayLike, b: Union[Scalar, Pattern]
):
"""
Raises an error if the two arrays (a,b) cannot be compared.
Otherwise, returns the comparison result as expected.
"""
if is_scalar(result) and isinstance(a, np.ndarray):
type_names = [type(a).__name__, type(b).__name__]

if isinstance(a, np.ndarray):
type_names[0] = f"ndarray(dtype={a.dtype})"

raise TypeError(
f"Cannot compare types {repr(type_names[0])} and {repr(type_names[1])}"
)

if not regex:
op = lambda x: operator.eq(x, b)
else:
op = np.vectorize(
lambda x: bool(re.search(b, x))
if isinstance(x, str) and isinstance(b, (str, Pattern))
else False
)

# GH#32621 use mask to avoid comparing to NAs
if mask is None and isinstance(a, np.ndarray) and not isinstance(b, np.ndarray):
mask = np.reshape(~(isna(a)), a.shape)
if isinstance(a, np.ndarray):
a = a[mask]

if is_numeric_v_string_like(a, b):
# GH#29553 avoid deprecation warnings from numpy
return np.zeros(a.shape, dtype=bool)

elif is_datetimelike_v_numeric(a, b) or is_numeric_v_string_like(a, b):
# GH#29553 avoid deprecation warnings from numpy
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Related to this PR?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, though there is also a mistake here (the second condition has been refactored to a few lines up, so this line should just be elif is_datetimelike_v_numeric(a, b):

In master this is where we incorrectly raise instead of just consider string==numeric not-equal

_check_comparison_types(False, a, b)
return False

result = op(a)

if isinstance(result, np.ndarray) and mask is not None:
# The shape of the mask can differ to that of the result
# since we may compare only a subset of a's or b's elements
tmp = np.zeros(mask.shape, dtype=np.bool_)
tmp[mask] = result
result = tmp

_check_comparison_types(result, a, b)
return result
28 changes: 24 additions & 4 deletions pandas/core/internals/blocks.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from pandas._libs.internals import BlockPlacement
from pandas._libs.tslibs import conversion
from pandas._libs.tslibs.timezones import tz_compare
from pandas._typing import ArrayLike
from pandas._typing import ArrayLike, Scalar
from pandas.util._validators import validate_bool_kwarg

from pandas.core.dtypes.cast import (
Expand Down Expand Up @@ -59,8 +59,10 @@
from pandas.core.dtypes.missing import _isna_compat, is_valid_nat_for_dtype, isna

import pandas.core.algorithms as algos
from pandas.core.array_algos.replace import compare_or_regex_search
from pandas.core.array_algos.transforms import shift
from pandas.core.arrays import (
BooleanArray,
Categorical,
DatetimeArray,
ExtensionArray,
Expand Down Expand Up @@ -792,7 +794,6 @@ def _replace_list(
self,
src_list: List[Any],
dest_list: List[Any],
masks: List[np.ndarray],
inplace: bool = False,
regex: bool = False,
) -> List["Block"]:
Expand All @@ -801,11 +802,28 @@ def _replace_list(
"""
src_len = len(src_list) - 1

def comp(s: Scalar, mask: np.ndarray, regex: bool = False) -> np.ndarray:
"""
Generate a bool array by perform an equality check, or perform
an element-wise regular expression matching
"""
if isna(s):
return ~mask

s = com.maybe_box_datetimelike(s)
return compare_or_regex_search(self.values, s, regex, mask)

# Calculate the mask once, prior to the call of comp
# in order to avoid repeating the same computations
mask = ~isna(self.values)

masks = [comp(s, mask, regex) for s in src_list]

rb = [self if inplace else self.copy()]
for i, (src, dest) in enumerate(zip(src_list, dest_list)):
new_rb: List["Block"] = []
for blk in rb:
m = masks[i][blk.mgr_locs.indexer]
m = masks[i]
convert = i == src_len # only convert once at the end
result = blk._replace_coerce(
mask=m,
Expand Down Expand Up @@ -2906,7 +2924,9 @@ def _extract_bool_array(mask: ArrayLike) -> np.ndarray:
"""
If we have a SparseArray or BooleanArray, convert it to ndarray[bool].
"""
if isinstance(mask, ExtensionArray):
if isinstance(mask, BooleanArray):
mask = mask.to_numpy(dtype=bool, na_value=False)
elif isinstance(mask, ExtensionArray):
# We could have BooleanArray, Sparse[bool], ...
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think need to update this comment now though - so is there no way to keep this in the same branch as the ExtensionArray check? Would be nice to stay as generic as possible

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'll see if we can use to_numpy in the general case

mask = np.asarray(mask, dtype=np.bool_)

Expand Down
104 changes: 1 addition & 103 deletions pandas/core/internals/managers.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,11 @@
from collections import defaultdict
import itertools
import operator
import re
from typing import (
Any,
DefaultDict,
Dict,
List,
Optional,
Pattern,
Sequence,
Tuple,
TypeVar,
Expand All @@ -19,7 +16,7 @@
import numpy as np

from pandas._libs import internals as libinternals, lib
from pandas._typing import ArrayLike, DtypeObj, Label, Scalar
from pandas._typing import ArrayLike, DtypeObj, Label
from pandas.util._validators import validate_bool_kwarg

from pandas.core.dtypes.cast import (
Expand All @@ -29,12 +26,9 @@
)
from pandas.core.dtypes.common import (
DT64NS_DTYPE,
is_datetimelike_v_numeric,
is_dtype_equal,
is_extension_array_dtype,
is_list_like,
is_numeric_v_string_like,
is_scalar,
)
from pandas.core.dtypes.concat import concat_compat
from pandas.core.dtypes.dtypes import ExtensionDtype
Expand All @@ -44,7 +38,6 @@
import pandas.core.algorithms as algos
from pandas.core.arrays.sparse import SparseDtype
from pandas.core.base import PandasObject
import pandas.core.common as com
from pandas.core.construction import extract_array
from pandas.core.indexers import maybe_convert_indices
from pandas.core.indexes.api import Index, ensure_index
Expand Down Expand Up @@ -628,31 +621,10 @@ def replace_list(
""" do a list replace """
inplace = validate_bool_kwarg(inplace, "inplace")

# figure out our mask apriori to avoid repeated replacements
values = self.as_array()

def comp(s: Scalar, mask: np.ndarray, regex: bool = False):
"""
Generate a bool array by perform an equality check, or perform
an element-wise regular expression matching
"""
if isna(s):
return ~mask

s = com.maybe_box_datetimelike(s)
return _compare_or_regex_search(values, s, regex, mask)

# Calculate the mask once, prior to the call of comp
# in order to avoid repeating the same computations
mask = ~isna(values)

masks = [comp(s, mask, regex) for s in src_list]

bm = self.apply(
"_replace_list",
src_list=src_list,
dest_list=dest_list,
masks=masks,
inplace=inplace,
regex=regex,
)
Expand Down Expand Up @@ -1900,80 +1872,6 @@ def _merge_blocks(
return blocks


def _compare_or_regex_search(
a: ArrayLike,
b: Union[Scalar, Pattern],
regex: bool = False,
mask: Optional[ArrayLike] = None,
) -> Union[ArrayLike, bool]:
"""
Compare two array_like inputs of the same shape or two scalar values

Calls operator.eq or re.search, depending on regex argument. If regex is
True, perform an element-wise regex matching.

Parameters
----------
a : array_like
b : scalar or regex pattern
regex : bool, default False
mask : array_like or None (default)

Returns
-------
mask : array_like of bool
"""

def _check_comparison_types(
result: Union[ArrayLike, bool], a: ArrayLike, b: Union[Scalar, Pattern]
):
"""
Raises an error if the two arrays (a,b) cannot be compared.
Otherwise, returns the comparison result as expected.
"""
if is_scalar(result) and isinstance(a, np.ndarray):
type_names = [type(a).__name__, type(b).__name__]

if isinstance(a, np.ndarray):
type_names[0] = f"ndarray(dtype={a.dtype})"

raise TypeError(
f"Cannot compare types {repr(type_names[0])} and {repr(type_names[1])}"
)

if not regex:
op = lambda x: operator.eq(x, b)
else:
op = np.vectorize(
lambda x: bool(re.search(b, x))
if isinstance(x, str) and isinstance(b, (str, Pattern))
else False
)

# GH#32621 use mask to avoid comparing to NAs
if mask is None and isinstance(a, np.ndarray) and not isinstance(b, np.ndarray):
mask = np.reshape(~(isna(a)), a.shape)
if isinstance(a, np.ndarray):
a = a[mask]

if is_datetimelike_v_numeric(a, b) or is_numeric_v_string_like(a, b):
# GH#29553 avoid deprecation warnings from numpy
_check_comparison_types(False, a, b)
return False

result = op(a)

if isinstance(result, np.ndarray) and mask is not None:
# The shape of the mask can differ to that of the result
# since we may compare only a subset of a's or b's elements
tmp = np.zeros(mask.shape, dtype=np.bool_)
tmp[mask] = result
result = tmp

_check_comparison_types(result, a, b)
return result


def _fast_count_smallints(arr: np.ndarray) -> np.ndarray:
"""Faster version of set(arr) for sequences of small numbers."""
counts = np.bincount(arr.astype(np.int_))
Expand Down
15 changes: 13 additions & 2 deletions pandas/tests/frame/methods/test_replace.py
Original file line number Diff line number Diff line change
Expand Up @@ -1131,8 +1131,19 @@ def test_replace_bool_with_bool(self):

def test_replace_with_dict_with_bool_keys(self):
df = DataFrame({0: [True, False], 1: [False, True]})
with pytest.raises(TypeError, match="Cannot compare types .+"):
df.replace({"asdf": "asdb", True: "yes"})
result = df.replace({"asdf": "asdb", True: "yes"})
expected = DataFrame({0: ["yes", False], 1: [False, "yes"]})
tm.assert_frame_equal(result, expected)

def test_replace_dict_strings_vs_ints(self):
# GH#34789
df = pd.DataFrame({"Y0": [1, 2], "Y1": [3, 4]})
result = df.replace({"replace_string": "test"})

tm.assert_frame_equal(result, df)

result = df["Y0"].replace({"replace_string": "test"})
tm.assert_series_equal(result, df["Y0"])

def test_replace_truthy(self):
df = DataFrame({"a": [True, True]})
Expand Down
5 changes: 3 additions & 2 deletions pandas/tests/series/methods/test_replace.py
Original file line number Diff line number Diff line change
Expand Up @@ -218,8 +218,9 @@ def test_replace_bool_with_bool(self):

def test_replace_with_dict_with_bool_keys(self):
s = pd.Series([True, False, True])
with pytest.raises(TypeError, match="Cannot compare types .+"):
s.replace({"asdf": "asdb", True: "yes"})
result = s.replace({"asdf": "asdb", True: "yes"})
expected = pd.Series(["yes", False, "yes"])
tm.assert_series_equal(result, expected)

def test_replace2(self):
N = 100
Expand Down