Skip to content

Commit 3967131

Browse files
authored
BUG: df.replace with numeric values and str to_replace (#36093)
1 parent 6f45304 commit 3967131

File tree

8 files changed

+136
-151
lines changed

8 files changed

+136
-151
lines changed

doc/source/user_guide/missing_data.rst

-26
Original file line numberDiff line numberDiff line change
@@ -689,32 +689,6 @@ You can also operate on the DataFrame in place:
689689
690690
df.replace(1.5, np.nan, inplace=True)
691691
692-
.. warning::
693-
694-
When replacing multiple ``bool`` or ``datetime64`` objects, the first
695-
argument to ``replace`` (``to_replace``) must match the type of the value
696-
being replaced. For example,
697-
698-
.. code-block:: python
699-
700-
>>> s = pd.Series([True, False, True])
701-
>>> s.replace({'a string': 'new value', True: False}) # raises
702-
TypeError: Cannot compare types 'ndarray(dtype=bool)' and 'str'
703-
704-
will raise a ``TypeError`` because one of the ``dict`` keys is not of the
705-
correct type for replacement.
706-
707-
However, when replacing a *single* object such as,
708-
709-
.. ipython:: python
710-
711-
s = pd.Series([True, False, True])
712-
s.replace('a string', 'another string')
713-
714-
the original ``NDFrame`` object will be returned untouched. We're working on
715-
unifying this API, but for backwards compatibility reasons we cannot break
716-
the latter behavior. See :issue:`6354` for more details.
717-
718692
Missing data casting rules and indexing
719693
---------------------------------------
720694

doc/source/whatsnew/v1.2.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -339,6 +339,7 @@ ExtensionArray
339339
Other
340340
^^^^^
341341
- Bug in :meth:`DataFrame.replace` and :meth:`Series.replace` incorrectly raising ``AssertionError`` instead of ``ValueError`` when invalid parameter combinations are passed (:issue:`36045`)
342+
- Bug in :meth:`DataFrame.replace` and :meth:`Series.replace` incorrectly raising ``TypeError`` when the values are numeric and ``to_replace`` contains a string (:issue:`34789`)
342343
-
343344

344345
.. ---------------------------------------------------------------------------

pandas/core/array_algos/replace.py

+95
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,95 @@
1+
"""
2+
Methods used by Block.replace and related methods.
3+
"""
4+
import operator
5+
import re
6+
from typing import Optional, Pattern, Union
7+
8+
import numpy as np
9+
10+
from pandas._typing import ArrayLike, Scalar
11+
12+
from pandas.core.dtypes.common import (
13+
is_datetimelike_v_numeric,
14+
is_numeric_v_string_like,
15+
is_scalar,
16+
)
17+
from pandas.core.dtypes.missing import isna
18+
19+
20+
def compare_or_regex_search(
    a: ArrayLike,
    b: Union[Scalar, Pattern],
    regex: bool = False,
    mask: Optional[ArrayLike] = None,
) -> Union[ArrayLike, bool]:
    """
    Compare the elements of `a` to `b`, by equality or by regex search.

    With ``regex=False`` the comparison is ``operator.eq(a, b)``. With
    ``regex=True`` each element of `a` is tested with ``re.search(b, x)``;
    elements that are not ``str`` never match, and nothing matches when `b`
    is neither a ``str`` nor a compiled pattern.

    When `a` is an ndarray, only the elements selected by `mask` are
    compared; all other positions are False in the result.

    Parameters
    ----------
    a : array_like
        Values to search.
    b : scalar or regex pattern
        Value (or pattern, if `regex` is True) to look for.
    regex : bool, default False
        Whether to perform an element-wise regex search instead of an
        equality comparison.
    mask : array_like or None (default)
        Boolean selector of the elements of `a` to compare. If None, `a` is
        an ndarray and `b` is not, it is computed as ``~isna(a)``.

    Returns
    -------
    mask : array_like of bool
        The scalar ``False`` is returned instead when
        ``is_datetimelike_v_numeric(a, b)`` holds and `a` is not an ndarray.

    Raises
    ------
    TypeError
        If `a` is an ndarray but comparing it to `b` produced a scalar, i.e.
        the two types cannot be compared element-wise.
    """

    def _check_comparison_types(
        result: Union[ArrayLike, bool], a: ArrayLike, b: Union[Scalar, Pattern]
    ):
        """
        Raise TypeError if comparing ndarray `a` to `b` gave a scalar result.
        """
        if is_scalar(result) and isinstance(a, np.ndarray):
            a_name = f"ndarray(dtype={a.dtype})"
            b_name = type(b).__name__
            raise TypeError(
                f"Cannot compare types {repr(a_name)} and {repr(b_name)}"
            )

    if regex:
        # Loop-invariant: re.search is only attempted for str/Pattern `b`.
        b_is_pattern = isinstance(b, (str, Pattern))

        def _search(x) -> bool:
            return b_is_pattern and isinstance(x, str) and bool(re.search(b, x))

        # otypes must be given explicitly: without it np.vectorize raises
        # ValueError on size-0 input, which happens whenever `mask` selects
        # no elements (e.g. every value is NA).
        op = np.vectorize(_search, otypes=[bool])
    else:

        def op(x):
            return operator.eq(x, b)

    # GH#32621 use mask to avoid comparing to NAs
    if mask is None and isinstance(a, np.ndarray) and not isinstance(b, np.ndarray):
        mask = np.reshape(~(isna(a)), a.shape)
    if isinstance(a, np.ndarray):
        a = a[mask]

    if is_numeric_v_string_like(a, b):
        # GH#29553 avoid deprecation warnings from numpy
        # `a` may already have been reduced to the masked subset above, so
        # size the all-False result from the mask to keep the input's shape.
        shape = a.shape if mask is None else mask.shape
        return np.zeros(shape, dtype=bool)

    if is_datetimelike_v_numeric(a, b):
        # GH#29553 avoid deprecation warnings from numpy
        _check_comparison_types(False, a, b)
        return False

    result = op(a)

    if isinstance(result, np.ndarray) and mask is not None:
        # The shape of the mask can differ to that of the result
        # since we may compare only a subset of a's or b's elements
        full = np.zeros(mask.shape, dtype=np.bool_)
        full[mask] = result
        result = full

    _check_comparison_types(result, a, b)
    return result

pandas/core/generic.py

-14
Original file line numberDiff line numberDiff line change
@@ -6561,20 +6561,6 @@ def replace(
65616561
1 new new
65626562
2 bait xyz
65636563
6564-
Note that when replacing multiple ``bool`` or ``datetime64`` objects,
6565-
the data types in the `to_replace` parameter must match the data
6566-
type of the value being replaced:
6567-
6568-
>>> df = pd.DataFrame({{'A': [True, False, True],
6569-
... 'B': [False, True, False]}})
6570-
>>> df.replace({{'a string': 'new value', True: False}}) # raises
6571-
Traceback (most recent call last):
6572-
...
6573-
TypeError: Cannot compare types 'ndarray(dtype=bool)' and 'str'
6574-
6575-
This raises a ``TypeError`` because one of the ``dict`` keys is not of
6576-
the correct type for replacement.
6577-
65786564
Compare the behavior of ``s.replace({{'a': None}})`` and
65796565
``s.replace('a', None)`` to understand the peculiarities
65806566
of the `to_replace` parameter:

pandas/core/internals/blocks.py

+23-4
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
from pandas._libs.internals import BlockPlacement
1212
from pandas._libs.tslibs import conversion
1313
from pandas._libs.tslibs.timezones import tz_compare
14-
from pandas._typing import ArrayLike
14+
from pandas._typing import ArrayLike, Scalar
1515
from pandas.util._validators import validate_bool_kwarg
1616

1717
from pandas.core.dtypes.cast import (
@@ -59,6 +59,7 @@
5959
from pandas.core.dtypes.missing import is_valid_nat_for_dtype, isna, isna_compat
6060

6161
import pandas.core.algorithms as algos
62+
from pandas.core.array_algos.replace import compare_or_regex_search
6263
from pandas.core.array_algos.transforms import shift
6364
from pandas.core.arrays import (
6465
Categorical,
@@ -792,7 +793,6 @@ def _replace_list(
792793
self,
793794
src_list: List[Any],
794795
dest_list: List[Any],
795-
masks: List[np.ndarray],
796796
inplace: bool = False,
797797
regex: bool = False,
798798
) -> List["Block"]:
@@ -801,11 +801,28 @@ def _replace_list(
801801
"""
802802
src_len = len(src_list) - 1
803803

804+
def comp(s: Scalar, mask: np.ndarray, regex: bool = False) -> np.ndarray:
    """
    Build a boolean array marking where this block's values match `s`.

    A non-NA `s` is boxed with ``com.maybe_box_datetimelike`` and matched
    through ``compare_or_regex_search``, either by equality or, when
    `regex` is True, by element-wise regular expression search. An NA `s`
    yields the inverse of `mask`.
    """
    if not isna(s):
        boxed = com.maybe_box_datetimelike(s)
        return compare_or_regex_search(self.values, boxed, regex, mask)

    # NA target: select exactly the positions that `mask` leaves out
    return ~mask
814+
815+
# Calculate the mask once, prior to the call of comp
816+
# in order to avoid repeating the same computations
817+
mask = ~isna(self.values)
818+
819+
masks = [comp(s, mask, regex) for s in src_list]
820+
804821
rb = [self if inplace else self.copy()]
805822
for i, (src, dest) in enumerate(zip(src_list, dest_list)):
806823
new_rb: List["Block"] = []
807824
for blk in rb:
808-
m = masks[i][blk.mgr_locs.indexer]
825+
m = masks[i]
809826
convert = i == src_len # only convert once at the end
810827
result = blk._replace_coerce(
811828
mask=m,
@@ -2908,7 +2925,9 @@ def _extract_bool_array(mask: ArrayLike) -> np.ndarray:
29082925
"""
29092926
if isinstance(mask, ExtensionArray):
29102927
# We could have BooleanArray, Sparse[bool], ...
2911-
mask = np.asarray(mask, dtype=np.bool_)
2928+
# Except for BooleanArray, this is equivalent to just
2929+
# np.asarray(mask, dtype=bool)
2930+
mask = mask.to_numpy(dtype=bool, na_value=False)
29122931

29132932
assert isinstance(mask, np.ndarray), type(mask)
29142933
assert mask.dtype == bool, mask.dtype

pandas/core/internals/managers.py

+1-103
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,11 @@
11
from collections import defaultdict
22
import itertools
3-
import operator
4-
import re
53
from typing import (
64
Any,
75
DefaultDict,
86
Dict,
97
List,
108
Optional,
11-
Pattern,
129
Sequence,
1310
Tuple,
1411
TypeVar,
@@ -19,7 +16,7 @@
1916
import numpy as np
2017

2118
from pandas._libs import internals as libinternals, lib
22-
from pandas._typing import ArrayLike, DtypeObj, Label, Scalar
19+
from pandas._typing import ArrayLike, DtypeObj, Label
2320
from pandas.util._validators import validate_bool_kwarg
2421

2522
from pandas.core.dtypes.cast import (
@@ -29,12 +26,9 @@
2926
)
3027
from pandas.core.dtypes.common import (
3128
DT64NS_DTYPE,
32-
is_datetimelike_v_numeric,
3329
is_dtype_equal,
3430
is_extension_array_dtype,
3531
is_list_like,
36-
is_numeric_v_string_like,
37-
is_scalar,
3832
)
3933
from pandas.core.dtypes.concat import concat_compat
4034
from pandas.core.dtypes.dtypes import ExtensionDtype
@@ -44,7 +38,6 @@
4438
import pandas.core.algorithms as algos
4539
from pandas.core.arrays.sparse import SparseDtype
4640
from pandas.core.base import PandasObject
47-
import pandas.core.common as com
4841
from pandas.core.construction import extract_array
4942
from pandas.core.indexers import maybe_convert_indices
5043
from pandas.core.indexes.api import Index, ensure_index
@@ -628,31 +621,10 @@ def replace_list(
628621
""" do a list replace """
629622
inplace = validate_bool_kwarg(inplace, "inplace")
630623

631-
# figure out our mask apriori to avoid repeated replacements
632-
values = self.as_array()
633-
634-
def comp(s: Scalar, mask: np.ndarray, regex: bool = False):
635-
"""
636-
Generate a bool array by perform an equality check, or perform
637-
an element-wise regular expression matching
638-
"""
639-
if isna(s):
640-
return ~mask
641-
642-
s = com.maybe_box_datetimelike(s)
643-
return _compare_or_regex_search(values, s, regex, mask)
644-
645-
# Calculate the mask once, prior to the call of comp
646-
# in order to avoid repeating the same computations
647-
mask = ~isna(values)
648-
649-
masks = [comp(s, mask, regex) for s in src_list]
650-
651624
bm = self.apply(
652625
"_replace_list",
653626
src_list=src_list,
654627
dest_list=dest_list,
655-
masks=masks,
656628
inplace=inplace,
657629
regex=regex,
658630
)
@@ -1900,80 +1872,6 @@ def _merge_blocks(
19001872
return blocks
19011873

19021874

1903-
def _compare_or_regex_search(
1904-
a: ArrayLike,
1905-
b: Union[Scalar, Pattern],
1906-
regex: bool = False,
1907-
mask: Optional[ArrayLike] = None,
1908-
) -> Union[ArrayLike, bool]:
1909-
"""
1910-
Compare two array_like inputs of the same shape or two scalar values
1911-
1912-
Calls operator.eq or re.search, depending on regex argument. If regex is
1913-
True, perform an element-wise regex matching.
1914-
1915-
Parameters
1916-
----------
1917-
a : array_like
1918-
b : scalar or regex pattern
1919-
regex : bool, default False
1920-
mask : array_like or None (default)
1921-
1922-
Returns
1923-
-------
1924-
mask : array_like of bool
1925-
"""
1926-
1927-
def _check_comparison_types(
1928-
result: Union[ArrayLike, bool], a: ArrayLike, b: Union[Scalar, Pattern]
1929-
):
1930-
"""
1931-
Raises an error if the two arrays (a,b) cannot be compared.
1932-
Otherwise, returns the comparison result as expected.
1933-
"""
1934-
if is_scalar(result) and isinstance(a, np.ndarray):
1935-
type_names = [type(a).__name__, type(b).__name__]
1936-
1937-
if isinstance(a, np.ndarray):
1938-
type_names[0] = f"ndarray(dtype={a.dtype})"
1939-
1940-
raise TypeError(
1941-
f"Cannot compare types {repr(type_names[0])} and {repr(type_names[1])}"
1942-
)
1943-
1944-
if not regex:
1945-
op = lambda x: operator.eq(x, b)
1946-
else:
1947-
op = np.vectorize(
1948-
lambda x: bool(re.search(b, x))
1949-
if isinstance(x, str) and isinstance(b, (str, Pattern))
1950-
else False
1951-
)
1952-
1953-
# GH#32621 use mask to avoid comparing to NAs
1954-
if mask is None and isinstance(a, np.ndarray) and not isinstance(b, np.ndarray):
1955-
mask = np.reshape(~(isna(a)), a.shape)
1956-
if isinstance(a, np.ndarray):
1957-
a = a[mask]
1958-
1959-
if is_datetimelike_v_numeric(a, b) or is_numeric_v_string_like(a, b):
1960-
# GH#29553 avoid deprecation warnings from numpy
1961-
_check_comparison_types(False, a, b)
1962-
return False
1963-
1964-
result = op(a)
1965-
1966-
if isinstance(result, np.ndarray) and mask is not None:
1967-
# The shape of the mask can differ to that of the result
1968-
# since we may compare only a subset of a's or b's elements
1969-
tmp = np.zeros(mask.shape, dtype=np.bool_)
1970-
tmp[mask] = result
1971-
result = tmp
1972-
1973-
_check_comparison_types(result, a, b)
1974-
return result
1975-
1976-
19771875
def _fast_count_smallints(arr: np.ndarray) -> np.ndarray:
19781876
"""Faster version of set(arr) for sequences of small numbers."""
19791877
counts = np.bincount(arr.astype(np.int_))

0 commit comments

Comments
 (0)