Skip to content

Commit b472080

Browse files
authored
BUG: replace with regex raising for StringDtype (#41343)
1 parent 15dff11 commit b472080

File tree

6 files changed

+47
-29
lines changed

6 files changed

+47
-29
lines changed

doc/source/whatsnew/v1.3.0.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -748,7 +748,7 @@ Strings
748748
^^^^^^^
749749

750750
- Bug in the conversion from ``pyarrow.ChunkedArray`` to :class:`~arrays.StringArray` when the original had zero chunks (:issue:`41040`)
751-
-
751+
- Bug in :meth:`Series.replace` and :meth:`DataFrame.replace` ignoring replacements with ``regex=True`` for ``StringDtype`` data (:issue:`41333`, :issue:`35977`)
752752

753753
Interval
754754
^^^^^^^^

pandas/conftest.py

+21
Original file line numberDiff line numberDiff line change
@@ -1153,6 +1153,27 @@ def object_dtype(request):
11531153
return request.param
11541154

11551155

1156+
@pytest.fixture(
1157+
params=[
1158+
"object",
1159+
"string",
1160+
pytest.param(
1161+
"arrow_string", marks=td.skip_if_no("pyarrow", min_version="1.0.0")
1162+
),
1163+
]
1164+
)
1165+
def any_string_dtype(request):
1166+
"""
1167+
Parametrized fixture for string dtypes.
1168+
* 'object'
1169+
* 'string'
1170+
* 'arrow_string'
1171+
"""
1172+
from pandas.core.arrays.string_arrow import ArrowStringDtype # noqa: F401
1173+
1174+
return request.param
1175+
1176+
11561177
@pytest.fixture(params=tm.DATETIME64_DTYPES)
11571178
def datetime64_dtype(request):
11581179
"""

pandas/core/array_algos/replace.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -149,7 +149,7 @@ def re_replacer(s):
149149
else:
150150
return s
151151

152-
f = np.vectorize(re_replacer, otypes=[values.dtype])
152+
f = np.vectorize(re_replacer, otypes=[np.object_])
153153

154154
if mask is None:
155155
values[:] = f(values)

pandas/core/internals/blocks.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@
4949
is_extension_array_dtype,
5050
is_list_like,
5151
is_sparse,
52+
is_string_dtype,
5253
pandas_dtype,
5354
)
5455
from pandas.core.dtypes.dtypes import (
@@ -788,7 +789,7 @@ def _replace_list(
788789

789790
src_len = len(pairs) - 1
790791

791-
if values.dtype == _dtype_obj:
792+
if is_string_dtype(values):
792793
# Calculate the mask once, prior to the call of comp
793794
# in order to avoid repeating the same computations
794795
mask = ~isna(values)

pandas/tests/frame/methods/test_replace.py

+22-3
Original file line numberDiff line numberDiff line change
@@ -563,10 +563,11 @@ def test_regex_replace_dict_nested(self, mix_abc):
563563
tm.assert_frame_equal(res3, expec)
564564
tm.assert_frame_equal(res4, expec)
565565

566-
def test_regex_replace_dict_nested_non_first_character(self):
566+
def test_regex_replace_dict_nested_non_first_character(self, any_string_dtype):
567567
# GH 25259
568-
df = DataFrame({"first": ["abc", "bca", "cab"]})
569-
expected = DataFrame({"first": [".bc", "bc.", "c.b"]})
568+
dtype = any_string_dtype
569+
df = DataFrame({"first": ["abc", "bca", "cab"]}, dtype=dtype)
570+
expected = DataFrame({"first": [".bc", "bc.", "c.b"]}, dtype=dtype)
570571
result = df.replace({"a": "."}, regex=True)
571572
tm.assert_frame_equal(result, expected)
572573

@@ -685,6 +686,24 @@ def test_replace_regex_metachar(self, metachar):
685686
expected = DataFrame({"a": ["paren", "else"]})
686687
tm.assert_frame_equal(result, expected)
687688

689+
@pytest.mark.parametrize(
690+
"data,to_replace,expected",
691+
[
692+
(["xax", "xbx"], {"a": "c", "b": "d"}, ["xcx", "xdx"]),
693+
(["d", "", ""], {r"^\s*$": pd.NA}, ["d", pd.NA, pd.NA]),
694+
],
695+
)
696+
def test_regex_replace_string_types(
697+
self, data, to_replace, expected, frame_or_series, any_string_dtype
698+
):
699+
# GH-41333, GH-35977
700+
dtype = any_string_dtype
701+
obj = frame_or_series(data, dtype=dtype)
702+
result = obj.replace(to_replace, regex=True)
703+
expected = frame_or_series(expected, dtype=dtype)
704+
705+
tm.assert_equal(result, expected)
706+
688707
def test_replace(self, datetime_frame):
689708
datetime_frame["A"][:5] = np.nan
690709
datetime_frame["A"][-5:] = np.nan

pandas/tests/strings/conftest.py

-23
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,6 @@
11
import numpy as np
22
import pytest
33

4-
import pandas.util._test_decorators as td
5-
64
from pandas import Series
75
from pandas.core import strings as strings
86

@@ -175,24 +173,3 @@ def any_allowed_skipna_inferred_dtype(request):
175173

176174
# correctness of inference tested in tests/dtypes/test_inference.py
177175
return inferred_dtype, values
178-
179-
180-
@pytest.fixture(
181-
params=[
182-
"object",
183-
"string",
184-
pytest.param(
185-
"arrow_string", marks=td.skip_if_no("pyarrow", min_version="1.0.0")
186-
),
187-
]
188-
)
189-
def any_string_dtype(request):
190-
"""
191-
Parametrized fixture for string dtypes.
192-
* 'object'
193-
* 'string'
194-
* 'arrow_string'
195-
"""
196-
from pandas.core.arrays.string_arrow import ArrowStringDtype # noqa: F401
197-
198-
return request.param

0 commit comments

Comments
 (0)