Skip to content

Commit 866a7f6

Browse files
DEPR (string): non-bool na for obj.str.contains (pandas-dev#59615)
Co-authored-by: Joris Van den Bossche <[email protected]>
1 parent 3121121 commit 866a7f6

File tree

4 files changed, with 87 additions and 4 deletions.

doc/source/whatsnew/v2.3.0.rst

+1 -1
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@ notable_bug_fix1
5353

5454
Deprecations
5555
~~~~~~~~~~~~
56-
-
56+
- Deprecated allowing non-``bool`` values for ``na`` in :meth:`.str.contains`, :meth:`.str.startswith`, and :meth:`.str.endswith` for dtypes that do not already disallow these (:issue:`59615`)
5757
-
5858

5959
.. ---------------------------------------------------------------------------

pandas/core/arrays/string_arrow.py

+8
Original file line numberDiff line numberDiff line change
@@ -300,6 +300,14 @@ def _str_contains(
300300
result = pc.match_substring(self._pa_array, pat, ignore_case=not case)
301301
result = self._convert_bool_result(result, na=na)
302302
if not isna(na):
303+
if not isinstance(na, bool):
304+
# GH#59561
305+
warnings.warn(
306+
"Allowing a non-bool 'na' in obj.str.contains is deprecated "
307+
"and will raise in a future version.",
308+
FutureWarning,
309+
stacklevel=find_stack_level(),
310+
)
303311
result[isna(result)] = bool(na)
304312
return result
305313

pandas/core/strings/object_array.py

+26
Original file line numberDiff line numberDiff line change
@@ -10,12 +10,14 @@
1010
cast,
1111
)
1212
import unicodedata
13+
import warnings
1314

1415
import numpy as np
1516

1617
from pandas._libs import lib
1718
import pandas._libs.missing as libmissing
1819
import pandas._libs.ops as libops
20+
from pandas.util._exceptions import find_stack_level
1921

2022
from pandas.core.dtypes.missing import isna
2123

@@ -140,14 +142,38 @@ def _str_contains(
140142
else:
141143
upper_pat = pat.upper()
142144
f = lambda x: upper_pat in x.upper()
145+
if not isna(na) and not isinstance(na, bool):
146+
# GH#59561
147+
warnings.warn(
148+
"Allowing a non-bool 'na' in obj.str.contains is deprecated "
149+
"and will raise in a future version.",
150+
FutureWarning,
151+
stacklevel=find_stack_level(),
152+
)
143153
return self._str_map(f, na, dtype=np.dtype("bool"))
144154

145155
def _str_startswith(self, pat, na=None):
146156
f = lambda x: x.startswith(pat)
157+
if not isna(na) and not isinstance(na, bool):
158+
# GH#59561
159+
warnings.warn(
160+
"Allowing a non-bool 'na' in obj.str.startswith is deprecated "
161+
"and will raise in a future version.",
162+
FutureWarning,
163+
stacklevel=find_stack_level(),
164+
)
147165
return self._str_map(f, na_value=na, dtype=np.dtype(bool))
148166

149167
def _str_endswith(self, pat, na=None):
150168
f = lambda x: x.endswith(pat)
169+
if not isna(na) and not isinstance(na, bool):
170+
# GH#59561
171+
warnings.warn(
172+
"Allowing a non-bool 'na' in obj.str.endswith is deprecated "
173+
"and will raise in a future version.",
174+
FutureWarning,
175+
stacklevel=find_stack_level(),
176+
)
151177
return self._str_map(f, na_value=na, dtype=np.dtype(bool))
152178

153179
def _str_replace(

pandas/tests/strings/test_find_replace.py

+52 -3
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,9 @@
44
import numpy as np
55
import pytest
66

7+
from pandas._config import using_string_dtype
8+
9+
from pandas.compat import HAS_PYARROW
710
from pandas.errors import PerformanceWarning
811
import pandas.util._test_decorators as td
912

@@ -167,7 +170,16 @@ def test_contains_na_kwarg_for_nullable_string_dtype(
167170
# https://github.com/pandas-dev/pandas/pull/41025#issuecomment-824062416
168171

169172
values = Series(["a", "b", "c", "a", np.nan], dtype=nullable_string_dtype)
170-
result = values.str.contains("a", na=na, regex=regex)
173+
174+
msg = (
175+
"Allowing a non-bool 'na' in obj.str.contains is deprecated and "
176+
"will raise in a future version"
177+
)
178+
warn = None
179+
if not pd.isna(na) and not isinstance(na, bool):
180+
warn = FutureWarning
181+
with tm.assert_produces_warning(warn, match=msg):
182+
result = values.str.contains("a", na=na, regex=regex)
171183
expected = Series([True, False, False, True, expected], dtype="boolean")
172184
tm.assert_series_equal(result, expected)
173185

@@ -233,6 +245,7 @@ def test_contains_nan(any_string_dtype):
233245
expected = Series([True, True, True], dtype=expected_dtype)
234246
tm.assert_series_equal(result, expected)
235247

248+
# TODO(infer_string)
236249
# this particular combination of events is broken on 2.3
237250
# would require cherry picking #58483, which in turn requires #57481
238251
# which introduce many behavioral changes
@@ -241,14 +254,19 @@ def test_contains_nan(any_string_dtype):
241254
and any_string_dtype.storage == "python"
242255
and any_string_dtype.na_value is np.nan
243256
):
244-
result = s.str.contains("foo", na="foo")
257+
msg = (
258+
"Allowing a non-bool 'na' in obj.str.contains is deprecated and "
259+
"will raise in a future version"
260+
)
261+
with tm.assert_produces_warning(FutureWarning, match=msg):
262+
result = s.str.contains("foo", na="foo")
245263
if any_string_dtype == "object":
246264
expected = Series(["foo", "foo", "foo"], dtype=np.object_)
247265
elif any_string_dtype.na_value is np.nan:
248266
expected = Series([True, True, True], dtype=np.bool_)
249267
else:
250268
expected = Series([True, True, True], dtype="boolean")
251-
tm.assert_series_equal(result, expected)
269+
tm.assert_series_equal(result, expected)
252270

253271
result = s.str.contains("foo")
254272
expected_dtype = (
@@ -263,6 +281,37 @@ def test_contains_nan(any_string_dtype):
263281
# --------------------------------------------------------------------------------------
264282

265283

284+
@pytest.mark.xfail(
285+
using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)", strict=False
286+
)
287+
def test_startswith_endswith_validate_na(any_string_dtype):
288+
# GH#59615
289+
ser = Series(
290+
["om", np.nan, "foo_nom", "nom", "bar_foo", np.nan, "foo"],
291+
dtype=any_string_dtype,
292+
)
293+
294+
dtype = ser.dtype
295+
if (
296+
isinstance(dtype, pd.StringDtype) and dtype.storage == "python"
297+
) or dtype == np.dtype("object"):
298+
msg = "Allowing a non-bool 'na' in obj.str.startswith is deprecated"
299+
with tm.assert_produces_warning(FutureWarning, match=msg):
300+
ser.str.startswith("kapow", na="baz")
301+
msg = "Allowing a non-bool 'na' in obj.str.endswith is deprecated"
302+
with tm.assert_produces_warning(FutureWarning, match=msg):
303+
ser.str.endswith("bar", na="baz")
304+
else:
305+
# TODO(infer_string): don't surface pyarrow errors
306+
import pyarrow as pa
307+
308+
msg = "Could not convert 'baz' with type str: tried to convert to boolean"
309+
with pytest.raises(pa.lib.ArrowInvalid, match=msg):
310+
ser.str.startswith("kapow", na="baz")
311+
with pytest.raises(pa.lib.ArrowInvalid, match=msg):
312+
ser.str.endswith("kapow", na="baz")
313+
314+
266315
@pytest.mark.parametrize("pat", ["foo", ("foo", "baz")])
267316
@pytest.mark.parametrize("dtype", ["object", "category"])
268317
@pytest.mark.parametrize("null_value", [None, np.nan, pd.NA])

0 commit comments

Comments
 (0)