diff --git a/doc/source/whatsnew/v2.3.0.rst b/doc/source/whatsnew/v2.3.0.rst index d1881bf04826f..b8fbd2927bdde 100644 --- a/doc/source/whatsnew/v2.3.0.rst +++ b/doc/source/whatsnew/v2.3.0.rst @@ -53,7 +53,7 @@ notable_bug_fix1 Deprecations ~~~~~~~~~~~~ -- +- Deprecated allowing non-``bool`` values for ``na`` in :meth:`.str.contains`, :meth:`.str.startswith`, and :meth:`.str.endswith` for dtypes that do not already disallow these (:issue:`59615`) - .. --------------------------------------------------------------------------- diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 67114815341b6..9aff000a5bb40 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -6,6 +6,7 @@ TYPE_CHECKING, Union, ) +import warnings import numpy as np @@ -19,6 +20,7 @@ pa_version_under10p1, pa_version_under13p0, ) +from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.common import ( is_scalar, @@ -295,6 +297,14 @@ def _str_contains( result = pc.match_substring(self._pa_array, pat, ignore_case=not case) result = self._result_converter(result, na=na) if not isna(na): + if not isinstance(na, bool): + # GH#59615 + warnings.warn( + "Allowing a non-bool 'na' in obj.str.contains is deprecated " + "and will raise in a future version.", + FutureWarning, + stacklevel=find_stack_level(), + ) result[isna(result)] = bool(na) return result diff --git a/pandas/core/strings/object_array.py b/pandas/core/strings/object_array.py index 100afa956bd24..c6b18d7049c57 100644 --- a/pandas/core/strings/object_array.py +++ b/pandas/core/strings/object_array.py @@ -9,12 +9,14 @@ cast, ) import unicodedata +import warnings import numpy as np from pandas._libs import lib import pandas._libs.missing as libmissing import pandas._libs.ops as libops +from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.missing import isna @@ -142,14 +144,38 @@ def _str_contains( else: upper_pat = pat.upper() f = 
lambda x: upper_pat in x.upper() + if not isna(na) and not isinstance(na, bool): + # GH#59615 + warnings.warn( + "Allowing a non-bool 'na' in obj.str.contains is deprecated " + "and will raise in a future version.", + FutureWarning, + stacklevel=find_stack_level(), + ) return self._str_map(f, na, dtype=np.dtype("bool")) def _str_startswith(self, pat, na=None): f = lambda x: x.startswith(pat) + if not isna(na) and not isinstance(na, bool): + # GH#59615 + warnings.warn( + "Allowing a non-bool 'na' in obj.str.startswith is deprecated " + "and will raise in a future version.", + FutureWarning, + stacklevel=find_stack_level(), + ) return self._str_map(f, na_value=na, dtype=np.dtype(bool)) def _str_endswith(self, pat, na=None): f = lambda x: x.endswith(pat) + if not isna(na) and not isinstance(na, bool): + # GH#59615 + warnings.warn( + "Allowing a non-bool 'na' in obj.str.endswith is deprecated " + "and will raise in a future version.", + FutureWarning, + stacklevel=find_stack_level(), + ) return self._str_map(f, na_value=na, dtype=np.dtype(bool)) def _str_replace( diff --git a/pandas/tests/strings/test_find_replace.py b/pandas/tests/strings/test_find_replace.py index 00677ef4fcfe9..bf01c4996bb32 100644 --- a/pandas/tests/strings/test_find_replace.py +++ b/pandas/tests/strings/test_find_replace.py @@ -166,7 +166,16 @@ def test_contains_na_kwarg_for_nullable_string_dtype( # https://github.com/pandas-dev/pandas/pull/41025#issuecomment-824062416 values = Series(["a", "b", "c", "a", np.nan], dtype=nullable_string_dtype) - result = values.str.contains("a", na=na, regex=regex) + + msg = ( + "Allowing a non-bool 'na' in obj.str.contains is deprecated and " + "will raise in a future version" + ) + warn = None + if not pd.isna(na) and not isinstance(na, bool): + warn = FutureWarning + with tm.assert_produces_warning(warn, match=msg): + result = values.str.contains("a", na=na, regex=regex) expected = Series([True, False, False, True, expected], dtype="boolean") 
tm.assert_series_equal(result, expected) @@ -232,7 +241,12 @@ def test_contains_nan(any_string_dtype): expected = Series([True, True, True], dtype=expected_dtype) tm.assert_series_equal(result, expected) - result = s.str.contains("foo", na="foo") + msg = ( + "Allowing a non-bool 'na' in obj.str.contains is deprecated and " + "will raise in a future version" + ) + with tm.assert_produces_warning(FutureWarning, match=msg): + result = s.str.contains("foo", na="foo") if any_string_dtype == "object": expected = Series(["foo", "foo", "foo"], dtype=np.object_) elif any_string_dtype.na_value is np.nan: @@ -254,6 +268,34 @@ def test_contains_nan(any_string_dtype): # -------------------------------------------------------------------------------------- +def test_startswith_endswith_validate_na(any_string_dtype): + # GH#59615 + ser = Series( + ["om", np.nan, "foo_nom", "nom", "bar_foo", np.nan, "foo"], + dtype=any_string_dtype, + ) + + dtype = ser.dtype + if ( + isinstance(dtype, pd.StringDtype) and dtype.storage == "python" + ) or dtype == np.dtype("object"): + msg = "Allowing a non-bool 'na' in obj.str.startswith is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + ser.str.startswith("kapow", na="baz") + msg = "Allowing a non-bool 'na' in obj.str.endswith is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + ser.str.endswith("bar", na="baz") + else: + # TODO(infer_string): don't surface pyarrow errors + import pyarrow as pa + + msg = "Could not convert 'baz' with type str: tried to convert to boolean" + with pytest.raises(pa.lib.ArrowInvalid, match=msg): + ser.str.startswith("kapow", na="baz") + with pytest.raises(pa.lib.ArrowInvalid, match=msg): + ser.str.endswith("kapow", na="baz") + + @pytest.mark.parametrize("pat", ["foo", ("foo", "baz")]) @pytest.mark.parametrize("dtype", ["object", "category"]) @pytest.mark.parametrize("null_value", [None, np.nan, pd.NA])