From b5bfac73d32913a5a723d8a0f6468d25341176f8 Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 26 Aug 2024 08:43:28 -0700 Subject: [PATCH 1/4] DEPR: non-bool na for obj.str.contains --- pandas/core/arrays/string_arrow.py | 10 ++++++++++ pandas/core/strings/object_array.py | 10 ++++++++++ pandas/tests/strings/test_find_replace.py | 18 ++++++++++++++++-- 3 files changed, 36 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 67114815341b6..9aff000a5bb40 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -6,6 +6,7 @@ TYPE_CHECKING, Union, ) +import warnings import numpy as np @@ -19,6 +20,7 @@ pa_version_under10p1, pa_version_under13p0, ) +from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.common import ( is_scalar, @@ -295,6 +297,14 @@ def _str_contains( result = pc.match_substring(self._pa_array, pat, ignore_case=not case) result = self._result_converter(result, na=na) if not isna(na): + if not isinstance(na, bool): + # GH#59561 + warnings.warn( + "Allowing a non-bool 'na' in obj.str.contains is deprecated " + "and will raise in a future version.", + FutureWarning, + stacklevel=find_stack_level(), + ) result[isna(result)] = bool(na) return result diff --git a/pandas/core/strings/object_array.py b/pandas/core/strings/object_array.py index 100afa956bd24..a754c57558adc 100644 --- a/pandas/core/strings/object_array.py +++ b/pandas/core/strings/object_array.py @@ -9,12 +9,14 @@ cast, ) import unicodedata +import warnings import numpy as np from pandas._libs import lib import pandas._libs.missing as libmissing import pandas._libs.ops as libops +from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.missing import isna @@ -142,6 +144,14 @@ def _str_contains( else: upper_pat = pat.upper() f = lambda x: upper_pat in x.upper() + if not isna(na) and not isinstance(na, bool): + # GH#59561 + warnings.warn( + "Allowing a non-bool 'na' in obj.str.contains is deprecated " + "and will raise in a future version.", + FutureWarning, + stacklevel=find_stack_level(), + ) return self._str_map(f, na, dtype=np.dtype("bool")) def _str_startswith(self, pat, na=None): diff --git a/pandas/tests/strings/test_find_replace.py b/pandas/tests/strings/test_find_replace.py index 00677ef4fcfe9..407b51d4bbe49 100644 --- a/pandas/tests/strings/test_find_replace.py +++ b/pandas/tests/strings/test_find_replace.py @@ -166,7 +166,16 @@ def test_contains_na_kwarg_for_nullable_string_dtype( # https://github.com/pandas-dev/pandas/pull/41025#issuecomment-824062416 values = Series(["a", "b", "c", "a", np.nan], dtype=nullable_string_dtype) - result = values.str.contains("a", na=na, regex=regex) + + msg = ( + "Allowing a non-bool 'na' in obj.str.contains is deprecated and " + "will raise in a future version" + ) + warn = None + if not pd.isna(na) and not isinstance(na, bool): + warn = FutureWarning + with tm.assert_produces_warning(warn, match=msg): + result = values.str.contains("a", na=na, regex=regex) expected = Series([True, False, False, True, expected], dtype="boolean") tm.assert_series_equal(result, expected) @@ -232,7 +241,12 @@ def test_contains_nan(any_string_dtype): expected = Series([True, True, True], dtype=expected_dtype) tm.assert_series_equal(result, expected) - result = s.str.contains("foo", na="foo") + msg = ( + "Allowing a non-bool 'na' in obj.str.contains is deprecated and " + "will raise in a future version" + ) + with tm.assert_produces_warning(FutureWarning, match=msg): + result = s.str.contains("foo", na="foo") if any_string_dtype == "object": expected = Series(["foo", "foo", "foo"], dtype=np.object_) elif any_string_dtype.na_value is np.nan: From 1f201f6c88d397522a203f54624421fa89d4ae86 Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 27 Aug 2024 08:04:50 -0700 Subject: [PATCH 2/4] DEPR: na validation for startswith, endswith --- pandas/core/strings/object_array.py | 16 +++++++++++++ pandas/tests/strings/test_find_replace.py | 28 +++++++++++++++++++++++ 2 files changed, 44 insertions(+) diff --git a/pandas/core/strings/object_array.py b/pandas/core/strings/object_array.py index a754c57558adc..c6b18d7049c57 100644 --- a/pandas/core/strings/object_array.py +++ b/pandas/core/strings/object_array.py @@ -156,10 +156,26 @@ def _str_contains( def _str_startswith(self, pat, na=None): f = lambda x: x.startswith(pat) + if not isna(na) and not isinstance(na, bool): + # GH#59561 + warnings.warn( + "Allowing a non-bool 'na' in obj.str.startswith is deprecated " + "and will raise in a future version.", + FutureWarning, + stacklevel=find_stack_level(), + ) return self._str_map(f, na_value=na, dtype=np.dtype(bool)) def _str_endswith(self, pat, na=None): f = lambda x: x.endswith(pat) + if not isna(na) and not isinstance(na, bool): + # GH#59561 + warnings.warn( + "Allowing a non-bool 'na' in obj.str.endswith is deprecated " + "and will raise in a future version.", + FutureWarning, + stacklevel=find_stack_level(), + ) return self._str_map(f, na_value=na, dtype=np.dtype(bool)) def _str_replace( diff --git a/pandas/tests/strings/test_find_replace.py b/pandas/tests/strings/test_find_replace.py index 407b51d4bbe49..c1dfe4d4a0b0b 100644 --- a/pandas/tests/strings/test_find_replace.py +++ b/pandas/tests/strings/test_find_replace.py @@ -268,6 +268,34 @@ def test_contains_nan(any_string_dtype): # -------------------------------------------------------------------------------------- +def test_startswith_endswith_validate_na(any_string_dtype): + # GH#59615 + ser = Series( + ["om", np.nan, "foo_nom", "nom", "bar_foo", np.nan, "foo"], + dtype=any_string_dtype, + ) + + dtype = ser.dtype + if ( + isinstance(dtype, pd.StringDtype) and dtype.storage == "python" + ) or dtype == np.dtype("object"): + msg = "Allowing a non-bool 'na' in obj.str.startswith is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + ser.str.startswith("kapow", na="baz") + msg = "Allowing a non-bool 'na' in obj.str.endswith is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + ser.str.endswith("bar", na="baz") + else: + # TODO: don't surface pyarrow errors + import pyarrow as pa + + msg = "Could not convert 'baz' with type str: tried to convert to boolean" + with pytest.raises(pa.lib.ArrowInvalid, match=msg): + ser.str.startswith("kapow", na="baz") + with pytest.raises(pa.lib.ArrowInvalid, match=msg): + ser.str.endswith("kapow", na="baz") + + @pytest.mark.parametrize("pat", ["foo", ("foo", "baz")]) @pytest.mark.parametrize("dtype", ["object", "category"]) @pytest.mark.parametrize("null_value", [None, np.nan, pd.NA]) From 1403c87e2f0c45665254f113f24319c9cb39cfdb Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 27 Aug 2024 12:06:56 -0700 Subject: [PATCH 3/4] whatsnew --- doc/source/whatsnew/v2.3.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.3.0.rst b/doc/source/whatsnew/v2.3.0.rst index d1881bf04826f..b8fbd2927bdde 100644 --- a/doc/source/whatsnew/v2.3.0.rst +++ b/doc/source/whatsnew/v2.3.0.rst @@ -53,7 +53,7 @@ notable_bug_fix1 Deprecations ~~~~~~~~~~~~ -- +- Deprecated allowing non-``bool`` values for ``na`` in :meth:`.str.contains`, :meth:`.str.startswith`, and :meth:`.str.endswith` for dtypes that do not already disallow these (:issue:`59615`) - .. --------------------------------------------------------------------------- From 549dcffc79b7627ed2ad982c31064811640074c3 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sat, 31 Aug 2024 18:46:05 +0200 Subject: [PATCH 4/4] Update pandas/tests/strings/test_find_replace.py --- pandas/tests/strings/test_find_replace.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/strings/test_find_replace.py b/pandas/tests/strings/test_find_replace.py index c1dfe4d4a0b0b..bf01c4996bb32 100644 --- a/pandas/tests/strings/test_find_replace.py +++ b/pandas/tests/strings/test_find_replace.py @@ -286,7 +286,7 @@ def test_startswith_endswith_validate_na(any_string_dtype): with tm.assert_produces_warning(FutureWarning, match=msg): ser.str.endswith("bar", na="baz") else: - # TODO: don't surface pyarrow errors + # TODO(infer_string): don't surface pyarrow errors import pyarrow as pa msg = "Could not convert 'baz' with type str: tried to convert to boolean"