From 70799c07a57d1817285433e09a6ec5d2120e19a4 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Wed, 5 May 2021 16:28:30 +0100 Subject: [PATCH 1/8] wip --- doc/redirects.csv | 1 + doc/source/reference/series.rst | 1 + pandas/core/arrays/string_arrow.py | 43 ++++++++++++++--------- pandas/core/strings/base.py | 10 +----- pandas/core/strings/object_array.py | 8 +---- pandas/tests/strings/test_find_replace.py | 25 +++++-------- 6 files changed, 40 insertions(+), 48 deletions(-) diff --git a/doc/redirects.csv b/doc/redirects.csv index de69d0168835d..9b8a5a73dedff 100644 --- a/doc/redirects.csv +++ b/doc/redirects.csv @@ -1197,6 +1197,7 @@ generated/pandas.Series.str.extractall,../reference/api/pandas.Series.str.extrac generated/pandas.Series.str.extract,../reference/api/pandas.Series.str.extract generated/pandas.Series.str.findall,../reference/api/pandas.Series.str.findall generated/pandas.Series.str.find,../reference/api/pandas.Series.str.find +generated/pandas.Series.str.fullmatch,../reference/api/pandas.Series.str.fullmatch generated/pandas.Series.str.get_dummies,../reference/api/pandas.Series.str.get_dummies generated/pandas.Series.str.get,../reference/api/pandas.Series.str.get generated/pandas.Series.str,../reference/api/pandas.Series.str diff --git a/doc/source/reference/series.rst b/doc/source/reference/series.rst index cc2937695e80f..3ff3b2bb53fda 100644 --- a/doc/source/reference/series.rst +++ b/doc/source/reference/series.rst @@ -415,6 +415,7 @@ strings and apply several methods to it. These can be accessed like Series.str.extractall Series.str.find Series.str.findall + Series.str.fullmatch Series.str.get Series.str.index Series.str.join diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index de987b8d34f08..975573266cf75 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -817,33 +817,44 @@ def _str_contains(self, pat, case=True, flags=0, na=np.nan, regex: bool = True): result[isna(result)] = bool(na) return result - def _str_startswith(self, pat, na=None): + def _str_startswith(self, pat: str, na=None): # match_substring_regex added in pyarrow 4.0.0 - if hasattr(pc, "match_substring_regex"): - result = pc.match_substring_regex(self._data, "^" + re.escape(pat)) - result = BooleanDtype().__from_arrow__(result) - if not isna(na): - result[isna(result)] = bool(na) - return result - else: + if not hasattr(pc, "match_substring_regex"): return super()._str_startswith(pat, na) - def _str_endswith(self, pat, na=None): + pat = "^" + re.escape(pat) + return self._str_contains(pat, na=na, regex=True) + + def _str_endswith(self, pat: str, na=None): # match_substring_regex added in pyarrow 4.0.0 - if hasattr(pc, "match_substring_regex"): - result = pc.match_substring_regex(self._data, re.escape(pat) + "$") - result = BooleanDtype().__from_arrow__(result) - if not isna(na): - result[isna(result)] = bool(na) - return result - else: + if not hasattr(pc, "match_substring_regex"): return super()._str_endswith(pat, na) + pat = re.escape(pat) + "$" + return self._str_contains(pat, na=na, regex=True) + def _str_match( self, pat: str, case: bool = True, flags: int = 0, na: Scalar = None ): + # match_substring_regex added in pyarrow 4.0.0 + if not hasattr(pc, "match_substring_regex"): + return super()._str_match(pat, case, flags, na) + + if not pat.startswith("^"): + pat = "^" + pat + return self._str_contains(pat, case, flags, na, regex=True) + + def _str_fullmatch( + self, pat: str, case: bool = True, flags: int = 0, na: Scalar = None + ): + # match_substring_regex added in pyarrow 4.0.0 + if not hasattr(pc, "match_substring_regex"): + return super()._str_fullmatch(pat, case, flags, na) + if not pat.startswith("^"): pat = "^" + pat + if not pat.endswith("$") or pat.endswith("//$"): + pat = pat + "$" return self._str_contains(pat, case, flags, na, regex=True) def _str_isalnum(self): diff --git a/pandas/core/strings/base.py b/pandas/core/strings/base.py index a77f8861a7c02..8e1199c75c9d2 100644 --- a/pandas/core/strings/base.py +++ b/pandas/core/strings/base.py @@ -1,8 +1,4 @@ import abc -from typing import ( - Pattern, - Union, -) import numpy as np @@ -67,11 +63,7 @@ def _str_match( @abc.abstractmethod def _str_fullmatch( - self, - pat: Union[str, Pattern], - case: bool = True, - flags: int = 0, - na: Scalar = np.nan, + self, pat: str, case: bool = True, flags: int = 0, na: Scalar = np.nan ): pass diff --git a/pandas/core/strings/object_array.py b/pandas/core/strings/object_array.py index 869eabc76b555..af3f769e19635 100644 --- a/pandas/core/strings/object_array.py +++ b/pandas/core/strings/object_array.py @@ -2,9 +2,7 @@ import textwrap from typing import ( Optional, - Pattern, Set, - Union, ) import unicodedata @@ -197,11 +195,7 @@ def _str_match( return self._str_map(f, na_value=na, dtype=np.dtype(bool)) def _str_fullmatch( - self, - pat: Union[str, Pattern], - case: bool = True, - flags: int = 0, - na: Scalar = None, + self, pat: str, case: bool = True, flags: int = 0, na: Scalar = None ): if not case: flags |= re.IGNORECASE diff --git a/pandas/tests/strings/test_find_replace.py b/pandas/tests/strings/test_find_replace.py index 06a7c6d56a61d..0206408c95017 100644 --- a/pandas/tests/strings/test_find_replace.py +++ b/pandas/tests/strings/test_find_replace.py @@ -486,27 +486,20 @@ def test_match_case_kwarg(any_string_dtype): tm.assert_series_equal(result, expected) -def test_fullmatch(): +def test_fullmatch(any_string_dtype): # GH 32806 - ser = Series(["fooBAD__barBAD", "BAD_BADleroybrown", np.nan, "foo"]) + ser = Series( + ["fooBAD__barBAD", "BAD_BADleroybrown", np.nan, "foo"], dtype=any_string_dtype + ) result = ser.str.fullmatch(".*BAD[_]+.*BAD") - expected = Series([True, False, np.nan, False]) + expected_dtype = "object" if any_string_dtype == "object" else "boolean" + expected = Series([True, False, np.nan, False], dtype=expected_dtype) tm.assert_series_equal(result, expected) - ser = Series(["ab", "AB", "abc", "ABC"]) + ser = Series(["ab", "AB", "abc", "ABC"], dtype=any_string_dtype) result = ser.str.fullmatch("ab", case=False) - expected = Series([True, True, False, False]) - tm.assert_series_equal(result, expected) - - -def test_fullmatch_nullable_string_dtype(nullable_string_dtype): - ser = Series( - ["fooBAD__barBAD", "BAD_BADleroybrown", None, "foo"], - dtype=nullable_string_dtype, - ) - result = ser.str.fullmatch(".*BAD[_]+.*BAD") - # Result is nullable boolean - expected = Series([True, False, np.nan, False], dtype="boolean") + expected_dtype = np.bool_ if any_string_dtype == "object" else "boolean" + expected = Series([True, True, False, False], dtype=expected_dtype) tm.assert_series_equal(result, expected) From 70a1845fa53e36fc47be73deb146a62f49289b86 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Wed, 5 May 2021 16:35:03 +0100 Subject: [PATCH 2/8] update to use pa_version_under --- pandas/core/arrays/string_arrow.py | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 5f4d55a24ab18..807c6cab6e6e9 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -820,16 +820,14 @@ def _str_contains(self, pat, case=True, flags=0, na=np.nan, regex: bool = True): return result def _str_startswith(self, pat: str, na=None): - # match_substring_regex added in pyarrow 4.0.0 - if not hasattr(pc, "match_substring_regex"): + if pa_version_under4p0: return super()._str_startswith(pat, na) pat = "^" + re.escape(pat) return self._str_contains(pat, na=na, regex=True) def _str_endswith(self, pat: str, na=None): - # match_substring_regex added in pyarrow 4.0.0 - if not hasattr(pc, "match_substring_regex"): + if pa_version_under4p0: return super()._str_endswith(pat, na) pat = re.escape(pat) + "$" @@ -838,8 +836,7 @@ def _str_endswith(self, pat: str, na=None): def _str_match( self, pat: str, case: bool = True, flags: int = 0, na: Scalar = None ): - # match_substring_regex added in pyarrow 4.0.0 - if not hasattr(pc, "match_substring_regex"): + if pa_version_under4p0: return super()._str_match(pat, case, flags, na) if not pat.startswith("^"): @@ -849,15 +846,12 @@ def _str_match( def _str_fullmatch( self, pat: str, case: bool = True, flags: int = 0, na: Scalar = None ): - # match_substring_regex added in pyarrow 4.0.0 - if not hasattr(pc, "match_substring_regex"): + if pa_version_under4p0: return super()._str_fullmatch(pat, case, flags, na) - if not pat.startswith("^"): - pat = "^" + pat if not pat.endswith("$") or pat.endswith("//$"): pat = pat + "$" - return self._str_contains(pat, case, flags, na, regex=True) + return self._str_match(pat, case, flags, na) def _str_isalnum(self): result = pc.utf8_is_alnum(self._data) From 7504466431602e50cc233b479c88cdb50630e91e Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Sat, 15 May 2021 12:24:54 +0100 Subject: [PATCH 3/8] undo changes to annotations --- pandas/core/arrays/string_arrow.py | 2 +- pandas/core/strings/base.py | 10 +++++++++- pandas/core/strings/object_array.py | 8 +++++++- 3 files changed, 17 insertions(+), 3 deletions(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index ec3ac882d229b..fda1f3571d2d0 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -845,7 +845,7 @@ def _str_match( return self._str_contains(pat, case, flags, na, regex=True) def _str_fullmatch( - self, pat: str, case: bool = True, flags: int = 0, na: Scalar = None + self, pat, case: bool = True, flags: int = 0, na: Scalar = None ): if pa_version_under4p0: return super()._str_fullmatch(pat, case, flags, na) diff --git a/pandas/core/strings/base.py b/pandas/core/strings/base.py index 8e1199c75c9d2..a77f8861a7c02 100644 --- a/pandas/core/strings/base.py +++ b/pandas/core/strings/base.py @@ -1,4 +1,8 @@ import abc +from typing import ( + Pattern, + Union, +) import numpy as np @@ -63,7 +67,11 @@ def _str_match( @abc.abstractmethod def _str_fullmatch( - self, pat: str, case: bool = True, flags: int = 0, na: Scalar = np.nan + self, + pat: Union[str, Pattern], + case: bool = True, + flags: int = 0, + na: Scalar = np.nan, ): pass diff --git a/pandas/core/strings/object_array.py b/pandas/core/strings/object_array.py index af3f769e19635..869eabc76b555 100644 --- a/pandas/core/strings/object_array.py +++ b/pandas/core/strings/object_array.py @@ -2,7 +2,9 @@ import textwrap from typing import ( Optional, + Pattern, Set, + Union, ) import unicodedata @@ -195,7 +197,11 @@ def _str_match( return self._str_map(f, na_value=na, dtype=np.dtype(bool)) def _str_fullmatch( - self, pat: str, case: bool = True, flags: int = 0, na: Scalar = None + self, + pat: Union[str, Pattern], + case: bool = True, + flags: int = 0, + na: Scalar = None, ): if not case: flags |= re.IGNORECASE From 1bdd53c09867360fdfb44e30c6e9c8c5fbcc4820 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Sat, 15 May 2021 12:26:00 +0100 Subject: [PATCH 4/8] black --- pandas/core/arrays/string_arrow.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index fda1f3571d2d0..d5ee28eb7017e 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -844,9 +844,7 @@ def _str_match( pat = "^" + pat return self._str_contains(pat, case, flags, na, regex=True) - def _str_fullmatch( - self, pat, case: bool = True, flags: int = 0, na: Scalar = None - ): + def _str_fullmatch(self, pat, case: bool = True, flags: int = 0, na: Scalar = None): if pa_version_under4p0: return super()._str_fullmatch(pat, case, flags, na) From b4a426d0ad8d09b57db06b4caf156d3d7a0ece8b Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Sat, 15 May 2021 13:41:25 +0100 Subject: [PATCH 5/8] fix docstring validation --- pandas/core/strings/accessor.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index 15fc2d9e6d3c5..4c857ab73182a 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -1170,7 +1170,7 @@ def match(self, pat, case=True, flags=0, na=None): Returns ------- - Series/array of boolean values + Series/Index/array of boolean values See Also -------- @@ -1197,14 +1197,14 @@ def fullmatch(self, pat, case=True, flags=0, na=None): If True, case sensitive. flags : int, default 0 (no flags) Regex module flags, e.g. re.IGNORECASE. - na : scalar, optional. + na : scalar, optional Fill value for missing values. The default depends on dtype of the array. For object-dtype, ``numpy.nan`` is used. For ``StringDtype``, ``pandas.NA`` is used. Returns ------- - Series/array of boolean values + Series/Index/array of boolean values See Also -------- From 4894fcf8c99226811fb90265693ea2f9d83d82b6 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Sat, 15 May 2021 13:54:23 +0100 Subject: [PATCH 6/8] more tests --- pandas/tests/strings/test_find_replace.py | 24 ++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/pandas/tests/strings/test_find_replace.py b/pandas/tests/strings/test_find_replace.py index 843b0ba55e691..e80d9d7a89586 100644 --- a/pandas/tests/strings/test_find_replace.py +++ b/pandas/tests/strings/test_find_replace.py @@ -494,10 +494,32 @@ def test_fullmatch(any_string_dtype): expected = Series([True, False, np.nan, False], dtype=expected_dtype) tm.assert_series_equal(result, expected) + +def test_fullmatch_na_kwargs(any_string_dtype): + ser = Series( + ["fooBAD__barBAD", "BAD_BADleroybrown", np.nan, "foo"], dtype=any_string_dtype + ) + result = ser.str.fullmatch(".*BAD[_]+.*BAD", na=False) + expected_dtype = np.bool_ if any_string_dtype == "object" else "boolean" + expected = Series([True, False, False, False], dtype=expected_dtype) + tm.assert_series_equal(result, expected) + + +def test_fullmatch_case_kwarg(any_string_dtype): ser = Series(["ab", "AB", "abc", "ABC"], dtype=any_string_dtype) - result = ser.str.fullmatch("ab", case=False) expected_dtype = np.bool_ if any_string_dtype == "object" else "boolean" + + expected = Series([True, False, False, False], dtype=expected_dtype) + + result = ser.str.fullmatch("ab", case=True) + tm.assert_series_equal(result, expected) + expected = Series([True, True, False, False], dtype=expected_dtype) + + result = ser.str.fullmatch("ab", case=False) + tm.assert_series_equal(result, expected) + + result = ser.str.fullmatch("ab", flags=re.IGNORECASE) tm.assert_series_equal(result, expected) From 5176d356fac0c9babf82e9856e9b16e85b8f395b Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Sat, 15 May 2021 13:57:48 +0100 Subject: [PATCH 7/8] asv --- asv_bench/benchmarks/strings.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/asv_bench/benchmarks/strings.py b/asv_bench/benchmarks/strings.py index 0f68d1043b49d..2560d6726249e 100644 --- a/asv_bench/benchmarks/strings.py +++ b/asv_bench/benchmarks/strings.py @@ -83,6 +83,9 @@ def time_find(self, dtype): def time_rfind(self, dtype): self.s.str.rfind("[A-Z]+") + def time_fullmatch(self, dtype): + self.s.str.fullmatch("A") + def time_get(self, dtype): self.s.str.get(0) From 0ef9d2758d3b6c514eaa7dacedd99aa648af4cd6 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Mon, 17 May 2021 19:43:44 +0100 Subject: [PATCH 8/8] typo --- pandas/tests/strings/test_find_replace.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/strings/test_find_replace.py b/pandas/tests/strings/test_find_replace.py index e80d9d7a89586..0815d23f2b493 100644 --- a/pandas/tests/strings/test_find_replace.py +++ b/pandas/tests/strings/test_find_replace.py @@ -495,7 +495,7 @@ def test_fullmatch(any_string_dtype): tm.assert_series_equal(result, expected) -def test_fullmatch_na_kwargs(any_string_dtype): +def test_fullmatch_na_kwarg(any_string_dtype): ser = Series( ["fooBAD__barBAD", "BAD_BADleroybrown", np.nan, "foo"], dtype=any_string_dtype )