Skip to content

Commit 671cf86

Browse files
[ArrowStringArray] PERF: use pa.compute.match_substring_regex for str.fullmatch if available (#41332)
1 parent dc48fb6 commit 671cf86

File tree

6 files changed

+39
-4
lines changed

6 files changed

+39
-4
lines changed

asv_bench/benchmarks/strings.py

+3
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,9 @@ def time_find(self, dtype):
8585
def time_rfind(self, dtype):
    """Benchmark ``str.rfind`` with the argument ``"[A-Z]+"`` on ``self.s``.

    ``dtype`` is accepted but not used in the body.
    """
    str_accessor = self.s.str
    str_accessor.rfind("[A-Z]+")
8787

88+
def time_fullmatch(self, dtype):
    """Benchmark ``str.fullmatch`` with the pattern ``"A"`` on ``self.s``.

    ``dtype`` is accepted but not used in the body.
    """
    str_accessor = self.s.str
    str_accessor.fullmatch("A")
90+
8891
def time_get(self, dtype):
    """Benchmark ``str.get(0)`` on ``self.s``.

    ``dtype`` is accepted but not used in the body.
    """
    str_accessor = self.s.str
    str_accessor.get(0)
9093

doc/redirects.csv

+1
Original file line numberDiff line numberDiff line change
@@ -1197,6 +1197,7 @@ generated/pandas.Series.str.extractall,../reference/api/pandas.Series.str.extrac
11971197
generated/pandas.Series.str.extract,../reference/api/pandas.Series.str.extract
11981198
generated/pandas.Series.str.findall,../reference/api/pandas.Series.str.findall
11991199
generated/pandas.Series.str.find,../reference/api/pandas.Series.str.find
1200+
generated/pandas.Series.str.fullmatch,../reference/api/pandas.Series.str.fullmatch
12001201
generated/pandas.Series.str.get_dummies,../reference/api/pandas.Series.str.get_dummies
12011202
generated/pandas.Series.str.get,../reference/api/pandas.Series.str.get
12021203
generated/pandas.Series.str,../reference/api/pandas.Series.str

doc/source/reference/series.rst

+1
Original file line numberDiff line numberDiff line change
@@ -415,6 +415,7 @@ strings and apply several methods to it. These can be accessed like
415415
Series.str.extractall
416416
Series.str.find
417417
Series.str.findall
418+
Series.str.fullmatch
418419
Series.str.get
419420
Series.str.index
420421
Series.str.join

pandas/core/arrays/string_arrow.py

+8
Original file line numberDiff line numberDiff line change
@@ -844,6 +844,14 @@ def _str_match(
844844
pat = "^" + pat
845845
return self._str_contains(pat, case, flags, na, regex=True)
846846

847+
def _str_fullmatch(self, pat, case: bool = True, flags: int = 0, na: Scalar = None):
    """
    Test whether each string matches ``pat`` in its entirety.

    With pyarrow older than 4.0 the parent-class implementation is used.
    Otherwise ``pat`` is given an end anchor (``$``) when it needs one and
    the work is delegated to ``_str_match``.

    Parameters
    ----------
    pat : str
        Regular expression.
    case : bool, default True
        Forwarded unchanged to the delegate.
    flags : int, default 0
        Forwarded unchanged to the delegate.
    na : scalar, optional
        Forwarded unchanged to the delegate.

    Returns
    -------
    The result of ``_str_match`` (or of the parent ``_str_fullmatch``).
    """
    if pa_version_under4p0:
        return super()._str_fullmatch(pat, case, flags, na)

    # A trailing "$" anchors the pattern only when it is not escaped.  If the
    # pattern ends with a backslash followed by "$", that "$" is a literal
    # dollar sign and an anchor is still required.  When that backslash is
    # itself escaped the pattern is already anchored; the extra "$" appended
    # here is then redundant but harmless.
    if not pat.endswith("$") or pat.endswith("\\$"):
        pat = pat + "$"
    # NOTE(review): for a top-level alternation such as "a|b" the appended
    # "$" binds to the last alternative only -- confirm whether the pattern
    # should be wrapped in a group before anchoring.
    return self._str_match(pat, case, flags, na)
854+
847855
def _str_isalnum(self):
    """Run ``pc.utf8_is_alnum`` on ``self._data`` and wrap it via ``BooleanDtype``."""
    is_alnum = pc.utf8_is_alnum(self._data)
    boolean_dtype = BooleanDtype()
    return boolean_dtype.__from_arrow__(is_alnum)

pandas/core/strings/accessor.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -1170,7 +1170,7 @@ def match(self, pat, case=True, flags=0, na=None):
11701170
11711171
Returns
11721172
-------
1173-
Series/array of boolean values
1173+
Series/Index/array of boolean values
11741174
11751175
See Also
11761176
--------
@@ -1197,14 +1197,14 @@ def fullmatch(self, pat, case=True, flags=0, na=None):
11971197
If True, case sensitive.
11981198
flags : int, default 0 (no flags)
11991199
Regex module flags, e.g. re.IGNORECASE.
1200-
na : scalar, optional.
1200+
na : scalar, optional
12011201
Fill value for missing values. The default depends on dtype of the
12021202
array. For object-dtype, ``numpy.nan`` is used. For ``StringDtype``,
12031203
``pandas.NA`` is used.
12041204
12051205
Returns
12061206
-------
1207-
Series/array of boolean values
1207+
Series/Index/array of boolean values
12081208
12091209
See Also
12101210
--------

pandas/tests/strings/test_find_replace.py

+23-1
Original file line numberDiff line numberDiff line change
@@ -494,10 +494,32 @@ def test_fullmatch(any_string_dtype):
494494
expected = Series([True, False, np.nan, False], dtype=expected_dtype)
495495
tm.assert_series_equal(result, expected)
496496

497+
498+
def test_fullmatch_na_kwarg(any_string_dtype):
    """``str.fullmatch(..., na=False)`` yields ``False`` for the missing entry."""
    values = ["fooBAD__barBAD", "BAD_BADleroybrown", np.nan, "foo"]
    ser = Series(values, dtype=any_string_dtype)

    # Object dtype produces a NumPy bool result; the other string dtypes
    # produce the nullable "boolean" dtype.
    if any_string_dtype == "object":
        expected_dtype = np.bool_
    else:
        expected_dtype = "boolean"
    expected = Series([True, False, False, False], dtype=expected_dtype)

    result = ser.str.fullmatch(".*BAD[_]+.*BAD", na=False)
    tm.assert_series_equal(result, expected)
506+
507+
508+
def test_fullmatch_case_kwarg(any_string_dtype):
    """Check ``str.fullmatch`` with ``case=True``, ``case=False`` and ``flags``."""
    ser = Series(["ab", "AB", "abc", "ABC"], dtype=any_string_dtype)
    # Object dtype produces a NumPy bool result; the other string dtypes
    # produce the nullable "boolean" dtype.
    expected_dtype = np.bool_ if any_string_dtype == "object" else "boolean"

    # Case-sensitive: only the exact string "ab" matches.
    expected = Series([True, False, False, False], dtype=expected_dtype)

    result = ser.str.fullmatch("ab", case=True)
    tm.assert_series_equal(result, expected)

    # Case-insensitive, requested through ``case`` and then through ``flags``;
    # both must give the same answer.
    expected = Series([True, True, False, False], dtype=expected_dtype)

    result = ser.str.fullmatch("ab", case=False)
    tm.assert_series_equal(result, expected)

    result = ser.str.fullmatch("ab", flags=re.IGNORECASE)
    tm.assert_series_equal(result, expected)
502524

503525

0 commit comments

Comments
 (0)