Skip to content

[ArrowStringArray] PERF: use pa.compute.match_substring_regex for str.fullmatch if available #41332

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 11 commits into from
May 17, 2021
3 changes: 3 additions & 0 deletions asv_bench/benchmarks/strings.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,9 @@ def time_find(self, dtype):
def time_rfind(self, dtype):
    """Benchmark ``str.rfind`` on ``self.s`` with the argument ``"[A-Z]+"``.

    ``dtype`` is accepted but not used in the body; ``self.s`` is presumably
    built by the benchmark's setup (not visible here) -- TODO confirm.
    """
    self.s.str.rfind("[A-Z]+")

def time_fullmatch(self, dtype):
    """Benchmark ``str.fullmatch`` on ``self.s`` with the pattern ``"A"``.

    ``dtype`` is accepted but not used in the body; ``self.s`` is presumably
    built by the benchmark's setup (not visible here) -- TODO confirm.
    """
    self.s.str.fullmatch("A")

def time_get(self, dtype):
    """Benchmark ``str.get(0)`` on ``self.s``.

    ``dtype`` is accepted but not used in the body; ``self.s`` is presumably
    built by the benchmark's setup (not visible here) -- TODO confirm.
    """
    self.s.str.get(0)

Expand Down
1 change: 1 addition & 0 deletions doc/redirects.csv
Original file line number Diff line number Diff line change
Expand Up @@ -1197,6 +1197,7 @@ generated/pandas.Series.str.extractall,../reference/api/pandas.Series.str.extrac
generated/pandas.Series.str.extract,../reference/api/pandas.Series.str.extract
generated/pandas.Series.str.findall,../reference/api/pandas.Series.str.findall
generated/pandas.Series.str.find,../reference/api/pandas.Series.str.find
generated/pandas.Series.str.fullmatch,../reference/api/pandas.Series.str.fullmatch
generated/pandas.Series.str.get_dummies,../reference/api/pandas.Series.str.get_dummies
generated/pandas.Series.str.get,../reference/api/pandas.Series.str.get
generated/pandas.Series.str,../reference/api/pandas.Series.str
Expand Down
1 change: 1 addition & 0 deletions doc/source/reference/series.rst
Original file line number Diff line number Diff line change
Expand Up @@ -415,6 +415,7 @@ strings and apply several methods to it. These can be accessed like
Series.str.extractall
Series.str.find
Series.str.findall
Series.str.fullmatch
Series.str.get
Series.str.index
Series.str.join
Expand Down
8 changes: 8 additions & 0 deletions pandas/core/arrays/string_arrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -844,6 +844,14 @@ def _str_match(
pat = "^" + pat
return self._str_contains(pat, case, flags, na, regex=True)

def _str_fullmatch(self, pat, case: bool = True, flags: int = 0, na: Scalar = None):
    """
    Test whether each string is matched in its entirety by the regex ``pat``.

    Parameters
    ----------
    pat : str
        Regular expression. An end anchor (``$``) is appended when the
        pattern is not already end-anchored.
    case : bool, default True
        Forwarded unchanged to ``_str_match``.
    flags : int, default 0
        Forwarded unchanged to ``_str_match``.
    na : scalar, optional
        Forwarded unchanged to ``_str_match``.

    Returns
    -------
    The result of ``self._str_match`` on the end-anchored pattern, or of the
    parent class's ``_str_fullmatch`` when ``pa_version_under4p0`` is true.
    """
    if pa_version_under4p0:
        # Presumably pyarrow < 4.0 lacks the regex kernel this path relies on
        # (verify against where the flag is defined); use the parent class.
        return super()._str_fullmatch(pat, case, flags, na)

    # A trailing "$" is only an anchor when it is not escaped: a pattern that
    # ends in "\$" matches a literal dollar sign and still needs anchoring.
    # The previous check compared against "//$" (forward slashes), which never
    # identifies an escaped dollar, so such patterns were left un-anchored.
    #
    # Review note carried over from the PR discussion: the endswith check is
    # probably not required (always appending "$" should also work) but may be
    # faster; this has not been measured yet.
    if not pat.endswith("$") or pat.endswith("\\$"):
        pat = pat + "$"
    return self._str_match(pat, case, flags, na)

def _str_isalnum(self):
    """Apply ``pc.utf8_is_alnum`` to ``self._data`` and wrap the result.

    The kernel output is converted with ``BooleanDtype().__from_arrow__``
    and returned.
    """
    result = pc.utf8_is_alnum(self._data)
    return BooleanDtype().__from_arrow__(result)
Expand Down
6 changes: 3 additions & 3 deletions pandas/core/strings/accessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -1170,7 +1170,7 @@ def match(self, pat, case=True, flags=0, na=None):

Returns
-------
Series/array of boolean values
Series/Index/array of boolean values

See Also
--------
Expand All @@ -1197,14 +1197,14 @@ def fullmatch(self, pat, case=True, flags=0, na=None):
If True, case sensitive.
flags : int, default 0 (no flags)
Regex module flags, e.g. re.IGNORECASE.
na : scalar, optional.
na : scalar, optional
Fill value for missing values. The default depends on dtype of the
array. For object-dtype, ``numpy.nan`` is used. For ``StringDtype``,
``pandas.NA`` is used.

Returns
-------
Series/array of boolean values
Series/Index/array of boolean values

See Also
--------
Expand Down
24 changes: 23 additions & 1 deletion pandas/tests/strings/test_find_replace.py
Original file line number Diff line number Diff line change
Expand Up @@ -494,10 +494,32 @@ def test_fullmatch(any_string_dtype):
expected = Series([True, False, np.nan, False], dtype=expected_dtype)
tm.assert_series_equal(result, expected)


def test_fullmatch_na_kwarg(any_string_dtype):
    """Check that ``str.fullmatch(..., na=False)`` yields False for the NaN entry."""
    ser = Series(
        ["fooBAD__barBAD", "BAD_BADleroybrown", np.nan, "foo"], dtype=any_string_dtype
    )
    result = ser.str.fullmatch(".*BAD[_]+.*BAD", na=False)
    # With na=False there are no missing values left in the result, so the
    # object-dtype case is expected to come back as plain numpy bool.
    expected_dtype = np.bool_ if any_string_dtype == "object" else "boolean"
    expected = Series([True, False, False, False], dtype=expected_dtype)
    tm.assert_series_equal(result, expected)


def test_fullmatch_case_kwarg(any_string_dtype):
    """Check ``str.fullmatch`` with ``case=True``, ``case=False`` and ``re.IGNORECASE``."""
    ser = Series(["ab", "AB", "abc", "ABC"], dtype=any_string_dtype)
    expected_dtype = np.bool_ if any_string_dtype == "object" else "boolean"

    # Case-sensitive: only the exact string "ab" is a full match.
    expected = Series([True, False, False, False], dtype=expected_dtype)
    result = ser.str.fullmatch("ab", case=True)
    tm.assert_series_equal(result, expected)

    # Case-insensitive, requested either through ``case=False`` or through
    # the regex flag: "AB" is a full match as well.
    expected = Series([True, True, False, False], dtype=expected_dtype)
    result = ser.str.fullmatch("ab", case=False)
    tm.assert_series_equal(result, expected)

    result = ser.str.fullmatch("ab", flags=re.IGNORECASE)
    tm.assert_series_equal(result, expected)


Expand Down