Skip to content

[ArrowStringArray] PERF: use pa.compute.match_substring_regex for str.fullmatch if available #41332

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 11 commits into from
May 17, 2021
1 change: 1 addition & 0 deletions doc/redirects.csv
Original file line number Diff line number Diff line change
Expand Up @@ -1197,6 +1197,7 @@ generated/pandas.Series.str.extractall,../reference/api/pandas.Series.str.extrac
generated/pandas.Series.str.extract,../reference/api/pandas.Series.str.extract
generated/pandas.Series.str.findall,../reference/api/pandas.Series.str.findall
generated/pandas.Series.str.find,../reference/api/pandas.Series.str.find
generated/pandas.Series.str.fullmatch,../reference/api/pandas.Series.str.fullmatch
generated/pandas.Series.str.get_dummies,../reference/api/pandas.Series.str.get_dummies
generated/pandas.Series.str.get,../reference/api/pandas.Series.str.get
generated/pandas.Series.str,../reference/api/pandas.Series.str
Expand Down
1 change: 1 addition & 0 deletions doc/source/reference/series.rst
Original file line number Diff line number Diff line change
Expand Up @@ -415,6 +415,7 @@ strings and apply several methods to it. These can be accessed like
Series.str.extractall
Series.str.find
Series.str.findall
Series.str.fullmatch
Series.str.get
Series.str.index
Series.str.join
Expand Down
31 changes: 19 additions & 12 deletions pandas/core/arrays/string_arrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -819,33 +819,40 @@ def _str_contains(self, pat, case=True, flags=0, na=np.nan, regex: bool = True):
result[isna(result)] = bool(na)
return result

def _str_startswith(self, pat, na=None):
def _str_startswith(self, pat: str, na=None):
if pa_version_under4p0:
return super()._str_startswith(pat, na)

result = pc.match_substring_regex(self._data, "^" + re.escape(pat))
result = BooleanDtype().__from_arrow__(result)
if not isna(na):
result[isna(result)] = bool(na)
return result
pat = "^" + re.escape(pat)
return self._str_contains(pat, na=na, regex=True)

def _str_endswith(self, pat, na=None):
def _str_endswith(self, pat: str, na=None):
if pa_version_under4p0:
return super()._str_endswith(pat, na)

result = pc.match_substring_regex(self._data, re.escape(pat) + "$")
result = BooleanDtype().__from_arrow__(result)
if not isna(na):
result[isna(result)] = bool(na)
return result
pat = re.escape(pat) + "$"
return self._str_contains(pat, na=na, regex=True)

def _str_match(
self, pat: str, case: bool = True, flags: int = 0, na: Scalar = None
):
if pa_version_under4p0:
return super()._str_match(pat, case, flags, na)
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This may be quicker when using the fallback; will need to check.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have broken this change off into #41487. It is slightly quicker than master and brings the performance of the object fallback back to what it was before the pyarrow native implementation was added.


if not pat.startswith("^"):
pat = "^" + pat
return self._str_contains(pat, case, flags, na, regex=True)

def _str_fullmatch(
self, pat: str, case: bool = True, flags: int = 0, na: Scalar = None
):
if pa_version_under4p0:
return super()._str_fullmatch(pat, case, flags, na)

if not pat.endswith("$") or pat.endswith("//$"):
pat = pat + "$"
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We probably don't need the check, but it may be faster; will need to check.

return self._str_match(pat, case, flags, na)

def _str_isalnum(self):
result = pc.utf8_is_alnum(self._data)
return BooleanDtype().__from_arrow__(result)
Expand Down
10 changes: 1 addition & 9 deletions pandas/core/strings/base.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,4 @@
import abc
from typing import (
Pattern,
Union,
)

import numpy as np

Expand Down Expand Up @@ -67,11 +63,7 @@ def _str_match(

@abc.abstractmethod
def _str_fullmatch(
self,
pat: Union[str, Pattern],
case: bool = True,
flags: int = 0,
na: Scalar = np.nan,
self, pat: str, case: bool = True, flags: int = 0, na: Scalar = np.nan
):
pass

Expand Down
8 changes: 1 addition & 7 deletions pandas/core/strings/object_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,7 @@
import textwrap
from typing import (
Optional,
Pattern,
Set,
Union,
)
import unicodedata

Expand Down Expand Up @@ -197,11 +195,7 @@ def _str_match(
return self._str_map(f, na_value=na, dtype=np.dtype(bool))

def _str_fullmatch(
self,
pat: Union[str, Pattern],
case: bool = True,
flags: int = 0,
na: Scalar = None,
self, pat: str, case: bool = True, flags: int = 0, na: Scalar = None
):
if not case:
flags |= re.IGNORECASE
Expand Down
25 changes: 9 additions & 16 deletions pandas/tests/strings/test_find_replace.py
Original file line number Diff line number Diff line change
Expand Up @@ -486,27 +486,20 @@ def test_match_case_kwarg(any_string_dtype):
tm.assert_series_equal(result, expected)


def test_fullmatch():
def test_fullmatch(any_string_dtype):
# GH 32806
ser = Series(["fooBAD__barBAD", "BAD_BADleroybrown", np.nan, "foo"])
ser = Series(
["fooBAD__barBAD", "BAD_BADleroybrown", np.nan, "foo"], dtype=any_string_dtype
)
result = ser.str.fullmatch(".*BAD[_]+.*BAD")
expected = Series([True, False, np.nan, False])
expected_dtype = "object" if any_string_dtype == "object" else "boolean"
expected = Series([True, False, np.nan, False], dtype=expected_dtype)
tm.assert_series_equal(result, expected)

ser = Series(["ab", "AB", "abc", "ABC"])
ser = Series(["ab", "AB", "abc", "ABC"], dtype=any_string_dtype)
result = ser.str.fullmatch("ab", case=False)
expected = Series([True, True, False, False])
tm.assert_series_equal(result, expected)


def test_fullmatch_nullable_string_dtype(nullable_string_dtype):
ser = Series(
["fooBAD__barBAD", "BAD_BADleroybrown", None, "foo"],
dtype=nullable_string_dtype,
)
result = ser.str.fullmatch(".*BAD[_]+.*BAD")
# Result is nullable boolean
expected = Series([True, False, np.nan, False], dtype="boolean")
expected_dtype = np.bool_ if any_string_dtype == "object" else "boolean"
expected = Series([True, True, False, False], dtype=expected_dtype)
tm.assert_series_equal(result, expected)


Expand Down