Skip to content

Commit 94f20fa

Browse files
[ArrowStringArray] use pyarrow.compute.match_substring_regex if available (#41217)
1 parent c556c20 commit 94f20fa

File tree

1 file changed

+27
-7
lines changed

1 file changed

+27
-7
lines changed

pandas/core/arrays/string_arrow.py

+27 -7
Original file line number | Diff line number | Diff line change
@@ -8,6 +8,7 @@
88
Sequence,
99
cast,
1010
)
11+
import warnings
1112

1213
import numpy as np
1314

@@ -766,16 +767,34 @@ def _str_map(self, f, na_value=None, dtype: Dtype | None = None):
766767
return lib.map_infer_mask(arr, f, mask.view("uint8"))
767768

768769
def _str_contains(self, pat, case=True, flags=0, na=np.nan, regex=True):
769-
if not regex and case:
770-
result = pc.match_substring(self._data, pat)
771-
result = BooleanDtype().__from_arrow__(result)
772-
if not isna(na):
773-
result[isna(result)] = bool(na)
774-
return result
775-
else:
770+
if flags:
776771
return super()._str_contains(pat, case, flags, na, regex)
777772

773+
if regex:
774+
# match_substring_regex added in pyarrow 4.0.0
775+
if hasattr(pc, "match_substring_regex") and case:
776+
if re.compile(pat).groups:
777+
warnings.warn(
778+
"This pattern has match groups. To actually get the "
779+
"groups, use str.extract.",
780+
UserWarning,
781+
stacklevel=3,
782+
)
783+
result = pc.match_substring_regex(self._data, pat)
784+
else:
785+
return super()._str_contains(pat, case, flags, na, regex)
786+
else:
787+
if case:
788+
result = pc.match_substring(self._data, pat)
789+
else:
790+
result = pc.match_substring(pc.utf8_upper(self._data), pat.upper())
791+
result = BooleanDtype().__from_arrow__(result)
792+
if not isna(na):
793+
result[isna(result)] = bool(na)
794+
return result
795+
778796
def _str_startswith(self, pat, na=None):
797+
# match_substring_regex added in pyarrow 4.0.0
779798
if hasattr(pc, "match_substring_regex"):
780799
result = pc.match_substring_regex(self._data, "^" + re.escape(pat))
781800
result = BooleanDtype().__from_arrow__(result)
@@ -786,6 +805,7 @@ def _str_startswith(self, pat, na=None):
786805
return super()._str_startswith(pat, na)
787806

788807
def _str_endswith(self, pat, na=None):
808+
# match_substring_regex added in pyarrow 4.0.0
789809
if hasattr(pc, "match_substring_regex"):
790810
result = pc.match_substring_regex(self._data, re.escape(pat) + "$")
791811
result = BooleanDtype().__from_arrow__(result)

0 commit comments

Comments
 (0)