diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index fabb3974728de..72a2ab8a1b80a 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -8,6 +8,7 @@ Sequence, cast, ) +import warnings import numpy as np @@ -766,16 +767,34 @@ def _str_map(self, f, na_value=None, dtype: Dtype | None = None): return lib.map_infer_mask(arr, f, mask.view("uint8")) def _str_contains(self, pat, case=True, flags=0, na=np.nan, regex=True): - if not regex and case: - result = pc.match_substring(self._data, pat) - result = BooleanDtype().__from_arrow__(result) - if not isna(na): - result[isna(result)] = bool(na) - return result - else: + if flags: return super()._str_contains(pat, case, flags, na, regex) + if regex: + # match_substring_regex added in pyarrow 4.0.0 + if hasattr(pc, "match_substring_regex") and case: + if re.compile(pat).groups: + warnings.warn( + "This pattern has match groups. To actually get the " + "groups, use str.extract.", + UserWarning, + stacklevel=3, + ) + result = pc.match_substring_regex(self._data, pat) + else: + return super()._str_contains(pat, case, flags, na, regex) + else: + if case: + result = pc.match_substring(self._data, pat) + else: + result = pc.match_substring(pc.utf8_upper(self._data), pat.upper()) + result = BooleanDtype().__from_arrow__(result) + if not isna(na): + result[isna(result)] = bool(na) + return result + def _str_startswith(self, pat, na=None): + # match_substring_regex added in pyarrow 4.0.0 if hasattr(pc, "match_substring_regex"): result = pc.match_substring_regex(self._data, "^" + re.escape(pat)) result = BooleanDtype().__from_arrow__(result) @@ -786,6 +805,7 @@ def _str_startswith(self, pat, na=None): return super()._str_startswith(pat, na) def _str_endswith(self, pat, na=None): + # match_substring_regex added in pyarrow 4.0.0 if hasattr(pc, "match_substring_regex"): result = pc.match_substring_regex(self._data, re.escape(pat) + "$") result = BooleanDtype().__from_arrow__(result)