Skip to content

Commit 431dd6f

Browse files
authored
ENH: Use explicit methods instead of regex pattern in arrow strings (#54006)
* ENH: Use explicit methods instead of regex pattern in arrow strings
* Fixup
* Fix
1 parent eceaffb commit 431dd6f

File tree

2 files changed

+22
-33
lines changed

2 files changed

+22
-33
lines changed

pandas/core/arrays/string_arrow.py

+16-13
Original file line number | Diff line number | Diff line change
@@ -307,28 +307,31 @@ def _str_contains(
307307
return super()._str_contains(pat, case, flags, na, regex)
308308

309309
if regex:
310-
if case is False:
311-
fallback_performancewarning()
312-
return super()._str_contains(pat, case, flags, na, regex)
313-
else:
314-
result = pc.match_substring_regex(self._pa_array, pat)
310+
result = pc.match_substring_regex(self._pa_array, pat, ignore_case=not case)
315311
else:
316-
if case:
317-
result = pc.match_substring(self._pa_array, pat)
318-
else:
319-
result = pc.match_substring(pc.utf8_upper(self._pa_array), pat.upper())
312+
result = pc.match_substring(self._pa_array, pat, ignore_case=not case)
320313
result = BooleanDtype().__from_arrow__(result)
321314
if not isna(na):
322315
result[isna(result)] = bool(na)
323316
return result
324317

325318
def _str_startswith(self, pat: str, na=None):
326-
pat = f"^{re.escape(pat)}"
327-
return self._str_contains(pat, na=na, regex=True)
319+
result = pc.starts_with(self._pa_array, pattern=pat)
320+
if not isna(na):
321+
result = result.fill_null(na)
322+
result = BooleanDtype().__from_arrow__(result)
323+
if not isna(na):
324+
result[isna(result)] = bool(na)
325+
return result
328326

329327
def _str_endswith(self, pat: str, na=None):
330-
pat = f"{re.escape(pat)}$"
331-
return self._str_contains(pat, na=na, regex=True)
328+
result = pc.ends_with(self._pa_array, pattern=pat)
329+
if not isna(na):
330+
result = result.fill_null(na)
331+
result = BooleanDtype().__from_arrow__(result)
332+
if not isna(na):
333+
result[isna(result)] = bool(na)
334+
return result
332335

333336
def _str_replace(
334337
self,

pandas/tests/strings/test_find_replace.py

+6-20
Original file line number | Diff line number | Diff line change
@@ -53,10 +53,8 @@ def test_contains(any_string_dtype):
5353
np.array(["Foo", "xYz", "fOOomMm__fOo", "MMM_"], dtype=object),
5454
dtype=any_string_dtype,
5555
)
56-
with tm.maybe_produces_warning(
57-
PerformanceWarning, any_string_dtype == "string[pyarrow]"
58-
):
59-
result = values.str.contains("FOO|mmm", case=False)
56+
57+
result = values.str.contains("FOO|mmm", case=False)
6058
expected = Series(np.array([True, False, True, True]), dtype=expected_dtype)
6159
tm.assert_series_equal(result, expected)
6260

@@ -172,10 +170,7 @@ def test_contains_moar(any_string_dtype):
172170
)
173171
tm.assert_series_equal(result, expected)
174172

175-
with tm.maybe_produces_warning(
176-
PerformanceWarning, any_string_dtype == "string[pyarrow]"
177-
):
178-
result = s.str.contains("a", case=False)
173+
result = s.str.contains("a", case=False)
179174
expected = Series(
180175
[True, False, False, True, True, False, np.nan, True, False, True],
181176
dtype=expected_dtype,
@@ -196,10 +191,7 @@ def test_contains_moar(any_string_dtype):
196191
)
197192
tm.assert_series_equal(result, expected)
198193

199-
with tm.maybe_produces_warning(
200-
PerformanceWarning, any_string_dtype == "string[pyarrow]"
201-
):
202-
result = s.str.contains("ba", case=False)
194+
result = s.str.contains("ba", case=False)
203195
expected = Series(
204196
[False, False, False, True, True, False, np.nan, True, False, False],
205197
dtype=expected_dtype,
@@ -723,10 +715,7 @@ def test_match_na_kwarg(any_string_dtype):
723715

724716
def test_match_case_kwarg(any_string_dtype):
725717
values = Series(["ab", "AB", "abc", "ABC"], dtype=any_string_dtype)
726-
with tm.maybe_produces_warning(
727-
PerformanceWarning, any_string_dtype == "string[pyarrow]"
728-
):
729-
result = values.str.match("ab", case=False)
718+
result = values.str.match("ab", case=False)
730719
expected_dtype = np.bool_ if any_string_dtype == "object" else "boolean"
731720
expected = Series([True, True, True, True], dtype=expected_dtype)
732721
tm.assert_series_equal(result, expected)
@@ -769,10 +758,7 @@ def test_fullmatch_case_kwarg(any_string_dtype):
769758

770759
expected = Series([True, True, False, False], dtype=expected_dtype)
771760

772-
with tm.maybe_produces_warning(
773-
PerformanceWarning, any_string_dtype == "string[pyarrow]"
774-
):
775-
result = ser.str.fullmatch("ab", case=False)
761+
result = ser.str.fullmatch("ab", case=False)
776762
tm.assert_series_equal(result, expected)
777763

778764
with tm.maybe_produces_warning(

0 commit comments

Comments
 (0)