Skip to content

Commit 6124b7b

Browse files
[ArrowStringArray] startswith/endswith using native pyarrow method (#41222)
1 parent dafe7f0 commit 6124b7b

File tree

2 files changed

+59
-0
lines changed

2 files changed

+59
-0
lines changed

pandas/core/arrays/string_arrow.py

+21
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
from __future__ import annotations
22

33
from distutils.version import LooseVersion
4+
import re
45
from typing import (
56
TYPE_CHECKING,
67
Any,
@@ -763,6 +764,26 @@ def _str_contains(self, pat, case=True, flags=0, na=np.nan, regex=True):
763764
else:
764765
return super()._str_contains(pat, case, flags, na, regex)
765766

767+
def _str_startswith(self, pat, na=None):
    """
    Test whether each element begins with the literal string ``pat``.

    Parameters
    ----------
    pat : str
        Prefix to look for. It is regex-escaped before use, so it is
        matched literally.
    na : object, default None
        Fill value for missing results. When it is not itself missing,
        missing entries are replaced with ``bool(na)``.

    Returns
    -------
    The result of ``BooleanDtype().__from_arrow__`` on the match mask, or
    whatever the parent implementation returns on the fallback path.
    """
    # Without the regex kernel on ``pc``, defer to the parent implementation.
    if not hasattr(pc, "match_substring_regex"):
        return super()._str_startswith(pat, na)

    # Anchor the escaped pattern at the start so only prefixes match.
    anchored = "^" + re.escape(pat)
    matched = pc.match_substring_regex(self._data, anchored)
    result = BooleanDtype().__from_arrow__(matched)
    if isna(na):
        return result
    result[isna(result)] = bool(na)
    return result
776+
777+
def _str_endswith(self, pat, na=None):
    """
    Test whether each element ends with the literal string ``pat``.

    Parameters
    ----------
    pat : str
        Suffix to look for. It is regex-escaped before use, so it is
        matched literally.
    na : object, default None
        Fill value for missing results. When it is not itself missing,
        missing entries are replaced with ``bool(na)``.

    Returns
    -------
    The result of ``BooleanDtype().__from_arrow__`` on the match mask, or
    whatever the parent implementation returns on the fallback path.
    """
    # Without the regex kernel on ``pc``, defer to the parent implementation.
    if not hasattr(pc, "match_substring_regex"):
        return super()._str_endswith(pat, na)

    # Anchor the escaped pattern at the end so only suffixes match.
    anchored = re.escape(pat) + "$"
    matched = pc.match_substring_regex(self._data, anchored)
    result = BooleanDtype().__from_arrow__(matched)
    if isna(na):
        return result
    result[isna(result)] = bool(na)
    return result
786+
766787
def _str_isalnum(self):
767788
if hasattr(pc, "utf8_is_alnum"):
768789
result = pc.utf8_is_alnum(self._data)

pandas/tests/strings/test_find_replace.py

+38
Original file line numberDiff line numberDiff line change
@@ -200,6 +200,25 @@ def test_startswith(dtype, null_value, na):
200200
tm.assert_series_equal(rs, xp)
201201

202202

203+
@pytest.mark.parametrize("na", [None, True, False])
def test_startswith_nullable_string_dtype(nullable_string_dtype, na):
    """Check ``str.startswith`` on a nullable string Series for each ``na``."""
    ser = Series(
        ["om", None, "foo_nom", "nom", "bar_foo", None, "foo", "regex", "rege."],
        dtype=nullable_string_dtype,
    )
    # Each case pairs a prefix with the expected per-element outcome; the two
    # missing entries are expected to come back as ``na``.
    cases = [
        ("foo", [False, na, True, False, False, na, True, False, False]),
        # The "." must be taken literally: "rege." matches, "regex" does not.
        ("rege.", [False, na, False, False, False, na, False, False, True]),
    ]
    for prefix, outcome in cases:
        result = ser.str.startswith(prefix, na=na)
        expected = Series(outcome, dtype="boolean")
        tm.assert_series_equal(result, expected)
220+
221+
203222
@pytest.mark.parametrize("dtype", [None, "category"])
204223
@pytest.mark.parametrize("null_value", [None, np.nan, pd.NA])
205224
@pytest.mark.parametrize("na", [True, False])
@@ -228,6 +247,25 @@ def test_endswith(dtype, null_value, na):
228247
tm.assert_series_equal(rs, xp)
229248

230249

250+
@pytest.mark.parametrize("na", [None, True, False])
def test_endswith_nullable_string_dtype(nullable_string_dtype, na):
    """Check ``str.endswith`` on a nullable string Series for each ``na``."""
    ser = Series(
        ["om", None, "foo_nom", "nom", "bar_foo", None, "foo", "regex", "rege."],
        dtype=nullable_string_dtype,
    )
    # Each case pairs a suffix with the expected per-element outcome; the two
    # missing entries are expected to come back as ``na``.
    cases = [
        ("foo", [False, na, False, False, True, na, True, False, False]),
        # The "." must be taken literally: "rege." matches, "regex" does not.
        ("rege.", [False, na, False, False, False, na, False, False, True]),
    ]
    for suffix, outcome in cases:
        result = ser.str.endswith(suffix, na=na)
        expected = Series(outcome, dtype="boolean")
        tm.assert_series_equal(result, expected)
267+
268+
231269
def test_replace():
232270
values = Series(["fooBAD__barBAD", np.nan])
233271

0 commit comments

Comments
 (0)