From 5464b035354b7bc1c1ee0bd9c140d9ac5cf1e6f2 Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 19 Aug 2024 14:15:27 -0700 Subject: [PATCH 1/8] REF: de-duplicate arrow string methods --- pandas/core/arrays/string_arrow.py | 46 ++++++------------------------ 1 file changed, 8 insertions(+), 38 deletions(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 1e5adf106752f..deb63d18bf2bc 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -335,7 +335,7 @@ def _str_repeat(self, repeats: int | Sequence[int]): if not isinstance(repeats, int): return super()._str_repeat(repeats) else: - return type(self)(pc.binary_repeat(self._pa_array, repeats)) + return ArrowExtensionArray._str_repeat(self, repeats=repeats) def _str_match( self, pat: str, case: bool = True, flags: int = 0, na: Scalar | None = None @@ -356,51 +356,21 @@ def _str_slice( ) -> Self: if stop is None: return super()._str_slice(start, stop, step) - if start is None: - start = 0 - if step is None: - step = 1 - return type(self)( - pc.utf8_slice_codeunits(self._pa_array, start=start, stop=stop, step=step) - ) + return ArrowExtensionArray._str_slice(self, start=start, stop=stop, step=step) def _str_len(self): result = pc.utf8_length(self._pa_array) return self._convert_int_result(result) - def _str_lower(self) -> Self: - return type(self)(pc.utf8_lower(self._pa_array)) - - def _str_upper(self) -> Self: - return type(self)(pc.utf8_upper(self._pa_array)) - - def _str_strip(self, to_strip=None) -> Self: - if to_strip is None: - result = pc.utf8_trim_whitespace(self._pa_array) - else: - result = pc.utf8_trim(self._pa_array, characters=to_strip) - return type(self)(result) - - def _str_lstrip(self, to_strip=None) -> Self: - if to_strip is None: - result = pc.utf8_ltrim_whitespace(self._pa_array) - else: - result = pc.utf8_ltrim(self._pa_array, characters=to_strip) - return type(self)(result) - - def _str_rstrip(self, to_strip=None) -> Self: - if to_strip is None: - result = pc.utf8_rtrim_whitespace(self._pa_array) - else: - result = pc.utf8_rtrim(self._pa_array, characters=to_strip) - return type(self)(result) + _str_lower = ArrowExtensionArray._str_lower + _str_upper = ArrowExtensionArray._str_upper + _str_strip = ArrowExtensionArray._str_strip + _str_lstrip = ArrowExtensionArray._str_lstrip + _str_rstrip = ArrowExtensionArray._str_rstrip def _str_removeprefix(self, prefix: str): if not pa_version_under13p0: - starts_with = pc.starts_with(self._pa_array, pattern=prefix) - removed = pc.utf8_slice_codeunits(self._pa_array, len(prefix)) - result = pc.if_else(starts_with, removed, self._pa_array) - return type(self)(result) + return ArrowExtensionArray._str_removeprefix(self, prefix) return super()._str_removeprefix(prefix) def _str_removesuffix(self, suffix: str): From 284b8898fd5235d4c6da27279aec0091f31f0f86 Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 19 Aug 2024 14:37:41 -0700 Subject: [PATCH 2/8] REF: de-duplicate ArrowStringArray methods --- pandas/core/arrays/string_arrow.py | 23 +++-------------------- 1 file changed, 3 insertions(+), 20 deletions(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index deb63d18bf2bc..7852825600493 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -337,20 +337,6 @@ def _str_repeat(self, repeats: int | Sequence[int]): else: return ArrowExtensionArray._str_repeat(self, repeats=repeats) - def _str_match( - self, pat: str, case: bool = True, flags: int = 0, na: Scalar | None = None - ): - if not pat.startswith("^"): - pat = f"^{pat}" - return self._str_contains(pat, case, flags, na, regex=True) - - def _str_fullmatch( - self, pat, case: bool = True, flags: int = 0, na: Scalar | None = None - ): - if not pat.endswith("$") or pat.endswith("\\$"): - pat = f"{pat}$" - return self._str_match(pat, case, flags, na) - def _str_slice( self, start: int | None = None, stop: int | None = None, step: int | None = None ) -> Self: @@ -362,23 +348,20 @@ def _str_len(self): result = pc.utf8_length(self._pa_array) return self._convert_int_result(result) + _str_match = ArrowExtensionArray._str_match + _str_fullmatch = ArrowExtensionArray._str_fullmatch _str_lower = ArrowExtensionArray._str_lower _str_upper = ArrowExtensionArray._str_upper _str_strip = ArrowExtensionArray._str_strip _str_lstrip = ArrowExtensionArray._str_lstrip _str_rstrip = ArrowExtensionArray._str_rstrip + _str_removesuffix = ArrowStringArrayMixin._str_removesuffix def _str_removeprefix(self, prefix: str): if not pa_version_under13p0: return ArrowExtensionArray._str_removeprefix(self, prefix) return super()._str_removeprefix(prefix) - def _str_removesuffix(self, suffix: str): - ends_with = pc.ends_with(self._pa_array, pattern=suffix) - removed = pc.utf8_slice_codeunits(self._pa_array, 0, stop=-len(suffix)) - result = pc.if_else(ends_with, removed, self._pa_array) - return type(self)(result) - def _str_count(self, pat: str, flags: int = 0): if flags: return super()._str_count(pat, flags) From 5e0679eef341aad93c6f3c982d5e04fb36c5976d Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 27 Aug 2024 07:50:06 -0700 Subject: [PATCH 3/8] CLN: remove redundant override --- pandas/core/arrays/string_arrow.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 7852825600493..3237f81858395 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -462,7 +462,6 @@ def _cmp_method(self, other, op): class ArrowStringArrayNumpySemantics(ArrowStringArray): _na_value = np.nan _str_get = ArrowStringArrayMixin._str_get - _str_removesuffix = ArrowStringArrayMixin._str_removesuffix _str_capitalize = ArrowStringArrayMixin._str_capitalize _str_title = ArrowStringArrayMixin._str_title _str_swapcase = ArrowStringArrayMixin._str_swapcase From 0bd75c0f138e8d9b041c97c3cca7ca35a4cb6c21 Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 4 Sep 2024 07:58:12 -0700 Subject: [PATCH 4/8] REF: re-use ArrowEA methods in ArrowStringArray --- pandas/core/arrays/string_arrow.py | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 3237f81858395..85e4d119af7d3 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -56,7 +56,6 @@ ArrayLike, AxisInt, Dtype, - Scalar, Self, npt, ) @@ -293,6 +292,19 @@ def astype(self, dtype, copy: bool = True): _str_startswith = ArrowStringArrayMixin._str_startswith _str_endswith = ArrowStringArrayMixin._str_endswith _str_pad = ArrowStringArrayMixin._str_pad + _str_match = ArrowExtensionArray._str_match + _str_fullmatch = ArrowExtensionArray._str_fullmatch + _str_lower = ArrowExtensionArray._str_lower + _str_upper = ArrowExtensionArray._str_upper + _str_strip = ArrowExtensionArray._str_strip + _str_lstrip = ArrowExtensionArray._str_lstrip + _str_rstrip = ArrowExtensionArray._str_rstrip + _str_removesuffix = ArrowStringArrayMixin._str_removesuffix + _str_get = ArrowStringArrayMixin._str_get + _str_capitalize = ArrowStringArrayMixin._str_capitalize + _str_title = ArrowStringArrayMixin._str_title + _str_swapcase = ArrowStringArrayMixin._str_swapcase + _str_slice_replace = ArrowStringArrayMixin._str_slice_replace def _str_contains( self, pat, case: bool = True, flags: int = 0, na=np.nan, regex: bool = True @@ -348,15 +360,6 @@ def _str_len(self): result = pc.utf8_length(self._pa_array) return self._convert_int_result(result) - _str_match = ArrowExtensionArray._str_match - _str_fullmatch = ArrowExtensionArray._str_fullmatch - _str_lower = ArrowExtensionArray._str_lower - _str_upper = ArrowExtensionArray._str_upper - _str_strip = ArrowExtensionArray._str_strip - _str_lstrip = ArrowExtensionArray._str_lstrip - _str_rstrip = ArrowExtensionArray._str_rstrip - _str_removesuffix = ArrowStringArrayMixin._str_removesuffix - def _str_removeprefix(self, prefix: str): if not pa_version_under13p0: return ArrowExtensionArray._str_removeprefix(self, prefix) @@ -461,8 +464,3 @@ def _cmp_method(self, other, op): class ArrowStringArrayNumpySemantics(ArrowStringArray): _na_value = np.nan - _str_get = ArrowStringArrayMixin._str_get - _str_capitalize = ArrowStringArrayMixin._str_capitalize - _str_title = ArrowStringArrayMixin._str_title - _str_swapcase = ArrowStringArrayMixin._str_swapcase - _str_slice_replace = ArrowStringArrayMixin._str_slice_replace From 7afa1602ee2be6e2dac693148946626f6117edb6 Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 4 Sep 2024 15:33:07 -0700 Subject: [PATCH 5/8] REF: move implementations to mixing class --- pandas/core/arrays/_arrow_string_mixins.py | 45 ++++++++++++++++++++ pandas/core/arrays/arrow/array.py | 49 +--------------------- pandas/core/arrays/string_arrow.py | 44 +++++-------------- 3 files changed, 56 insertions(+), 82 deletions(-) diff --git a/pandas/core/arrays/_arrow_string_mixins.py b/pandas/core/arrays/_arrow_string_mixins.py index 950d4cd7cc92e..1547361d955b3 100644 --- a/pandas/core/arrays/_arrow_string_mixins.py +++ b/pandas/core/arrays/_arrow_string_mixins.py @@ -50,6 +50,37 @@ def _convert_int_result(self, result): def _apply_elementwise(self, func: Callable) -> list[list[Any]]: raise NotImplementedError + def _str_len(self): + result = pc.utf8_length(self._pa_array) + return self._convert_int_result(result) + + def _str_lower(self) -> Self: + return type(self)(pc.utf8_lower(self._pa_array)) + + def _str_upper(self) -> Self: + return type(self)(pc.utf8_upper(self._pa_array)) + + def _str_strip(self, to_strip=None) -> Self: + if to_strip is None: + result = pc.utf8_trim_whitespace(self._pa_array) + else: + result = pc.utf8_trim(self._pa_array, characters=to_strip) + return type(self)(result) + + def _str_lstrip(self, to_strip=None) -> Self: + if to_strip is None: + result = pc.utf8_ltrim_whitespace(self._pa_array) + else: + result = pc.utf8_ltrim(self._pa_array, characters=to_strip) + return type(self)(result) + + def _str_rstrip(self, to_strip=None) -> Self: + if to_strip is None: + result = pc.utf8_rtrim_whitespace(self._pa_array) + else: + result = pc.utf8_rtrim(self._pa_array, characters=to_strip) + return type(self)(result) + def _str_pad( self, width: int, @@ -247,3 +278,17 @@ def _str_find(self, sub: str, start: int = 0, end: int | None = None): offset_result = pc.add(result, start_offset) result = pc.if_else(found, offset_result, -1) return self._convert_int_result(result) + + def _str_match( + self, pat: str, case: bool = True, flags: int = 0, na: Scalar | None = None + ): + if not pat.startswith("^"): + pat = f"^{pat}" + return self._str_contains(pat, case, flags, na, regex=True) + + def _str_fullmatch( + self, pat, case: bool = True, flags: int = 0, na: Scalar | None = None + ): + if not pat.endswith("$") or pat.endswith("\\$"): + pat = f"{pat}$" + return self._str_match(pat, case, flags, na) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 15f9ba611a642..a46156b6b7db3 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -1998,7 +1998,7 @@ def _rank( """ See Series.rank.__doc__. """ - return type(self)( + return self._convert_int_result( self._rank_calc( axis=axis, method=method, @@ -2322,9 +2322,6 @@ def _str_count(self, pat: str, flags: int = 0) -> Self: raise NotImplementedError(f"count not implemented with {flags=}") return type(self)(pc.count_substring_regex(self._pa_array, pat)) - def _result_converter(self, result): - return type(self)(result) - def _str_replace( self, pat: str | re.Pattern, @@ -2359,20 +2356,6 @@ def _str_repeat(self, repeats: int | Sequence[int]) -> Self: ) return type(self)(pc.binary_repeat(self._pa_array, repeats)) - def _str_match( - self, pat: str, case: bool = True, flags: int = 0, na: Scalar | None = None - ) -> Self: - if not pat.startswith("^"): - pat = f"^{pat}" - return self._str_contains(pat, case, flags, na, regex=True) - - def _str_fullmatch( - self, pat, case: bool = True, flags: int = 0, na: Scalar | None = None - ) -> Self: - if not pat.endswith("$") or pat.endswith("\\$"): - pat = f"{pat}$" - return self._str_match(pat, case, flags, na) - def _str_join(self, sep: str) -> Self: if pa.types.is_string(self._pa_array.type) or pa.types.is_large_string( self._pa_array.type @@ -2404,36 +2387,6 @@ def _str_slice( pc.utf8_slice_codeunits(self._pa_array, start=start, stop=stop, step=step) ) - def _str_len(self) -> Self: - return type(self)(pc.utf8_length(self._pa_array)) - - def _str_lower(self) -> Self: - return type(self)(pc.utf8_lower(self._pa_array)) - - def _str_upper(self) -> Self: - return type(self)(pc.utf8_upper(self._pa_array)) - - def _str_strip(self, to_strip=None) -> Self: - if to_strip is None: - result = pc.utf8_trim_whitespace(self._pa_array) - else: - result = pc.utf8_trim(self._pa_array, characters=to_strip) - return type(self)(result) - - def _str_lstrip(self, to_strip=None) -> Self: - if to_strip is None: - result = pc.utf8_ltrim_whitespace(self._pa_array) - else: - result = pc.utf8_ltrim(self._pa_array, characters=to_strip) - return type(self)(result) - - def _str_rstrip(self, to_strip=None) -> Self: - if to_strip is None: - result = pc.utf8_rtrim_whitespace(self._pa_array) - else: - result = pc.utf8_rtrim(self._pa_array, characters=to_strip) - return type(self)(result) - def _str_removeprefix(self, prefix: str): if not pa_version_under13p0: starts_with = pc.starts_with(self._pa_array, pattern=prefix) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 85e4d119af7d3..3a3822c556d04 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -54,7 +54,6 @@ from pandas._typing import ( ArrayLike, - AxisInt, Dtype, Self, npt, @@ -292,19 +291,22 @@ def astype(self, dtype, copy: bool = True): _str_startswith = ArrowStringArrayMixin._str_startswith _str_endswith = ArrowStringArrayMixin._str_endswith _str_pad = ArrowStringArrayMixin._str_pad - _str_match = ArrowExtensionArray._str_match - _str_fullmatch = ArrowExtensionArray._str_fullmatch - _str_lower = ArrowExtensionArray._str_lower - _str_upper = ArrowExtensionArray._str_upper - _str_strip = ArrowExtensionArray._str_strip - _str_lstrip = ArrowExtensionArray._str_lstrip - _str_rstrip = ArrowExtensionArray._str_rstrip + _str_match = ArrowStringArrayMixin._str_match + _str_fullmatch = ArrowStringArrayMixin._str_fullmatch + _str_lower = ArrowStringArrayMixin._str_lower + _str_upper = ArrowStringArrayMixin._str_upper + _str_strip = ArrowStringArrayMixin._str_strip + _str_lstrip = ArrowStringArrayMixin._str_lstrip + _str_rstrip = ArrowStringArrayMixin._str_rstrip _str_removesuffix = ArrowStringArrayMixin._str_removesuffix _str_get = ArrowStringArrayMixin._str_get _str_capitalize = ArrowStringArrayMixin._str_capitalize _str_title = ArrowStringArrayMixin._str_title _str_swapcase = ArrowStringArrayMixin._str_swapcase _str_slice_replace = ArrowStringArrayMixin._str_slice_replace + _str_len = ArrowStringArrayMixin._str_len + + _rank = ArrowExtensionArray._rank def _str_contains( self, pat, case: bool = True, flags: int = 0, na=np.nan, regex: bool = True @@ -356,10 +358,6 @@ def _str_slice( return super()._str_slice(start, stop, step) return ArrowExtensionArray._str_slice(self, start=start, stop=stop, step=step) - def _str_len(self): - result = pc.utf8_length(self._pa_array) - return self._convert_int_result(result) - def _str_removeprefix(self, prefix: str): if not pa_version_under13p0: return ArrowExtensionArray._str_removeprefix(self, prefix) @@ -421,28 +419,6 @@ def _reduce( else: return result - def _rank( - self, - *, - axis: AxisInt = 0, - method: str = "average", - na_option: str = "keep", - ascending: bool = True, - pct: bool = False, - ): - """ - See Series.rank.__doc__. - """ - return self._convert_int_result( - self._rank_calc( - axis=axis, - method=method, - na_option=na_option, - ascending=ascending, - pct=pct, - ) - ) - def value_counts(self, dropna: bool = True) -> Series: result = super().value_counts(dropna=dropna) if self.dtype.na_value is np.nan: From fb17fb4fac73b3ed27f2bfbb202d14a92804d31c Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 6 Sep 2024 09:14:40 -0700 Subject: [PATCH 6/8] re-order --- pandas/core/arrays/_arrow_string_mixins.py | 28 +++++++++++----------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/pandas/core/arrays/_arrow_string_mixins.py b/pandas/core/arrays/_arrow_string_mixins.py index 1547361d955b3..3058b4fb966f6 100644 --- a/pandas/core/arrays/_arrow_string_mixins.py +++ b/pandas/core/arrays/_arrow_string_mixins.py @@ -245,6 +245,20 @@ def _str_contains( result = result.fill_null(na) return self._convert_bool_result(result) + def _str_match( + self, pat: str, case: bool = True, flags: int = 0, na: Scalar | None = None + ): + if not pat.startswith("^"): + pat = f"^{pat}" + return self._str_contains(pat, case, flags, na, regex=True) + + def _str_fullmatch( + self, pat, case: bool = True, flags: int = 0, na: Scalar | None = None + ): + if not pat.endswith("$") or pat.endswith("\\$"): + pat = f"{pat}$" + return self._str_match(pat, case, flags, na) + def _str_find(self, sub: str, start: int = 0, end: int | None = None): if ( pa_version_under13p0 @@ -278,17 +292,3 @@ def _str_find(self, sub: str, start: int = 0, end: int | None = None): offset_result = pc.add(result, start_offset) result = pc.if_else(found, offset_result, -1) return self._convert_int_result(result) - - def _str_match( - self, pat: str, case: bool = True, flags: int = 0, na: Scalar | None = None - ): - if not pat.startswith("^"): - pat = f"^{pat}" - return self._str_contains(pat, case, flags, na, regex=True) - - def _str_fullmatch( - self, pat, case: bool = True, flags: int = 0, na: Scalar | None = None - ): - if not pat.endswith("$") or pat.endswith("\\$"): - pat = f"{pat}$" - return self._str_match(pat, case, flags, na) From feb30a4e4e0c8ea1e868705acbcb298727e7950b Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 9 Sep 2024 09:02:36 -0700 Subject: [PATCH 7/8] suggested edits --- pandas/core/arrays/string_arrow.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 3a3822c556d04..50aa48ded58a5 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -306,8 +306,6 @@ def astype(self, dtype, copy: bool = True): _str_slice_replace = ArrowStringArrayMixin._str_slice_replace _str_len = ArrowStringArrayMixin._str_len - _rank = ArrowExtensionArray._rank - def _str_contains( self, pat, case: bool = True, flags: int = 0, na=np.nan, regex: bool = True ): @@ -343,7 +341,9 @@ def _str_replace( fallback_performancewarning() return super()._str_replace(pat, repl, n, case, flags, regex) - return ArrowExtensionArray._str_replace(self, pat, repl, n, case, flags, regex) + return ArrowStringArrayMixin._str_replace( + self, pat, repl, n, case, flags, regex + ) def _str_repeat(self, repeats: int | Sequence[int]): if not isinstance(repeats, int): @@ -360,7 +360,7 @@ def _str_slice( def _str_removeprefix(self, prefix: str): if not pa_version_under13p0: - return ArrowExtensionArray._str_removeprefix(self, prefix) + return ArrowStringArrayMixin._str_removeprefix(self, prefix) return super()._str_removeprefix(prefix) def _str_count(self, pat: str, flags: int = 0): From 8f430f9818fb7fa09a4673dc9138dc0c9787114f Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 10 Sep 2024 13:23:36 -0700 Subject: [PATCH 8/8] REF: move implementations to mixin --- pandas/core/arrays/_arrow_string_mixins.py | 38 ++++++++++++++++++++++ pandas/core/arrays/arrow/array.py | 37 --------------------- 2 files changed, 38 insertions(+), 37 deletions(-) diff --git a/pandas/core/arrays/_arrow_string_mixins.py b/pandas/core/arrays/_arrow_string_mixins.py index 0db529b7262ee..aa5b28c71b12a 100644 --- a/pandas/core/arrays/_arrow_string_mixins.py +++ b/pandas/core/arrays/_arrow_string_mixins.py @@ -1,6 +1,7 @@ from __future__ import annotations from functools import partial +import re from typing import ( TYPE_CHECKING, Any, @@ -159,6 +160,33 @@ def _str_slice_replace( stop = np.iinfo(np.int64).max return type(self)(pc.utf8_replace_slice(self._pa_array, start, stop, repl)) + def _str_replace( + self, + pat: str | re.Pattern, + repl: str | Callable, + n: int = -1, + case: bool = True, + flags: int = 0, + regex: bool = True, + ) -> Self: + if isinstance(pat, re.Pattern) or callable(repl) or not case or flags: + raise NotImplementedError( + "replace is not supported with a re.Pattern, callable repl, " + "case=False, or flags!=0" + ) + + func = pc.replace_substring_regex if regex else pc.replace_substring + # https://github.com/apache/arrow/issues/39149 + # GH 56404, unexpected behavior with negative max_replacements with pyarrow. + pa_max_replacements = None if n < 0 else n + result = func( + self._pa_array, + pattern=pat, + replacement=repl, + max_replacements=pa_max_replacements, + ) + return type(self)(result) + def _str_capitalize(self) -> Self: return type(self)(pc.utf8_capitalize(self._pa_array)) @@ -168,6 +196,16 @@ def _str_title(self) -> Self: def _str_swapcase(self) -> Self: return type(self)(pc.utf8_swapcase(self._pa_array)) + def _str_removeprefix(self, prefix: str): + if not pa_version_under13p0: + starts_with = pc.starts_with(self._pa_array, pattern=prefix) + removed = pc.utf8_slice_codeunits(self._pa_array, len(prefix)) + result = pc.if_else(starts_with, removed, self._pa_array) + return type(self)(result) + predicate = lambda val: val.removeprefix(prefix) + result = self._apply_elementwise(predicate) + return type(self)(pa.chunked_array(result)) + def _str_removesuffix(self, suffix: str): ends_with = pc.ends_with(self._pa_array, pattern=suffix) removed = pc.utf8_slice_codeunits(self._pa_array, 0, stop=-len(suffix)) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index be9cca50d09de..bd94447f0cd80 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -2323,33 +2323,6 @@ def _str_count(self, pat: str, flags: int = 0) -> Self: raise NotImplementedError(f"count not implemented with {flags=}") return type(self)(pc.count_substring_regex(self._pa_array, pat)) - def _str_replace( - self, - pat: str | re.Pattern, - repl: str | Callable, - n: int = -1, - case: bool = True, - flags: int = 0, - regex: bool = True, - ) -> Self: - if isinstance(pat, re.Pattern) or callable(repl) or not case or flags: - raise NotImplementedError( - "replace is not supported with a re.Pattern, callable repl, " - "case=False, or flags!=0" - ) - - func = pc.replace_substring_regex if regex else pc.replace_substring - # https://github.com/apache/arrow/issues/39149 - # GH 56404, unexpected behavior with negative max_replacements with pyarrow. - pa_max_replacements = None if n < 0 else n - result = func( - self._pa_array, - pattern=pat, - replacement=repl, - max_replacements=pa_max_replacements, - ) - return type(self)(result) - def _str_repeat(self, repeats: int | Sequence[int]) -> Self: if not isinstance(repeats, int): raise NotImplementedError( @@ -2377,16 +2350,6 @@ def _str_rpartition(self, sep: str, expand: bool) -> Self: result = self._apply_elementwise(predicate) return type(self)(pa.chunked_array(result)) - def _str_removeprefix(self, prefix: str): - if not pa_version_under13p0: - starts_with = pc.starts_with(self._pa_array, pattern=prefix) - removed = pc.utf8_slice_codeunits(self._pa_array, len(prefix)) - result = pc.if_else(starts_with, removed, self._pa_array) - return type(self)(result) - predicate = lambda val: val.removeprefix(prefix) - result = self._apply_elementwise(predicate) - return type(self)(pa.chunked_array(result)) - def _str_casefold(self) -> Self: predicate = lambda val: val.casefold() result = self._apply_elementwise(predicate)