1
1
from __future__ import annotations
2
2
3
+ import functools
3
4
import operator
4
5
import re
6
+ import sys
7
+ import textwrap
5
8
from typing import (
6
9
TYPE_CHECKING ,
7
10
Any ,
10
13
Sequence ,
11
14
cast ,
12
15
)
16
+ import unicodedata
13
17
14
18
import numpy as np
15
19
@@ -1749,6 +1753,16 @@ def _groupby_op(
1749
1753
return result
1750
1754
return type (self )._from_sequence (result , copy = False )
1751
1755
1756
def _apply_elementwise(self, func: Callable) -> list[list[Any]]:
    """
    Map ``func`` over every non-missing value, one output list per chunk.

    Parameters
    ----------
    func : Callable
        Called with each value that is not ``None``.

    Returns
    -------
    list of list
        One inner list per chunk yielded by ``self._pa_array.iterchunks()``,
        in the same order. ``None`` values are passed through without
        calling ``func``.
    """

    def _map_chunk(chunk) -> list[Any]:
        # zero_copy_only=False permits a copying conversion of the chunk.
        values = chunk.to_numpy(zero_copy_only=False)
        return [func(item) if item is not None else None for item in values]

    return [_map_chunk(chunk) for chunk in self._pa_array.iterchunks()]
1765
+
1752
1766
def _str_count (self , pat : str , flags : int = 0 ):
1753
1767
if flags :
1754
1768
raise NotImplementedError (f"count not implemented with { flags = } " )
@@ -1882,14 +1896,14 @@ def _str_join(self, sep: str):
1882
1896
return type (self )(pc .binary_join (self ._pa_array , sep ))
1883
1897
1884
1898
def _str_partition(self, sep: str, expand: bool):
    """
    Split each string at the first occurrence of ``sep``.

    Parameters
    ----------
    sep : str
        Separator handed to ``str.partition``.
    expand : bool
        Accepted for interface compatibility; not used in this method.

    Returns
    -------
    Same type as ``self``
        Each element is the three-part result of ``str.partition``;
        missing values stay missing.
    """
    result = self._apply_elementwise(lambda val: val.partition(sep))
    # Give the type explicitly: without it pyarrow infers the type from the
    # first chunk, which yields a null type for an all-null chunk and raises
    # when there are no chunks at all.
    return type(self)(pa.chunked_array(result, type=pa.list_(pa.string())))
1888
1902
1889
1903
def _str_rpartition(self, sep: str, expand: bool):
    """
    Split each string at the last occurrence of ``sep``.

    Parameters
    ----------
    sep : str
        Separator handed to ``str.rpartition``.
    expand : bool
        Accepted for interface compatibility; not used in this method.

    Returns
    -------
    Same type as ``self``
        Each element is the three-part result of ``str.rpartition``;
        missing values stay missing.
    """
    result = self._apply_elementwise(lambda val: val.rpartition(sep))
    # Give the type explicitly: without it pyarrow infers the type from the
    # first chunk, which yields a null type for an all-null chunk and raises
    # when there are no chunks at all.
    return type(self)(pa.chunked_array(result, type=pa.list_(pa.string())))
1893
1907
1894
1908
def _str_slice (
1895
1909
self , start : int | None = None , stop : int | None = None , step : int | None = None
@@ -1978,14 +1992,21 @@ def _str_rstrip(self, to_strip=None):
1978
1992
return type (self )(result )
1979
1993
1980
1994
def _str_removeprefix(self, prefix: str):
    """
    Remove ``prefix`` from the start of each string that begins with it.

    Parameters
    ----------
    prefix : str
        Prefix to strip.

    Returns
    -------
    Same type as ``self``
        String-typed result; missing values stay missing.
    """
    # TODO: Should work once https://github.com/apache/arrow/issues/14991 is fixed
    # starts_with = pc.starts_with(self._pa_array, pattern=prefix)
    # removed = pc.utf8_slice_codeunits(self._pa_array, len(prefix))
    # result = pc.if_else(starts_with, removed, self._pa_array)
    # return type(self)(result)
    if sys.version_info < (3, 9):
        # NOTE pyupgrade will remove this when we run it with --py39-plus
        # so don't remove the unnecessary `else` statement below
        from pandas.util._str_methods import removeprefix

        predicate = functools.partial(removeprefix, prefix=prefix)
    else:
        predicate = lambda val: val.removeprefix(prefix)
    result = self._apply_elementwise(predicate)
    # Explicit type: inference from the first chunk gives a null type for an
    # all-null chunk and fails when there are no chunks.
    return type(self)(pa.chunked_array(result, type=pa.string()))
1989
2010
1990
2011
def _str_removesuffix (self , suffix : str ):
1991
2012
ends_with = pc .ends_with (self ._pa_array , pattern = suffix )
@@ -1994,49 +2015,59 @@ def _str_removesuffix(self, suffix: str):
1994
2015
return type (self )(result )
1995
2016
1996
2017
def _str_casefold(self):
    """
    Apply ``str.casefold`` to every non-missing element.

    Returns
    -------
    Same type as ``self``
        String-typed result; missing values stay missing.
    """
    result = self._apply_elementwise(lambda val: val.casefold())
    # Explicit type: inference from the first chunk gives a null type for an
    # all-null chunk and fails when there are no chunks.
    return type(self)(pa.chunked_array(result, type=pa.string()))
2000
2021
2001
def _str_encode(self, encoding: str, errors: str = "strict"):
    """
    Encode every non-missing string to bytes.

    Parameters
    ----------
    encoding : str
        Codec name passed to ``str.encode``.
    errors : str, default "strict"
        Error-handling scheme passed to ``str.encode``.

    Returns
    -------
    Same type as ``self``
        Binary-typed result; missing values stay missing.
    """
    result = self._apply_elementwise(lambda val: val.encode(encoding, errors))
    # Explicit type: inference from the first chunk gives a null type for an
    # all-null chunk and fails when there are no chunks.
    return type(self)(pa.chunked_array(result, type=pa.binary()))
2005
2026
2006
2027
def _str_extract (self , pat : str , flags : int = 0 , expand : bool = True ):
2007
2028
raise NotImplementedError (
2008
2029
"str.extract not supported with pd.ArrowDtype(pa.string())."
2009
2030
)
2010
2031
2011
def _str_findall(self, pat: str, flags: int = 0):
    """
    Collect all regex matches of ``pat`` in every non-missing string.

    Parameters
    ----------
    pat : str
        Regular expression, compiled once before the element loop.
    flags : int, default 0
        ``re`` module flags used when compiling ``pat``.

    Returns
    -------
    Same type as ``self``
        Each element holds the ``re.Pattern.findall`` result for that
        string; missing values stay missing.
    """
    compiled = re.compile(pat, flags=flags)
    # The bound method is already a one-argument callable, so no wrapper
    # lambda is needed.
    matches = self._apply_elementwise(compiled.findall)
    return type(self)(pa.chunked_array(matches))
2015
2037
2016
2038
def _str_get_dummies(self, sep: str = "|"):
    """
    Build indicator rows marking which ``sep``-separated tokens each string has.

    Parameters
    ----------
    sep : str, default "|"
        Pattern used to split every string into tokens.

    Returns
    -------
    tuple
        ``(indicators, labels)``: ``indicators`` is an instance of
        ``type(self)`` with one list of booleans per input element, and
        ``labels`` is the sorted list of distinct tokens. Position ``i``
        of each boolean list refers to ``labels[i]``.
    """
    pieces = pc.split_pattern(self._pa_array, sep).combine_chunks()
    distinct = pieces.flatten().unique()
    ordered = distinct.take(pa.compute.array_sort_indices(distinct))
    rows = [
        # A missing input has no tokens, so every indicator is False.
        [False] * len(ordered)
        if tokens is None
        else pc.is_in(ordered, pa.array(set(tokens))).to_pylist()
        for tokens in pieces.to_pylist()
    ]
    indicators = type(self)(pa.array(rows))
    return indicators, ordered.to_pylist()
2051
+
2052
def _str_index(self, sub: str, start: int = 0, end: int | None = None):
    """
    Find the lowest index of ``sub`` in every non-missing string.

    Parameters
    ----------
    sub : str
        Substring to search for.
    start : int, default 0
        Left bound of the search, as in ``str.index``.
    end : int or None, default None
        Right bound of the search, as in ``str.index``.

    Returns
    -------
    Same type as ``self``
        Integer-typed result; missing values stay missing.

    Raises
    ------
    ValueError
        Propagated from ``str.index`` when ``sub`` is absent from an element.
    """
    result = self._apply_elementwise(lambda val: val.index(sub, start, end))
    # Explicit type: inference from the first chunk gives a null type for an
    # all-null chunk and fails when there are no chunks.
    return type(self)(pa.chunked_array(result, type=pa.int64()))
2056
+
2057
def _str_rindex(self, sub: str, start: int = 0, end: int | None = None):
    """
    Find the highest index of ``sub`` in every non-missing string.

    Parameters
    ----------
    sub : str
        Substring to search for.
    start : int, default 0
        Left bound of the search, as in ``str.rindex``.
    end : int or None, default None
        Right bound of the search, as in ``str.rindex``.

    Returns
    -------
    Same type as ``self``
        Integer-typed result; missing values stay missing.

    Raises
    ------
    ValueError
        Propagated from ``str.rindex`` when ``sub`` is absent from an element.
    """
    result = self._apply_elementwise(lambda val: val.rindex(sub, start, end))
    # Explicit type: inference from the first chunk gives a null type for an
    # all-null chunk and fails when there are no chunks.
    return type(self)(pa.chunked_array(result, type=pa.int64()))
2061
+
2062
def _str_normalize(self, form: str):
    """
    Apply Unicode normalization to every non-missing string.

    Parameters
    ----------
    form : str
        Normalization form passed to ``unicodedata.normalize``.

    Returns
    -------
    Same type as ``self``
        String-typed result; missing values stay missing.
    """
    result = self._apply_elementwise(
        lambda val: unicodedata.normalize(form, val)
    )
    # Explicit type: inference from the first chunk gives a null type for an
    # all-null chunk and fails when there are no chunks.
    return type(self)(pa.chunked_array(result, type=pa.string()))
2066
+
2067
def _str_rfind(self, sub: str, start: int = 0, end: int | None = None):
    """
    Find the highest index of ``sub`` in every non-missing string.

    Parameters
    ----------
    sub : str
        Substring to search for.
    start : int, default 0
        Left bound of the search, as in ``str.rfind``.
    end : int or None, default None
        Right bound of the search, as in ``str.rfind``.

    Returns
    -------
    Same type as ``self``
        Integer-typed result holding whatever ``str.rfind`` returns for
        each element; missing values stay missing.
    """
    result = self._apply_elementwise(lambda val: val.rfind(sub, start, end))
    # Explicit type: inference from the first chunk gives a null type for an
    # all-null chunk and fails when there are no chunks.
    return type(self)(pa.chunked_array(result, type=pa.int64()))
2040
2071
2041
2072
def _str_split (
2042
2073
self ,
@@ -2060,15 +2091,17 @@ def _str_rsplit(self, pat: str | None = None, n: int | None = -1):
2060
2091
pc .split_pattern (self ._pa_array , pat , max_splits = n , reverse = True )
2061
2092
)
2062
2093
2063
def _str_translate(self, table: dict[int, str]):
    """
    Run ``str.translate`` with ``table`` on every non-missing string.

    Parameters
    ----------
    table : dict of int to str
        Translation table passed unchanged to ``str.translate``.

    Returns
    -------
    Same type as ``self``
        String-typed result; missing values stay missing.
    """
    result = self._apply_elementwise(lambda val: val.translate(table))
    # Explicit type: inference from the first chunk gives a null type for an
    # all-null chunk and fails when there are no chunks.
    return type(self)(pa.chunked_array(result, type=pa.string()))
2067
2098
2068
2099
def _str_wrap(self, width: int, **kwargs):
    """
    Wrap every non-missing string to lines of at most ``width`` characters.

    Parameters
    ----------
    width : int
        Maximum line width given to ``textwrap.TextWrapper``.
    **kwargs
        Further keyword arguments forwarded to ``textwrap.TextWrapper``.

    Returns
    -------
    Same type as ``self``
        String-typed result with wrapped lines separated by a newline;
        missing values stay missing.
    """
    # One wrapper is built up front and reused for every element.
    wrapper = textwrap.TextWrapper(width=width, **kwargs)
    # ``fill`` joins the wrapped lines with a bare newline, so continuation
    # lines do not pick up a stray leading space.
    result = self._apply_elementwise(wrapper.fill)
    # Explicit type: inference from the first chunk gives a null type for an
    # all-null chunk and fails when there are no chunks.
    return type(self)(pa.chunked_array(result, type=pa.string()))
2072
2105
2073
2106
@property
2074
2107
def _dt_year (self ):
0 commit comments