1
1
from __future__ import annotations
2
2
3
3
from copy import deepcopy
4
+ import functools
4
5
import operator
5
6
import re
7
+ import sys
8
+ import textwrap
6
9
from typing import (
7
10
TYPE_CHECKING ,
8
11
Any ,
12
15
TypeVar ,
13
16
cast ,
14
17
)
18
+ import unicodedata
15
19
16
20
import numpy as np
17
21
@@ -1655,6 +1659,16 @@ def _replace_with_mask(
1655
1659
result [mask ] = replacements
1656
1660
return pa .array (result , type = values .type , from_pandas = True )
1657
1661
1662
+ def _apply_elementwise (self , func : Callable ) -> list [list [Any ]]:
1663
+ """Apply a callable to each element while maintaining the chunking structure."""
1664
+ return [
1665
+ [
1666
+ None if val is None else func (val )
1667
+ for val in chunk .to_numpy (zero_copy_only = False )
1668
+ ]
1669
+ for chunk in self ._data .iterchunks ()
1670
+ ]
1671
+
1658
1672
def _str_count (self , pat : str , flags : int = 0 ):
1659
1673
if flags :
1660
1674
raise NotImplementedError (f"count not implemented with { flags = } " )
@@ -1788,14 +1802,14 @@ def _str_join(self, sep: str):
1788
1802
return type (self )(pc .binary_join (self ._data , sep ))
1789
1803
1790
1804
def _str_partition(self, sep: str, expand: bool):
    """
    Apply ``str.partition(sep)`` to each non-null element.

    The per-chunk results from ``_apply_elementwise`` are wrapped in a
    chunked array and returned as a new instance of this array type.
    ``expand`` is accepted but not used in this method.
    """

    def split_first(text):
        return text.partition(sep)

    chunks = self._apply_elementwise(split_first)
    return type(self)(pa.chunked_array(chunks))
1794
1808
1795
1809
def _str_rpartition(self, sep: str, expand: bool):
    """
    Apply ``str.rpartition(sep)`` to each non-null element.

    The per-chunk results from ``_apply_elementwise`` are wrapped in a
    chunked array and returned as a new instance of this array type.
    ``expand`` is accepted but not used in this method.
    """

    def split_last(text):
        return text.rpartition(sep)

    chunks = self._apply_elementwise(split_last)
    return type(self)(pa.chunked_array(chunks))
1799
1813
1800
1814
def _str_slice (
1801
1815
self , start : int | None = None , stop : int | None = None , step : int | None = None
@@ -1884,14 +1898,21 @@ def _str_rstrip(self, to_strip=None):
1884
1898
return type (self )(result )
1885
1899
1886
1900
def _str_removeprefix(self, prefix: str):
    """
    Remove ``prefix`` from each non-null element, element by element.

    On Python < 3.9 the ``removeprefix`` helper from
    ``pandas.util._str_methods`` is used; otherwise the built-in
    ``str.removeprefix`` method is called.
    """
    # TODO: Should work once https://github.com/apache/arrow/issues/14991 is fixed
    # starts_with = pc.starts_with(self._data, pattern=prefix)
    # removed = pc.utf8_slice_codeunits(self._data, len(prefix))
    # result = pc.if_else(starts_with, removed, self._data)
    # return type(self)(result)
    if sys.version_info < (3, 9):
        # NOTE pyupgrade will remove this when we run it with --py39-plus
        # so don't remove the unnecessary `else` statement below
        from pandas.util._str_methods import removeprefix

        strip_prefix = functools.partial(removeprefix, prefix=prefix)
    else:
        strip_prefix = lambda text: text.removeprefix(prefix)

    stripped = self._apply_elementwise(strip_prefix)
    return type(self)(pa.chunked_array(stripped))
1895
1916
1896
1917
def _str_removesuffix (self , suffix : str ):
1897
1918
ends_with = pc .ends_with (self ._data , pattern = suffix )
@@ -1900,49 +1921,59 @@ def _str_removesuffix(self, suffix: str):
1900
1921
return type (self )(result )
1901
1922
1902
1923
def _str_casefold(self):
    """Call ``.casefold()`` on each non-null element and rewrap the chunks."""
    fold = operator.methodcaller("casefold")
    folded = self._apply_elementwise(fold)
    return type(self)(pa.chunked_array(folded))
1906
1927
1907
def _str_encode(self, encoding: str, errors: str = "strict"):
    """
    Call ``.encode(encoding, errors)`` on each non-null element.

    Parameters
    ----------
    encoding : str
        Forwarded as the first argument of ``.encode``.
    errors : str, default "strict"
        Forwarded as the second argument of ``.encode``.
    """
    to_bytes = operator.methodcaller("encode", encoding, errors)
    encoded = self._apply_elementwise(to_bytes)
    return type(self)(pa.chunked_array(encoded))
1911
1932
1912
1933
def _str_extract (self , pat : str , flags : int = 0 , expand : bool = True ):
1913
1934
raise NotImplementedError (
1914
1935
"str.extract not supported with pd.ArrowDtype(pa.string())."
1915
1936
)
1916
1937
1917
def _str_findall(self, pat: str, flags: int = 0):
    """
    Run ``re.findall`` semantics over each non-null element.

    The pattern is compiled once with ``flags`` and its ``findall`` method
    is applied element by element.
    """
    compiled = re.compile(pat, flags=flags)
    matches = self._apply_elementwise(compiled.findall)
    return type(self)(pa.chunked_array(matches))
1921
1943
1922
1944
def _str_get_dummies(self, sep: str = "|"):
    """
    Build boolean indicator rows for the ``sep``-separated tokens of each element.

    Parameters
    ----------
    sep : str, default "|"
        Pattern passed to ``pc.split_pattern`` to split every element.

    Returns
    -------
    tuple
        ``(result, labels)``. ``labels`` is the sorted list of distinct
        tokens found across all elements. ``result`` is a new instance of
        this array type holding, per element, a list of booleans aligned
        with ``labels``. A null element yields an all-``False`` row.
    """
    split = pc.split_pattern(self._data, sep).combine_chunks()
    uniques = split.flatten().unique()
    uniques_sorted = uniques.take(pc.array_sort_indices(uniques))
    # Materialise the labels once; every row is compared against this list.
    labels = uniques_sorted.to_pylist()
    n_labels = len(labels)
    result_data = []
    for lst in split.to_pylist():
        if lst is None:
            result_data.append([False] * n_labels)
        else:
            # Set membership replaces a per-row Arrow array + ``is_in`` call.
            present = set(lst)
            result_data.append([label in present for label in labels])
    result = type(self)(pa.array(result_data))
    return result, labels
1957
+
1958
def _str_index(self, sub: str, start: int = 0, end: int | None = None):
    """
    Call ``.index(sub, start, end)`` on each non-null element.

    Any exception raised by ``.index`` for an element is not caught here.
    """
    locate = operator.methodcaller("index", sub, start, end)
    positions = self._apply_elementwise(locate)
    return type(self)(pa.chunked_array(positions))
1962
+
1963
def _str_rindex(self, sub: str, start: int = 0, end: int | None = None):
    """
    Call ``.rindex(sub, start, end)`` on each non-null element.

    Any exception raised by ``.rindex`` for an element is not caught here.
    """
    locate = operator.methodcaller("rindex", sub, start, end)
    positions = self._apply_elementwise(locate)
    return type(self)(pa.chunked_array(positions))
1967
+
1968
def _str_normalize(self, form: str):
    """
    Apply ``unicodedata.normalize(form, value)`` to each non-null element.

    Parameters
    ----------
    form : str
        Passed unchanged as the first argument of ``unicodedata.normalize``.
    """

    def to_form(text):
        return unicodedata.normalize(form, text)

    normalized = self._apply_elementwise(to_form)
    return type(self)(pa.chunked_array(normalized))
1972
+
1973
def _str_rfind(self, sub: str, start: int = 0, end: int | None = None):
    """
    Call ``.rfind(sub, start, end)`` on each non-null element.

    Parameters
    ----------
    sub : str
        Substring to search for.
    start : int, default 0
        Forwarded as the ``start`` argument of ``.rfind``.
    end : int or None, default None
        Forwarded as the ``end`` argument of ``.rfind``.
    """
    predicate = lambda val: val.rfind(sub, start, end)
    result = self._apply_elementwise(predicate)
    return type(self)(pa.chunked_array(result))
1946
1977
1947
1978
def _str_split (
1948
1979
self ,
@@ -1964,15 +1995,17 @@ def _str_rsplit(self, pat: str | None = None, n: int | None = -1):
1964
1995
n = None
1965
1996
return type (self )(pc .split_pattern (self ._data , pat , max_splits = n , reverse = True ))
1966
1997
1967
def _str_translate(self, table: dict[int, str]):
    """
    Call ``.translate(table)`` on each non-null element.

    Parameters
    ----------
    table : dict[int, str]
        Mapping handed unchanged to ``.translate``.
    """
    remap = operator.methodcaller("translate", table)
    translated = self._apply_elementwise(remap)
    return type(self)(pa.chunked_array(translated))
2002
+
2003
def _str_wrap(self, width: int, **kwargs):
    """
    Wrap each non-null element with ``textwrap.TextWrapper``.

    Parameters
    ----------
    width : int
        Passed to ``TextWrapper`` as ``width``.
    **kwargs
        Additional keyword arguments forwarded to ``TextWrapper``.

    Returns
    -------
    A new instance of this array type in which every non-null element is
    the wrapped lines joined by a single newline character.
    """
    tw = textwrap.TextWrapper(width=width, **kwargs)
    # Join with a bare newline: a trailing space in the separator would
    # indent every continuation line by one character.
    predicate = lambda val: "\n".join(tw.wrap(val))
    result = self._apply_elementwise(predicate)
    return type(self)(pa.chunked_array(result))
1976
2009
1977
2010
@property
1978
2011
def _dt_year (self ):
0 commit comments