5
5
Callable ,
6
6
Union ,
7
7
)
8
+ import warnings
8
9
9
10
import numpy as np
10
11
18
19
npt ,
19
20
)
20
21
from pandas .compat import pa_version_under7p0
22
+ from pandas .util ._exceptions import find_stack_level
21
23
22
24
from pandas .core .dtypes .common import (
23
25
is_bool_dtype ,
@@ -112,7 +114,7 @@ def __init__(self, values) -> None:
112
114
super ().__init__ (values )
113
115
self ._dtype = StringDtype (storage = "pyarrow" )
114
116
115
- if not pa .types .is_string (self ._data .type ):
117
+ if not pa .types .is_string (self ._pa_array .type ):
116
118
raise ValueError (
117
119
"ArrowStringArray requires a PyArrow (chunked) array of string type"
118
120
)
@@ -125,7 +127,7 @@ def __len__(self) -> int:
125
127
-------
126
128
length : int
127
129
"""
128
- return len (self ._data )
130
+ return len (self ._pa_array )
129
131
130
132
@classmethod
131
133
def _from_sequence (cls , scalars , dtype : Dtype | None = None , copy : bool = False ):
@@ -193,7 +195,7 @@ def isin(self, values) -> npt.NDArray[np.bool_]:
193
195
if not len (value_set ):
194
196
return np .zeros (len (self ), dtype = bool )
195
197
196
- result = pc .is_in (self ._data , value_set = pa .array (value_set ))
198
+ result = pc .is_in (self ._pa_array , value_set = pa .array (value_set ))
197
199
# pyarrow 2.0.0 returned nulls, so we explicitly specify dtype to convert nulls
198
200
# to False
199
201
return np .array (result , dtype = np .bool_ )
@@ -206,13 +208,24 @@ def astype(self, dtype, copy: bool = True):
206
208
return self .copy ()
207
209
return self
208
210
elif isinstance (dtype , NumericDtype ):
209
- data = self ._data .cast (pa .from_numpy_dtype (dtype .numpy_dtype ))
211
+ data = self ._pa_array .cast (pa .from_numpy_dtype (dtype .numpy_dtype ))
210
212
return dtype .__from_arrow__ (data )
211
213
elif isinstance (dtype , np .dtype ) and np .issubdtype (dtype , np .floating ):
212
214
return self .to_numpy (dtype = dtype , na_value = np .nan )
213
215
214
216
return super ().astype (dtype , copy = copy )
215
217
218
+ @property
219
+ def _data (self ):
220
+ # dask accesses ._data directly
221
+ warnings .warn (
222
+ f"{ type (self ).__name__ } ._data is a deprecated and will be removed "
223
+ "in a future version, use ._pa_array instead" ,
224
+ FutureWarning ,
225
+ stacklevel = find_stack_level (),
226
+ )
227
+ return self ._pa_array
228
+
216
229
# ------------------------------------------------------------------------
217
230
# String methods interface
218
231
@@ -292,12 +305,12 @@ def _str_contains(
292
305
fallback_performancewarning ()
293
306
return super ()._str_contains (pat , case , flags , na , regex )
294
307
else :
295
- result = pc .match_substring_regex (self ._data , pat )
308
+ result = pc .match_substring_regex (self ._pa_array , pat )
296
309
else :
297
310
if case :
298
- result = pc .match_substring (self ._data , pat )
311
+ result = pc .match_substring (self ._pa_array , pat )
299
312
else :
300
- result = pc .match_substring (pc .utf8_upper (self ._data ), pat .upper ())
313
+ result = pc .match_substring (pc .utf8_upper (self ._pa_array ), pat .upper ())
301
314
result = BooleanDtype ().__from_arrow__ (result )
302
315
if not isna (na ):
303
316
result [isna (result )] = bool (na )
@@ -325,7 +338,7 @@ def _str_replace(
325
338
return super ()._str_replace (pat , repl , n , case , flags , regex )
326
339
327
340
func = pc .replace_substring_regex if regex else pc .replace_substring
328
- result = func (self ._data , pattern = pat , replacement = repl , max_replacements = n )
341
+ result = func (self ._pa_array , pattern = pat , replacement = repl , max_replacements = n )
329
342
return type (self )(result )
330
343
331
344
def _str_match (
@@ -343,68 +356,68 @@ def _str_fullmatch(
343
356
return self ._str_match (pat , case , flags , na )
344
357
345
358
def _str_isalnum (self ):
346
- result = pc .utf8_is_alnum (self ._data )
359
+ result = pc .utf8_is_alnum (self ._pa_array )
347
360
return BooleanDtype ().__from_arrow__ (result )
348
361
349
362
def _str_isalpha (self ):
350
- result = pc .utf8_is_alpha (self ._data )
363
+ result = pc .utf8_is_alpha (self ._pa_array )
351
364
return BooleanDtype ().__from_arrow__ (result )
352
365
353
366
def _str_isdecimal (self ):
354
- result = pc .utf8_is_decimal (self ._data )
367
+ result = pc .utf8_is_decimal (self ._pa_array )
355
368
return BooleanDtype ().__from_arrow__ (result )
356
369
357
370
def _str_isdigit (self ):
358
- result = pc .utf8_is_digit (self ._data )
371
+ result = pc .utf8_is_digit (self ._pa_array )
359
372
return BooleanDtype ().__from_arrow__ (result )
360
373
361
374
def _str_islower (self ):
362
- result = pc .utf8_is_lower (self ._data )
375
+ result = pc .utf8_is_lower (self ._pa_array )
363
376
return BooleanDtype ().__from_arrow__ (result )
364
377
365
378
def _str_isnumeric (self ):
366
- result = pc .utf8_is_numeric (self ._data )
379
+ result = pc .utf8_is_numeric (self ._pa_array )
367
380
return BooleanDtype ().__from_arrow__ (result )
368
381
369
382
def _str_isspace (self ):
370
- result = pc .utf8_is_space (self ._data )
383
+ result = pc .utf8_is_space (self ._pa_array )
371
384
return BooleanDtype ().__from_arrow__ (result )
372
385
373
386
def _str_istitle (self ):
374
- result = pc .utf8_is_title (self ._data )
387
+ result = pc .utf8_is_title (self ._pa_array )
375
388
return BooleanDtype ().__from_arrow__ (result )
376
389
377
390
def _str_isupper (self ):
378
- result = pc .utf8_is_upper (self ._data )
391
+ result = pc .utf8_is_upper (self ._pa_array )
379
392
return BooleanDtype ().__from_arrow__ (result )
380
393
381
394
def _str_len (self ):
382
- result = pc .utf8_length (self ._data )
395
+ result = pc .utf8_length (self ._pa_array )
383
396
return Int64Dtype ().__from_arrow__ (result )
384
397
385
398
def _str_lower (self ):
386
- return type (self )(pc .utf8_lower (self ._data ))
399
+ return type (self )(pc .utf8_lower (self ._pa_array ))
387
400
388
401
def _str_upper (self ):
389
- return type (self )(pc .utf8_upper (self ._data ))
402
+ return type (self )(pc .utf8_upper (self ._pa_array ))
390
403
391
404
def _str_strip (self , to_strip = None ):
392
405
if to_strip is None :
393
- result = pc .utf8_trim_whitespace (self ._data )
406
+ result = pc .utf8_trim_whitespace (self ._pa_array )
394
407
else :
395
- result = pc .utf8_trim (self ._data , characters = to_strip )
408
+ result = pc .utf8_trim (self ._pa_array , characters = to_strip )
396
409
return type (self )(result )
397
410
398
411
def _str_lstrip (self , to_strip = None ):
399
412
if to_strip is None :
400
- result = pc .utf8_ltrim_whitespace (self ._data )
413
+ result = pc .utf8_ltrim_whitespace (self ._pa_array )
401
414
else :
402
- result = pc .utf8_ltrim (self ._data , characters = to_strip )
415
+ result = pc .utf8_ltrim (self ._pa_array , characters = to_strip )
403
416
return type (self )(result )
404
417
405
418
def _str_rstrip (self , to_strip = None ):
406
419
if to_strip is None :
407
- result = pc .utf8_rtrim_whitespace (self ._data )
420
+ result = pc .utf8_rtrim_whitespace (self ._pa_array )
408
421
else :
409
- result = pc .utf8_rtrim (self ._data , characters = to_strip )
422
+ result = pc .utf8_rtrim (self ._pa_array , characters = to_strip )
410
423
return type (self )(result )
0 commit comments