14
14
missing as libmissing ,
15
15
)
16
16
from pandas ._libs .arrays import NDArrayBacked
17
- from pandas .compat import pa_version_under7p0
17
+ from pandas .compat import (
18
+ is_numpy_dev ,
19
+ pa_version_under7p0 ,
20
+ )
18
21
from pandas .compat .numpy import function as nv
19
22
from pandas .util ._decorators import doc
20
23
24
27
register_extension_dtype ,
25
28
)
26
29
from pandas .core .dtypes .common import (
30
+ get_string_dtype ,
27
31
is_array_like ,
28
32
is_bool_dtype ,
29
33
is_integer_dtype ,
@@ -76,7 +80,7 @@ class StringDtype(StorageExtensionDtype):
76
80
77
81
Parameters
78
82
----------
79
- storage : {"python", "pyarrow"}, optional
83
+ storage : {"python", "pyarrow", "numpy" }, optional
80
84
If not given, the value of ``pd.options.mode.string_storage``.
81
85
82
86
Attributes
@@ -108,14 +112,17 @@ def na_value(self) -> libmissing.NAType:
108
112
def __init__ (self , storage = None ) -> None :
109
113
if storage is None :
110
114
storage = get_option ("mode.string_storage" )
111
- if storage not in {"python" , "pyarrow" }:
115
+ if storage not in {"python" , "pyarrow" , "numpy" }:
112
116
raise ValueError (
113
- f"Storage must be 'python' or 'pyarrow'. Got { storage } instead."
117
+ "Storage must be 'python', 'pyarrow', or 'numpy'. "
118
+ f"Got {storage} instead."
114
119
)
115
120
if storage == "pyarrow" and pa_version_under7p0 :
116
121
raise ImportError (
117
122
"pyarrow>=7.0.0 is required for PyArrow backed StringArray."
118
123
)
124
+ if storage == "numpy" and not is_numpy_dev :
125
+ raise ImportError ("NumPy backed string storage requires numpy dev" )
119
126
self .storage = storage
120
127
121
128
@property
@@ -139,6 +146,7 @@ def construct_from_string(cls, string):
139
146
``'string'`` pd.options.mode.string_storage, default python
140
147
``'string[python]'`` python
141
148
``'string[pyarrow]'`` pyarrow
149
+ ``'string[numpy]'`` numpy
142
150
========================== ==============================================
143
151
144
152
Returns
@@ -160,6 +168,8 @@ def construct_from_string(cls, string):
160
168
return cls (storage = "python" )
161
169
elif string == "string[pyarrow]" :
162
170
return cls (storage = "pyarrow" )
171
+ elif string == "string[numpy]" :
172
+ return cls (storage = "numpy" )
163
173
else :
164
174
raise TypeError (f"Cannot construct a '{ cls .__name__ } ' from '{ string } '" )
165
175
@@ -179,9 +189,13 @@ def construct_array_type( # type: ignore[override]
179
189
from pandas .core .arrays .string_arrow import ArrowStringArray
180
190
181
191
if self .storage == "python" :
182
- return StringArray
183
- else :
192
+ return ObjectStringArray
193
+ elif self . storage == "pyarrow" :
184
194
return ArrowStringArray
195
+ elif self .storage == "numpy" :
196
+ return NumpyStringArray
197
+ else :
198
+ raise NotImplementedError
185
199
186
200
def __from_arrow__ (
187
201
self , array : pyarrow .Array | pyarrow .ChunkedArray
@@ -231,7 +245,7 @@ def tolist(self):
231
245
232
246
# error: Definition of "_concat_same_type" in base class "NDArrayBacked" is
233
247
# incompatible with definition in base class "ExtensionArray"
234
- class StringArray (BaseStringArray , PandasArray ): # type: ignore[misc]
248
+ class BaseNumpyStringArray (BaseStringArray , PandasArray ): # type: ignore[misc]
235
249
"""
236
250
Extension array for string data.
237
251
@@ -321,54 +335,23 @@ def __init__(self, values, copy: bool = False) -> None:
321
335
super ().__init__ (values , copy = copy )
322
336
if not isinstance (values , type (self )):
323
337
self ._validate ()
324
- NDArrayBacked .__init__ (self , self ._ndarray , StringDtype (storage = "python" ))
338
+ NDArrayBacked .__init__ (self , self ._ndarray , StringDtype (storage = self . _storage ))
325
339
326
340
def _validate (self ):
327
341
"""Validate that we only store NA or strings."""
328
342
if len (self ._ndarray ) and not lib .is_string_array (self ._ndarray , skipna = True ):
329
343
raise ValueError ("StringArray requires a sequence of strings or pandas.NA" )
330
- if self ._ndarray .dtype != "object" :
344
+ if self ._ndarray .dtype != self . _cache_dtype :
331
345
raise ValueError (
332
- "StringArray requires a sequence of strings or pandas.NA. Got "
346
+ f"{ type (self ).__name__ } requires a sequence of strings or "
347
+ "pandas.NA convertible to a NumPy array with dtype "
348
+ f"{ self ._cache_dtype } . Got "
333
349
f"'{ self ._ndarray .dtype } ' dtype instead."
334
350
)
335
- # Check to see if need to convert Na values to pd.NA
336
- if self ._ndarray .ndim > 2 :
337
- # Ravel if ndims > 2 b/c no cythonized version available
338
- lib .convert_nans_to_NA (self ._ndarray .ravel ("K" ))
339
- else :
340
- lib .convert_nans_to_NA (self ._ndarray )
341
351
342
352
@classmethod
343
353
def _from_sequence (cls , scalars , * , dtype : Dtype | None = None , copy : bool = False ):
344
- if dtype and not (isinstance (dtype , str ) and dtype == "string" ):
345
- dtype = pandas_dtype (dtype )
346
- assert isinstance (dtype , StringDtype ) and dtype .storage == "python"
347
-
348
- from pandas .core .arrays .masked import BaseMaskedArray
349
-
350
- if isinstance (scalars , BaseMaskedArray ):
351
- # avoid costly conversion to object dtype
352
- na_values = scalars ._mask
353
- result = scalars ._data
354
- result = lib .ensure_string_array (result , copy = copy , convert_na_value = False )
355
- result [na_values ] = libmissing .NA
356
-
357
- else :
358
- if hasattr (scalars , "type" ):
359
- # pyarrow array; we cannot rely on the "to_numpy" check in
360
- # ensure_string_array because calling scalars.to_numpy would set
361
- # zero_copy_only to True which caused problems see GH#52076
362
- scalars = np .array (scalars )
363
- # convert non-na-likes to str, and nan-likes to StringDtype().na_value
364
- result = lib .ensure_string_array (scalars , na_value = libmissing .NA , copy = copy )
365
-
366
- # Manually creating new array avoids the validation step in the __init__, so is
367
- # faster. Refactor need for validation?
368
- new_string_array = cls .__new__ (cls )
369
- NDArrayBacked .__init__ (new_string_array , result , StringDtype (storage = "python" ))
370
-
371
- return new_string_array
354
+ raise NotImplementedError ("_from_sequence must be implemented in subclasses" )
372
355
373
356
@classmethod
374
357
def _from_sequence_of_strings (
@@ -612,3 +595,71 @@ def _str_map(
612
595
# or .findall returns a list).
613
596
# -> We don't know the result type. E.g. `.get` can return anything.
614
597
return lib .map_infer_mask (arr , f , mask .view ("uint8" ))
598
+
599
+
600
+ class ObjectStringArray (BaseNumpyStringArray ):
601
+ _cache_dtype = "object"
602
+ _storage = "python"
603
+
604
+ def _validate (self ):
605
+ super ()._validate ()
606
+ # Check to see if need to convert Na values to pd.NA
607
+ if self ._ndarray .ndim > 2 :
608
+ # Ravel if ndims > 2 b/c no cythonized version available
609
+ lib .convert_nans_to_NA (self ._ndarray .ravel ("K" ))
610
+ else :
611
+ lib .convert_nans_to_NA (self ._ndarray )
612
+
613
+ @classmethod
614
+ def _from_sequence (cls , scalars , * , dtype : Dtype | None = None , copy : bool = False ):
615
+ if dtype and not (isinstance (dtype , str ) and dtype == "string" ):
616
+ dtype = pandas_dtype (dtype )
617
+ assert isinstance (dtype , StringDtype ) and dtype .storage == "python"
618
+
619
+ from pandas .core .arrays .masked import BaseMaskedArray
620
+
621
+ if isinstance (scalars , BaseMaskedArray ):
622
+ # avoid costly conversion to object dtype
623
+ na_values = scalars ._mask
624
+ result = scalars ._data
625
+ result = lib .ensure_string_array (result , copy = copy , convert_na_value = False )
626
+ result [na_values ] = libmissing .NA
627
+
628
+ else :
629
+ if hasattr (scalars , "type" ):
630
+ # pyarrow array; we cannot rely on the "to_numpy" check in
631
+ # ensure_string_array because calling scalars.to_numpy would set
632
+ # zero_copy_only to True which caused problems see GH#52076
633
+ scalars = np .array (scalars )
634
+ # convert non-na-likes to str, and nan-likes to StringDtype().na_value
635
+ result = lib .ensure_string_array (scalars , na_value = libmissing .NA , copy = copy )
636
+
637
+ # Manually creating new array avoids the validation step in the __init__, so is
638
+ # faster. Refactor need for validation?
639
+ new_string_array = cls .__new__ (cls )
640
+ NDArrayBacked .__init__ (
641
+ new_string_array , result , StringDtype (storage = cls ._storage )
642
+ )
643
+
644
+ return new_string_array
645
+
646
+
647
+ StringArray = ObjectStringArray
648
+
649
+
650
+ class NumpyStringArray (BaseNumpyStringArray ):
651
+ _cache_dtype = get_string_dtype ()
652
+ _storage = "numpy"
653
+
654
+ @classmethod
655
+ def _from_sequence (cls , scalars , * , dtype : Dtype | None = None , copy : bool = False ):
656
+ result = np .array (scalars , dtype = cls ._cache_dtype )
657
+
658
+ # Manually creating new array avoids the validation step in the __init__, so is
659
+ # faster. Refactor need for validation?
660
+ new_string_array = cls .__new__ (cls )
661
+ NDArrayBacked .__init__ (
662
+ new_string_array , result , StringDtype (storage = cls ._storage )
663
+ )
664
+
665
+ return new_string_array
0 commit comments