9
9
import numpy as np
10
10
11
11
from pandas ._typing import (
12
+ Dtype ,
13
+ PositionalIndexer ,
12
14
TakeIndexer ,
13
15
npt ,
14
16
)
24
26
is_array_like ,
25
27
is_bool_dtype ,
26
28
is_integer ,
29
+ is_integer_dtype ,
27
30
is_scalar ,
28
31
)
29
32
from pandas .core .dtypes .missing import isna
30
33
31
34
from pandas .core .arrays .base import ExtensionArray
32
35
from pandas .core .indexers import (
33
36
check_array_indexer ,
37
+ unpack_tuple_and_ellipses ,
34
38
validate_indices ,
35
39
)
36
40
39
43
import pyarrow .compute as pc
40
44
41
45
from pandas .core .arrays .arrow ._arrow_utils import fallback_performancewarning
46
+ from pandas .core .arrays .arrow .dtype import ArrowDtype
42
47
43
48
if TYPE_CHECKING :
44
49
from pandas import Series
48
53
49
54
class ArrowExtensionArray (ExtensionArray ):
50
55
"""
51
- Base class for ExtensionArray backed by Arrow array .
56
+ Base class for ExtensionArray backed by Arrow ChunkedArray .
52
57
"""
53
58
54
59
_data : pa .ChunkedArray
55
60
56
- def __init__ (self , values : pa .ChunkedArray ) -> None :
57
- self ._data = values
61
def __init__(self, values: pa.Array | pa.ChunkedArray) -> None:
    """
    Wrap a pyarrow array in an ArrowExtensionArray.

    Parameters
    ----------
    values : pa.Array or pa.ChunkedArray
        A ``pa.Array`` is wrapped in a single-chunk ``pa.ChunkedArray``;
        a ``pa.ChunkedArray`` is stored as-is (it is not copied here).

    Raises
    ------
    ImportError
        If ``pa_version_under1p01`` is true.
    ValueError
        If ``values`` is neither a ``pa.Array`` nor a ``pa.ChunkedArray``.
    """
    if pa_version_under1p01:
        msg = "pyarrow>=1.0.0 is required for PyArrow backed ArrowExtensionArray."
        raise ImportError(msg)

    # Normalize to a ChunkedArray so the rest of the class only has to
    # deal with a single storage type.
    if isinstance(values, pa.ChunkedArray):
        chunked = values
    elif isinstance(values, pa.Array):
        chunked = pa.chunked_array([values])
    else:
        raise ValueError(
            f"Unsupported type '{type(values)}' for ArrowExtensionArray"
        )

    self._data = chunked
    # The pandas dtype is derived from the pyarrow type of the stored data.
    self._dtype = ArrowDtype(self._data.type)
74
+
75
+ @classmethod
76
def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy=False):
    """
    Construct a new ExtensionArray from a sequence of scalars.

    Parameters
    ----------
    scalars : sequence or instance of ``cls``
        Values to convert. An existing instance of ``cls`` reuses its
        underlying pyarrow data (cast to the requested type if one is
        given); anything else is passed to ``pa.array`` with
        ``from_pandas=True``.
    dtype : Dtype, optional
        An ``ArrowDtype`` supplies its ``pyarrow_dtype`` directly; any
        other non-None value is converted with ``pa.from_numpy_dtype``.
        ``None`` leaves the pyarrow type unspecified.
    copy : bool, default False
        Accepted for interface compatibility; not used by this method.

    Returns
    -------
    Instance of ``cls``.
    """
    # Compare against None explicitly instead of relying on truthiness:
    # dtype objects may be falsy (a NumPy dtype without fields has
    # length 0), which would silently discard a requested dtype.
    if isinstance(dtype, ArrowDtype):
        pa_dtype = dtype.pyarrow_dtype
    elif dtype is not None:
        pa_dtype = pa.from_numpy_dtype(dtype)
    else:
        pa_dtype = None

    if isinstance(scalars, cls):
        data = scalars._data
        if pa_dtype is not None:
            data = data.cast(pa_dtype)
        return cls(data)

    return cls(
        pa.chunked_array(pa.array(scalars, type=pa_dtype, from_pandas=True))
    )
96
+
97
+ @classmethod
98
def _from_sequence_of_strings(cls, strings, *, dtype: Dtype | None = None, copy=False):
    """
    Construct a new ExtensionArray from a sequence of strings.

    This forwards ``strings``, ``dtype`` and ``copy`` unchanged to
    ``cls._from_sequence`` and returns its result.
    """
    forwarded = {"dtype": dtype, "copy": copy}
    return cls._from_sequence(strings, **forwarded)
105
+
106
def __getitem__(self, item: PositionalIndexer):
    """
    Select a subset of self.

    Parameters
    ----------
    item : int, slice, Ellipsis, tuple, or ndarray
        * int: The position in 'self' to get.
        * slice: A slice object, where 'start', 'stop', and 'step' are
          integers or None.
        * Ellipsis: Treated as ``slice(None)``.
        * tuple: Reduced with ``unpack_tuple_and_ellipses`` first.
        * ndarray: An integer array (dispatched to ``take``) or a
          boolean mask (dispatched to pyarrow ``filter``).

    Returns
    -------
    scalar or ExtensionArray
        A new instance of ``type(self)`` when pyarrow returns a
        ``ChunkedArray``; otherwise the Python value of the selected
        element, with ``None`` replaced by the dtype's ``na_value``.

    Raises
    ------
    IndexError
        If ``item`` is an array that is neither integer nor boolean, or
        a scalar that is not an integer.
    """
    item = check_array_indexer(self, item)

    if isinstance(item, np.ndarray):
        if len(item) == 0:
            # Removable once we migrate StringDtype[pyarrow] to ArrowDtype[string]
            is_pyarrow_string = (
                self._dtype.name == "string" and self._dtype.storage == "pyarrow"
            )
            pa_dtype = pa.string() if is_pyarrow_string else self._dtype.pyarrow_dtype
            return type(self)(pa.chunked_array([], type=pa_dtype))
        if is_integer_dtype(item.dtype):
            return self.take(item)
        if is_bool_dtype(item.dtype):
            return type(self)(self._data.filter(item))
        raise IndexError(
            "Only integers, slices and integer or "
            "boolean arrays are valid indices."
        )

    # Every ndarray case has returned or raised above, so from here on
    # ``item`` is a non-array indexer.
    if isinstance(item, tuple):
        item = unpack_tuple_and_ellipses(item)

    # error: Non-overlapping identity check (left operand type:
    # "Union[Union[int, integer[Any]], Union[slice, List[int],
    # ndarray[Any, Any]]]", right operand type: "ellipsis")
    if item is Ellipsis:  # type: ignore[comparison-overlap]
        # TODO: should be handled by pyarrow?
        item = slice(None)

    if is_scalar(item) and not is_integer(item):
        # e.g. "foo" or 2.5
        # exception message copied from numpy
        raise IndexError(
            r"only integers, slices (`:`), ellipsis (`...`), numpy.newaxis "
            r"(`None`) and integer or boolean arrays are valid indices"
        )

    # Not an array indexer, so e.g. a slice or an integer: let pyarrow
    # perform the lookup.
    selected = self._data[item]
    if isinstance(selected, pa.ChunkedArray):
        return type(self)(selected)

    py_value = selected.as_py()
    return self._dtype.na_value if py_value is None else py_value
58
177
59
178
def __arrow_array__(self, type=None):
    """
    Convert myself to a pyarrow ChunkedArray.

    The ``type`` argument is accepted but not used: the stored
    ``pa.ChunkedArray`` is returned as-is, without casting or copying.
    """
    chunked = self._data
    return chunked
62
181
63
182
def equals (self , other ) -> bool :
@@ -67,6 +186,13 @@ def equals(self, other) -> bool:
67
186
# TODO: is this documented somewhere?
68
187
return self ._data == other ._data
69
188
189
+ @property
190
def dtype(self) -> ArrowDtype:
    """
    An instance of 'ExtensionDtype'.

    Returns the ``ArrowDtype`` stored on this array as ``_dtype``.
    """
    arrow_dtype = self._dtype
    return arrow_dtype
195
+
70
196
@property
71
197
def nbytes (self ) -> int :
72
198
"""
@@ -377,7 +503,8 @@ def _indexing_key_to_indices(
377
503
378
504
def _maybe_convert_setitem_value(self, value):
    """
    Maybe convert value to be pyarrow compatible.

    Currently a pass-through: ``value`` is returned unchanged.
    """
    # TODO: Make more robust like ArrowStringArray._maybe_convert_setitem_value
    converted = value
    return converted
381
508
382
509
def _set_via_chunk_iteration (
383
510
self , indices : npt .NDArray [np .intp ], value : npt .NDArray [Any ]
0 commit comments