2
2
3
3
from __future__ import annotations
4
4
5
+ from abc import (
6
+ ABCMeta ,
7
+ abstractmethod ,
8
+ )
5
9
from typing import TYPE_CHECKING
6
10
7
- from pandas .compat import pa_version_under10p1
11
+ from pandas .compat import (
12
+ pa_version_under10p1 ,
13
+ pa_version_under11p0 ,
14
+ )
8
15
9
16
if not pa_version_under10p1 :
10
17
import pyarrow as pa
13
20
from pandas .core .dtypes .dtypes import ArrowDtype
14
21
15
22
if TYPE_CHECKING :
23
+ from collections .abc import Iterator
24
+
16
25
from pandas import (
17
26
DataFrame ,
18
27
Series ,
19
28
)
20
29
21
30
22
- class StructAccessor :
31
+ class ArrowAccessor (metaclass = ABCMeta ):
32
+ @abstractmethod
33
+ def __init__ (self , data , validation_msg : str ) -> None :
34
+ self ._data = data
35
+ self ._validation_msg = validation_msg
36
+ self ._validate (data )
37
+
38
+ @abstractmethod
39
+ def _is_valid_pyarrow_dtype (self , pyarrow_dtype ) -> bool :
40
+ pass
41
+
42
+ def _validate (self , data ):
43
+ dtype = data .dtype
44
+ if not isinstance (dtype , ArrowDtype ):
45
+ # Raise AttributeError so that inspect can handle non-struct Series.
46
+ raise AttributeError (self ._validation_msg .format (dtype = dtype ))
47
+
48
+ if not self ._is_valid_pyarrow_dtype (dtype .pyarrow_dtype ):
49
+ # Raise AttributeError so that inspect can handle invalid Series.
50
+ raise AttributeError (self ._validation_msg .format (dtype = dtype ))
51
+
52
+ @property
53
+ def _pa_array (self ):
54
+ return self ._data .array ._pa_array
55
+
56
+
57
class ListAccessor(ArrowAccessor):
    """
    Accessor object for list data properties of the Series values.

    Parameters
    ----------
    data : Series
        Series containing Arrow list data.
    """

    def __init__(self, data=None) -> None:
        super().__init__(
            data,
            validation_msg="Can only use the '.list' accessor with "
            "'list[pyarrow]' dtype, not {dtype}.",
        )

    def _is_valid_pyarrow_dtype(self, pyarrow_dtype) -> bool:
        # Accept all three pyarrow list layouts.
        checks = (
            pa.types.is_list(pyarrow_dtype),
            pa.types.is_fixed_size_list(pyarrow_dtype),
            pa.types.is_large_list(pyarrow_dtype),
        )
        return any(checks)

    def len(self) -> Series:
        """
        Return the length of each list in the Series.

        Returns
        -------
        pandas.Series
            The length of each list.

        Examples
        --------
        >>> import pyarrow as pa
        >>> s = pd.Series(
        ...     [
        ...         [1, 2, 3],
        ...         [3],
        ...     ],
        ...     dtype=pd.ArrowDtype(pa.list_(
        ...         pa.int64()
        ...     ))
        ... )
        >>> s.list.len()
        0    3
        1    1
        dtype: int32[pyarrow]
        """
        from pandas import Series

        lengths = pc.list_value_length(self._pa_array)
        return Series(lengths, dtype=ArrowDtype(lengths.type))

    def __getitem__(self, key: int | slice) -> Series:
        """
        Index or slice lists in the Series.

        Parameters
        ----------
        key : int | slice
            Index or slice of indices to access from each list.

        Returns
        -------
        pandas.Series
            The list at requested index.

        Examples
        --------
        >>> import pyarrow as pa
        >>> s = pd.Series(
        ...     [
        ...         [1, 2, 3],
        ...         [3],
        ...     ],
        ...     dtype=pd.ArrowDtype(pa.list_(
        ...         pa.int64()
        ...     ))
        ... )
        >>> s.list[0]
        0    1
        1    3
        dtype: int64[pyarrow]
        """
        from pandas import Series

        if isinstance(key, int):
            # TODO: Support negative key but pyarrow does not allow
            # element index to be an array.
            # if key < 0:
            #     key = pc.add(key, pc.list_value_length(self._pa_array))
            elem = pc.list_element(self._pa_array, key)
            return Series(elem, dtype=ArrowDtype(elem.type))

        if isinstance(key, slice):
            if pa_version_under11p0:
                raise NotImplementedError(
                    f"List slice not supported by pyarrow {pa.__version__}."
                )

            # TODO: Support negative start/stop/step, ideally this would be
            # added upstream in pyarrow.
            # TODO: When adding negative step support, the default start
            # should become the last element of the array when step is
            # negative.
            start = 0 if key.start is None else key.start
            step = 1 if key.step is None else key.step
            sliced = pc.list_slice(self._pa_array, start, key.stop, step)
            return Series(sliced, dtype=ArrowDtype(sliced.type))

        raise ValueError(f"key must be an int or slice, got {type(key).__name__}")

    def __iter__(self) -> Iterator:
        # Lists-of-lists iteration is deliberately unsupported.
        raise TypeError(f"'{type(self).__name__}' object is not iterable")

    def flatten(self) -> Series:
        """
        Flatten list values.

        Returns
        -------
        pandas.Series
            The data from all lists in the series flattened.

        Examples
        --------
        >>> import pyarrow as pa
        >>> s = pd.Series(
        ...     [
        ...         [1, 2, 3],
        ...         [3],
        ...     ],
        ...     dtype=pd.ArrowDtype(pa.list_(
        ...         pa.int64()
        ...     ))
        ... )
        >>> s.list.flatten()
        0    1
        1    2
        2    3
        3    3
        dtype: int64[pyarrow]
        """
        from pandas import Series

        flat = pc.list_flatten(self._pa_array)
        return Series(flat, dtype=ArrowDtype(flat.type))
210
+ class StructAccessor (ArrowAccessor ):
23
211
"""
24
212
Accessor object for structured data properties of the Series values.
25
213
@@ -29,23 +217,17 @@ class StructAccessor:
29
217
Series containing Arrow struct data.
30
218
"""
31
219
32
- _validation_msg = (
33
- "Can only use the '.struct' accessor with 'struct[pyarrow]' dtype, not {dtype}."
34
- )
35
-
36
220
def __init__ (self , data = None ) -> None :
37
- self ._parent = data
38
- self ._validate (data )
39
-
40
- def _validate (self , data ):
41
- dtype = data .dtype
42
- if not isinstance (dtype , ArrowDtype ):
43
- # Raise AttributeError so that inspect can handle non-struct Series.
44
- raise AttributeError (self ._validation_msg .format (dtype = dtype ))
221
+ super ().__init__ (
222
+ data ,
223
+ validation_msg = (
224
+ "Can only use the '.struct' accessor with 'struct[pyarrow]' "
225
+ "dtype, not {dtype}."
226
+ ),
227
+ )
45
228
46
    def _is_valid_pyarrow_dtype(self, pyarrow_dtype) -> bool:
        # The '.struct' accessor only applies to pyarrow struct types.
        return pa.types.is_struct(pyarrow_dtype)
49
231
50
232
@property
51
233
def dtypes (self ) -> Series :
@@ -80,7 +262,7 @@ def dtypes(self) -> Series:
80
262
Series ,
81
263
)
82
264
83
- pa_type = self ._parent .dtype .pyarrow_dtype
265
+ pa_type = self ._data .dtype .pyarrow_dtype
84
266
types = [ArrowDtype (struct .type ) for struct in pa_type ]
85
267
names = [struct .name for struct in pa_type ]
86
268
return Series (types , index = Index (names ))
@@ -135,7 +317,7 @@ def field(self, name_or_index: str | int) -> Series:
135
317
"""
136
318
from pandas import Series
137
319
138
- pa_arr = self ._parent .array ._pa_array
320
+ pa_arr = self ._data .array ._pa_array
139
321
if isinstance (name_or_index , int ):
140
322
index = name_or_index
141
323
elif isinstance (name_or_index , str ):
@@ -151,7 +333,7 @@ def field(self, name_or_index: str | int) -> Series:
151
333
return Series (
152
334
field_arr ,
153
335
dtype = ArrowDtype (field_arr .type ),
154
- index = self ._parent .index ,
336
+ index = self ._data .index ,
155
337
name = pa_field .name ,
156
338
)
157
339
@@ -190,7 +372,7 @@ def explode(self) -> DataFrame:
190
372
"""
191
373
from pandas import concat
192
374
193
- pa_type = self ._parent . dtype . pyarrow_dtype
375
+ pa_type = self ._pa_array . type
194
376
return concat (
195
377
[self .field (i ) for i in range (pa_type .num_fields )], axis = "columns"
196
378
)
0 commit comments