13
13
pa_version_under11p0 ,
14
14
)
15
15
16
+ from pandas .core .dtypes .common import is_list_like
17
+
16
18
if not pa_version_under10p1 :
17
19
import pyarrow as pa
18
20
import pyarrow .compute as pc
@@ -267,7 +269,16 @@ def dtypes(self) -> Series:
267
269
names = [struct .name for struct in pa_type ]
268
270
return Series (types , index = Index (names ))
269
271
270
- def field (self , name_or_index : str | int ) -> Series :
272
+ def field (
273
+ self ,
274
+ name_or_index : list [str ]
275
+ | list [bytes ]
276
+ | list [int ]
277
+ | pc .Expression
278
+ | bytes
279
+ | str
280
+ | int ,
281
+ ) -> Series :
271
282
"""
272
283
Extract a child field of a struct as a Series.
273
284
@@ -281,6 +292,17 @@ def field(self, name_or_index: str | int) -> Series:
281
292
pandas.Series
282
293
The data corresponding to the selected child field.
283
294
295
+ Notes
296
+ -----
297
+ The name of the resulting Series will be set using the following
298
+ rules:
299
+
300
+ - For string, bytes, or integer `name_or_index` (or a list of these, for
301
+ a nested selection), the Series name is set to the selected
302
+ field's name.
303
+ - For a :class:`pyarrow.compute.Expression`, this is set to
304
+ the string form of the expression.
305
+
284
306
See Also
285
307
--------
286
308
Series.struct.explode : Return all child fields as a DataFrame.
@@ -314,27 +336,81 @@ def field(self, name_or_index: str | int) -> Series:
314
336
1 2
315
337
2 1
316
338
Name: version, dtype: int64[pyarrow]
339
+
340
+ Or an expression
341
+
342
+ >>> import pyarrow.compute as pc
343
+ >>> s.struct.field(pc.field("project"))
344
+ 0 pandas
345
+ 1 pandas
346
+ 2 numpy
347
+ Name: project, dtype: string[pyarrow]
348
+
349
+ For nested struct types, you can pass a list of values to index multiple levels:
350
+
351
+ >>> version_type = pa.struct([
352
+ ... ("major", pa.int64()),
353
+ ... ("minor", pa.int64()),
354
+ ... ])
355
+ >>> s = pd.Series(
356
+ ... [
357
+ ... {"version": {"major": 1, "minor": 5}, "project": "pandas"},
358
+ ... {"version": {"major": 2, "minor": 1}, "project": "pandas"},
359
+ ... {"version": {"major": 1, "minor": 26}, "project": "numpy"},
360
+ ... ],
361
+ ... dtype=pd.ArrowDtype(pa.struct(
362
+ ... [("version", version_type), ("project", pa.string())]
363
+ ... ))
364
+ ... )
365
+ >>> s.struct.field(["version", "minor"])
366
+ 0 5
367
+ 1 1
368
+ 2 26
369
+ Name: minor, dtype: int64[pyarrow]
370
+ >>> s.struct.field([0, 0])
371
+ 0 1
372
+ 1 2
373
+ 2 1
374
+ Name: major, dtype: int64[pyarrow]
317
375
"""
318
376
from pandas import Series
319
377
378
def get_name(level_name_or_index, data):
    """
    Work out the name for the Series produced by a field selection.

    Parameters
    ----------
    level_name_or_index : int, str, bytes, pyarrow.compute.Expression or list-like
        The selector. A list-like is treated as a nested selection, with
        one element per struct level.
    data : object exposing ``.type``
        The object whose ``.type`` is queried for its child fields. While
        walking a nested selection this is the field picked at the
        previous level.

    Returns
    -------
    object
        - int selector: the ``name`` of the field at that position.
        - str or bytes selector: the selector itself, unchanged.
        - Expression selector: ``str()`` of the expression.
        - list-like selector: the ``name`` of the innermost selected field.

    Raises
    ------
    ValueError
        If the selector is of an unsupported type, or is an empty
        list-like (there is then no field to take a name from).
    """
    if isinstance(level_name_or_index, int):
        return data.type.field(level_name_or_index).name
    if isinstance(level_name_or_index, (str, bytes)):
        return level_name_or_index
    if isinstance(level_name_or_index, pc.Expression):
        return str(level_name_or_index)
    if is_list_like(level_name_or_index):
        # Materialise once so any list-like (not only sequences) can be
        # walked front to back, and so emptiness can be checked up front.
        levels = list(level_name_or_index)
        if not levels:
            raise ValueError("name_or_index must not be an empty list")
        # For nested input like [2, 1, 2], descend one struct level per
        # element; the name of the last field reached is the result.
        selected = data
        for level in levels:
            level_name = get_name(level, selected)
            selected = selected.type.field(
                selected.type.get_field_index(level_name)
            )
        return selected.name
    raise ValueError(
        "name_or_index must be an int, str, bytes, "
        "pyarrow.compute.Expression, or list of those"
    )
404
+
320
405
pa_arr = self ._data .array ._pa_array
321
- if isinstance (name_or_index , int ):
322
- index = name_or_index
323
- elif isinstance (name_or_index , str ):
324
- index = pa_arr .type .get_field_index (name_or_index )
325
- else :
326
- raise ValueError (
327
- "name_or_index must be an int or str, "
328
- f"got { type (name_or_index ).__name__ } "
329
- )
406
+ name = get_name (name_or_index , pa_arr )
407
+ field_arr = pc .struct_field (pa_arr , name_or_index )
330
408
331
- pa_field = pa_arr .type [index ]
332
- field_arr = pc .struct_field (pa_arr , [index ])
333
409
return Series (
334
410
field_arr ,
335
411
dtype = ArrowDtype (field_arr .type ),
336
412
index = self ._data .index ,
337
- name = pa_field . name ,
413
+ name = name ,
338
414
)
339
415
340
416
def explode (self ) -> DataFrame :
0 commit comments