13
13
pa_version_under11p0 ,
14
14
)
15
15
16
+ from pandas .core .dtypes .common import is_list_like
17
+
16
18
if not pa_version_under10p1 :
17
19
import pyarrow as pa
18
20
import pyarrow .compute as pc
@@ -267,7 +269,16 @@ def dtypes(self) -> Series:
267
269
names = [struct .name for struct in pa_type ]
268
270
return Series (types , index = Index (names ))
269
271
270
- def field (self , name_or_index : str | int ) -> Series :
272
+ def field (
273
+ self ,
274
+ name_or_index : list [str ]
275
+ | list [bytes ]
276
+ | list [int ]
277
+ | pc .Expression
278
+ | bytes
279
+ | str
280
+ | int ,
281
+ ) -> Series :
271
282
"""
272
283
Extract a child field of a struct as a Series.
273
284
@@ -281,6 +292,17 @@ def field(self, name_or_index: str | int) -> Series:
281
292
pandas.Series
282
293
The data corresponding to the selected child field.
283
294
295
+ Notes
296
+ -----
297
+ The name of the resulting Series will be set using the following
298
+ rules:
299
+
300
+ - For string, bytes, or integer `name_or_index` (or a list of these, for
301
+ a nested selection), the Series name is set to the selected
302
+ field's name.
303
+ - For a :class:`pyarrow.compute.Expression`, this is set to
304
+ the string form of the expression.
305
+
284
306
See Also
285
307
--------
286
308
Series.struct.explode : Return all child fields as a DataFrame.
@@ -314,27 +336,90 @@ def field(self, name_or_index: str | int) -> Series:
314
336
1 2
315
337
2 1
316
338
Name: version, dtype: int64[pyarrow]
339
+
340
+ Or an expression
341
+
342
+ >>> import pyarrow.compute as pc
343
+ >>> s.struct.field(pc.field("project"))
344
+ 0 pandas
345
+ 1 pandas
346
+ 2 numpy
347
+ Name: project, dtype: string[pyarrow]
348
+
349
+ For nested struct types, you can pass a list of values to index
350
+ multiple levels:
351
+
352
+ >>> version_type = pa.struct([
353
+ ... ("major", pa.int64()),
354
+ ... ("minor", pa.int64()),
355
+ ... ])
356
+ >>> s = pd.Series(
357
+ ... [
358
+ ... {"version": {"major": 1, "minor": 5}, "project": "pandas"},
359
+ ... {"version": {"major": 2, "minor": 1}, "project": "pandas"},
360
+ ... {"version": {"major": 1, "minor": 26}, "project": "numpy"},
361
+ ... ],
362
+ ... dtype=pd.ArrowDtype(pa.struct(
363
+ ... [("version", version_type), ("project", pa.string())]
364
+ ... ))
365
+ ... )
366
+ >>> s.struct.field(["version", "minor"])
367
+ 0 5
368
+ 1 1
369
+ 2 26
370
+ Name: minor, dtype: int64[pyarrow]
371
+ >>> s.struct.field([0, 0])
372
+ 0 1
373
+ 1 2
374
+ 2 1
375
+ Name: major, dtype: int64[pyarrow]
317
376
"""
318
377
from pandas import Series
319
378
379
def get_name(
    level_name_or_index: list[str]
    | list[bytes]
    | list[int]
    | pc.Expression
    | bytes
    | str
    | int,
    data: pa.ChunkedArray,
) -> str | bytes:
    """
    Work out the name for the Series selected by ``level_name_or_index``.

    Parameters
    ----------
    level_name_or_index : int, str, bytes, pyarrow.compute.Expression or list-like
        Selector for a child field of ``data``. A list-like is walked one
        level at a time to reach a nested field.
    data : pyarrow.ChunkedArray
        Object whose ``.type`` is the struct type being indexed. While
        walking a list-like selector this is the result of
        ``type.field(...)`` from the previous level instead.

    Returns
    -------
    str or bytes
        For an int, the name of the field at that position. For str or
        bytes, the selector itself. For an expression, its string form.
        For a list-like, the name of the innermost selected field.

    Raises
    ------
    ValueError
        If the selector has an unsupported type, or is an empty list-like.
    """
    if isinstance(level_name_or_index, int):
        # Positional selector: look the field up to recover its name.
        return data.type.field(level_name_or_index).name
    if isinstance(level_name_or_index, (str, bytes)):
        return level_name_or_index
    if isinstance(level_name_or_index, pc.Expression):
        return str(level_name_or_index)
    if is_list_like(level_name_or_index):
        # Copy so the caller's object is never mutated or rebound.
        levels = list(level_name_or_index)
        if not levels:
            # Without this guard the loop below would not run and the
            # result would be unbound (UnboundLocalError in the original).
            raise ValueError("name_or_index must not be an empty list")
        # For nested input like [2, 1, 2]
        # iteratively get the struct and field name. The last
        # one is used for the name of the index.
        selected = data
        for level in levels:
            name = get_name(level, selected)
            selected = selected.type.field(selected.type.get_field_index(name))
            index = selected.name
        return index
    raise ValueError(
        "name_or_index must be an int, str, bytes, "
        "pyarrow.compute.Expression, or list of those"
    )
413
+
320
414
pa_arr = self ._data .array ._pa_array
321
- if isinstance (name_or_index , int ):
322
- index = name_or_index
323
- elif isinstance (name_or_index , str ):
324
- index = pa_arr .type .get_field_index (name_or_index )
325
- else :
326
- raise ValueError (
327
- "name_or_index must be an int or str, "
328
- f"got { type (name_or_index ).__name__ } "
329
- )
415
+ name = get_name (name_or_index , pa_arr )
416
+ field_arr = pc .struct_field (pa_arr , name_or_index )
330
417
331
- pa_field = pa_arr .type [index ]
332
- field_arr = pc .struct_field (pa_arr , [index ])
333
418
return Series (
334
419
field_arr ,
335
420
dtype = ArrowDtype (field_arr .type ),
336
421
index = self ._data .index ,
337
- name = pa_field . name ,
422
+ name = name ,
338
423
)
339
424
340
425
def explode (self ) -> DataFrame :
0 commit comments