6
6
ABCMeta ,
7
7
abstractmethod ,
8
8
)
9
- from typing import TYPE_CHECKING
9
+ from typing import (
10
+ TYPE_CHECKING ,
11
+ cast ,
12
+ )
10
13
11
14
from pandas .compat import (
12
15
pa_version_under10p1 ,
13
16
pa_version_under11p0 ,
14
17
)
15
18
19
+ from pandas .core .dtypes .common import is_list_like
20
+
16
21
if not pa_version_under10p1 :
17
22
import pyarrow as pa
18
23
import pyarrow .compute as pc
@@ -267,15 +272,27 @@ def dtypes(self) -> Series:
267
272
names = [struct .name for struct in pa_type ]
268
273
return Series (types , index = Index (names ))
269
274
270
- def field (self , name_or_index : str | int ) -> Series :
275
+ def field (
276
+ self ,
277
+ name_or_index : list [str ]
278
+ | list [bytes ]
279
+ | list [int ]
280
+ | pc .Expression
281
+ | bytes
282
+ | str
283
+ | int ,
284
+ ) -> Series :
271
285
"""
272
286
Extract a child field of a struct as a Series.
273
287
274
288
Parameters
275
289
----------
276
- name_or_index : str | int
290
+ name_or_index : str | bytes | int | expression | list
277
291
Name or index of the child field to extract.
278
292
293
+ For list-like inputs, this will index into a nested
294
+ struct.
295
+
279
296
Returns
280
297
-------
281
298
pandas.Series
@@ -285,6 +302,19 @@ def field(self, name_or_index: str | int) -> Series:
285
302
--------
286
303
Series.struct.explode : Return all child fields as a DataFrame.
287
304
305
+ Notes
306
+ -----
307
+ The name of the resulting Series will be set using the following
308
+ rules:
309
+
310
+ - For string, bytes, or integer `name_or_index` (or a list of these, for
311
+ a nested selection), the Series name is set to the selected
312
+ field's name.
313
+ - For a :class:`pyarrow.compute.Expression`, this is set to
314
+ the string form of the expression.
315
+ - For list-like `name_or_index`, the name will be set to the
316
+ name of the final field selected.
317
+
288
318
Examples
289
319
--------
290
320
>>> import pyarrow as pa
@@ -314,27 +344,92 @@ def field(self, name_or_index: str | int) -> Series:
314
344
1 2
315
345
2 1
316
346
Name: version, dtype: int64[pyarrow]
347
+
348
+ Or an expression
349
+
350
+ >>> import pyarrow.compute as pc
351
+ >>> s.struct.field(pc.field("project"))
352
+ 0 pandas
353
+ 1 pandas
354
+ 2 numpy
355
+ Name: project, dtype: string[pyarrow]
356
+
357
+ For nested struct types, you can pass a list of values to index
358
+ multiple levels:
359
+
360
+ >>> version_type = pa.struct([
361
+ ... ("major", pa.int64()),
362
+ ... ("minor", pa.int64()),
363
+ ... ])
364
+ >>> s = pd.Series(
365
+ ... [
366
+ ... {"version": {"major": 1, "minor": 5}, "project": "pandas"},
367
+ ... {"version": {"major": 2, "minor": 1}, "project": "pandas"},
368
+ ... {"version": {"major": 1, "minor": 26}, "project": "numpy"},
369
+ ... ],
370
+ ... dtype=pd.ArrowDtype(pa.struct(
371
+ ... [("version", version_type), ("project", pa.string())]
372
+ ... ))
373
+ ... )
374
+ >>> s.struct.field(["version", "minor"])
375
+ 0 5
376
+ 1 1
377
+ 2 26
378
+ Name: minor, dtype: int64[pyarrow]
379
+ >>> s.struct.field([0, 0])
380
+ 0 1
381
+ 1 2
382
+ 2 1
383
+ Name: major, dtype: int64[pyarrow]
317
384
"""
318
385
from pandas import Series
319
386
387
def get_name(
    level_name_or_index: list[str]
    | list[bytes]
    | list[int]
    | pc.Expression
    | bytes
    | str
    | int,
    data: pa.ChunkedArray,
):
    """
    Determine the name to give the Series produced by a field selection.

    Parameters
    ----------
    level_name_or_index : int, str, bytes, pyarrow.compute.Expression or list
        The selector describing which child field is extracted.
        A list-like selects through nested structs, one level per element.
    data : object exposing ``.type``
        The struct-typed data being indexed. Only ``data.type.field`` and
        ``data.type.get_field_index`` are used.

    Returns
    -------
    str or bytes
        - int: the name of the field at that position.
        - str / bytes: the selector itself, unchanged.
        - Expression: ``str()`` of the expression.
        - list-like: the name of the last field reached.

    Raises
    ------
    ValueError
        If the selector is of an unsupported type, or is an empty
        list-like (there is no final field to take a name from).
    """
    if isinstance(level_name_or_index, int):
        return data.type.field(level_name_or_index).name
    if isinstance(level_name_or_index, (str, bytes)):
        return level_name_or_index
    if isinstance(level_name_or_index, pc.Expression):
        return str(level_name_or_index)
    if not is_list_like(level_name_or_index):
        raise ValueError(
            "name_or_index must be an int, str, bytes, "
            "pyarrow.compute.Expression, or list of those"
        )

    # For nested input like [2, 1, 2], walk down the struct one level at a
    # time. The name of the last field reached becomes the Series name.
    # The cast keeps mypy from complaining about int/str/bytes, which
    # cannot reach this point.
    steps = list(cast(list, level_name_or_index))
    if not steps:
        # Previously this fell through to ``return name`` with ``name``
        # unbound and surfaced as an UnboundLocalError.
        raise ValueError("name_or_index must not be an empty list-like")

    selected = data
    for step in steps:
        if isinstance(step, int):
            # Use the position directly. Going index -> name -> index via
            # get_field_index breaks when sibling fields share a name
            # (the lookup reports -1 for ambiguous names).
            selected = selected.type.field(step)
        else:
            step_name = get_name(step, selected)
            selected = selected.type.field(
                selected.type.get_field_index(step_name)
            )
    return selected.name
423
+
320
424
pa_arr = self ._data .array ._pa_array
321
- if isinstance (name_or_index , int ):
322
- index = name_or_index
323
- elif isinstance (name_or_index , str ):
324
- index = pa_arr .type .get_field_index (name_or_index )
325
- else :
326
- raise ValueError (
327
- "name_or_index must be an int or str, "
328
- f"got { type (name_or_index ).__name__ } "
329
- )
425
+ name = get_name (name_or_index , pa_arr )
426
+ field_arr = pc .struct_field (pa_arr , name_or_index )
330
427
331
- pa_field = pa_arr .type [index ]
332
- field_arr = pc .struct_field (pa_arr , [index ])
333
428
return Series (
334
429
field_arr ,
335
430
dtype = ArrowDtype (field_arr .type ),
336
431
index = self ._data .index ,
337
- name = pa_field . name ,
432
+ name = name ,
338
433
)
339
434
340
435
def explode (self ) -> DataFrame :
0 commit comments