Skip to content

Commit 36aa531

Browse files
authored
ENH: add Series.struct accessor for ArrowDtype[struct] (#54977)
Features: * Series.struct.dtypes -- see dtypes and field names * Series.struct.field(name_or_index) -- extract a field as a Series * Series.struct.explode() -- convert all fields into a DataFrame
1 parent f4f598f commit 36aa531

File tree

6 files changed

+398
-1
lines changed

6 files changed

+398
-1
lines changed

doc/source/reference/series.rst

+23
Original file line numberDiff line numberDiff line change
@@ -525,6 +525,29 @@ Sparse-dtype specific methods and attributes are provided under the
525525
Series.sparse.from_coo
526526
Series.sparse.to_coo
527527

528+
529+
.. _api.series.struct:
530+
531+
Struct accessor
532+
~~~~~~~~~~~~~~~
533+
534+
Arrow struct-dtype specific methods and attributes are provided under the
535+
``Series.struct`` accessor.
536+
537+
.. autosummary::
538+
:toctree: api/
539+
:template: autosummary/accessor_attribute.rst
540+
541+
Series.struct.dtypes
542+
543+
.. autosummary::
544+
:toctree: api/
545+
:template: autosummary/accessor_method.rst
546+
547+
Series.struct.field
548+
Series.struct.explode
549+
550+
528551
.. _api.series.flags:
529552

530553
Flags

doc/source/whatsnew/v2.2.0.rst

+28
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,34 @@ There are two advantages of this engine:
3636
3737
For more, see :ref:`io.calamine` in the user guide on IO tools.
3838

39+
.. _whatsnew_220.enhancements.struct_accessor:
40+
41+
Series.struct accessor for PyArrow structured data
42+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
43+
44+
The ``Series.struct`` accessor provides attributes and methods for processing
45+
data with ``struct[pyarrow]`` dtype Series. For example,
46+
:meth:`Series.struct.explode` converts PyArrow structured data to a pandas
47+
DataFrame. (:issue:`54938`)
48+
49+
.. ipython:: python
50+
51+
import pyarrow as pa
52+
series = pd.Series(
53+
[
54+
{"project": "pandas", "version": "2.2.0"},
55+
{"project": "numpy", "version": "1.25.2"},
56+
{"project": "pyarrow", "version": "13.0.0"},
57+
],
58+
dtype=pd.ArrowDtype(
59+
pa.struct([
60+
("project", pa.string()),
61+
("version", pa.string()),
62+
])
63+
),
64+
)
65+
series.struct.explode()
66+
3967
.. _whatsnew_220.enhancements.enhancement2:
4068

4169
enhancement2

pandas/core/arrays/arrow/__init__.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
from pandas.core.arrays.arrow.accessors import StructAccessor
12
from pandas.core.arrays.arrow.array import ArrowExtensionArray
23

3-
__all__ = ["ArrowExtensionArray"]
4+
__all__ = ["ArrowExtensionArray", "StructAccessor"]

pandas/core/arrays/arrow/accessors.py

+196
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,196 @@
1+
"""Accessors for arrow-backed data."""
2+
3+
from __future__ import annotations
4+
5+
from typing import TYPE_CHECKING
6+
7+
from pandas.compat import pa_version_under7p0
8+
9+
if not pa_version_under7p0:
10+
import pyarrow as pa
11+
import pyarrow.compute as pc
12+
13+
from pandas.core.dtypes.dtypes import ArrowDtype
14+
15+
if TYPE_CHECKING:
16+
from pandas import (
17+
DataFrame,
18+
Series,
19+
)
20+
21+
22+
class StructAccessor:
    """
    Accessor object for structured data properties of the Series values.

    Parameters
    ----------
    data : Series
        Series containing Arrow struct data.
    """

    _validation_msg = (
        "Can only use the '.struct' accessor with 'struct[pyarrow]' dtype, not {dtype}."
    )

    def __init__(self, data=None) -> None:
        self._parent = data
        self._validate(data)

    def _validate(self, data):
        """
        Check that ``data`` has an Arrow struct dtype.

        Raises
        ------
        AttributeError
            If ``data.dtype`` is not an ``ArrowDtype`` wrapping a pyarrow
            struct type.
        """
        dtype = data.dtype
        # ``or`` short-circuits, so ``dtype.pyarrow_dtype`` (and pyarrow itself)
        # is only touched once we know the dtype is an ArrowDtype.
        is_arrow_struct = isinstance(dtype, ArrowDtype) and pa.types.is_struct(
            dtype.pyarrow_dtype
        )
        if not is_arrow_struct:
            # Raise AttributeError so that inspect can handle non-struct Series.
            raise AttributeError(self._validation_msg.format(dtype=dtype))

    @property
    def dtypes(self) -> Series:
        """
        Return the dtype object of each child field of the struct.

        Returns
        -------
        pandas.Series
            The data type of each child field.

        Examples
        --------
        >>> import pyarrow as pa
        >>> s = pd.Series(
        ...     [
        ...         {"version": 1, "project": "pandas"},
        ...         {"version": 2, "project": "pandas"},
        ...         {"version": 1, "project": "numpy"},
        ...     ],
        ...     dtype=pd.ArrowDtype(pa.struct(
        ...         [("version", pa.int64()), ("project", pa.string())]
        ...     ))
        ... )
        >>> s.struct.dtypes
        version     int64[pyarrow]
        project    string[pyarrow]
        dtype: object
        """
        from pandas import (
            Index,
            Series,
        )

        pa_type = self._parent.dtype.pyarrow_dtype
        # Iterating the struct type yields its child fields in order.
        types = [ArrowDtype(field.type) for field in pa_type]
        names = [field.name for field in pa_type]
        return Series(types, index=Index(names))

    def field(self, name_or_index: str | int) -> Series:
        """
        Extract a child field of a struct as a Series.

        Parameters
        ----------
        name_or_index : str | int
            Name or index of the child field to extract.

        Returns
        -------
        pandas.Series
            The data corresponding to the selected child field.

        Raises
        ------
        KeyError
            If ``name_or_index`` is a str that does not identify exactly one
            child field of the struct.
        ValueError
            If ``name_or_index`` is neither an int nor a str.

        See Also
        --------
        Series.struct.explode : Return all child fields as a DataFrame.

        Examples
        --------
        >>> import pyarrow as pa
        >>> s = pd.Series(
        ...     [
        ...         {"version": 1, "project": "pandas"},
        ...         {"version": 2, "project": "pandas"},
        ...         {"version": 1, "project": "numpy"},
        ...     ],
        ...     dtype=pd.ArrowDtype(pa.struct(
        ...         [("version", pa.int64()), ("project", pa.string())]
        ...     ))
        ... )

        Extract by field name.

        >>> s.struct.field("project")
        0    pandas
        1    pandas
        2     numpy
        Name: project, dtype: string[pyarrow]

        Extract by field index.

        >>> s.struct.field(0)
        0    1
        1    2
        2    1
        Name: version, dtype: int64[pyarrow]
        """
        from pandas import Series

        pa_arr = self._parent.array._pa_array
        if isinstance(name_or_index, int):
            index = name_or_index
        elif isinstance(name_or_index, str):
            index = pa_arr.type.get_field_index(name_or_index)
            if index < 0:
                # A negative result is pyarrow's "no unique match" sentinel.
                # Without this guard it would be used as a positional index
                # below instead of being reported as a failed lookup.
                raise KeyError(
                    f"no unique field named {name_or_index!r} in struct type"
                )
        else:
            raise ValueError(
                "name_or_index must be an int or str, "
                f"got {type(name_or_index).__name__}"
            )

        pa_field = pa_arr.type[index]
        field_arr = pc.struct_field(pa_arr, [index])
        return Series(
            field_arr,
            dtype=ArrowDtype(field_arr.type),
            index=self._parent.index,
            name=pa_field.name,
        )

    def explode(self) -> DataFrame:
        """
        Extract all child fields of a struct as a DataFrame.

        Returns
        -------
        pandas.DataFrame
            The data corresponding to all child fields.

        See Also
        --------
        Series.struct.field : Return a single child field as a Series.

        Examples
        --------
        >>> import pyarrow as pa
        >>> s = pd.Series(
        ...     [
        ...         {"version": 1, "project": "pandas"},
        ...         {"version": 2, "project": "pandas"},
        ...         {"version": 1, "project": "numpy"},
        ...     ],
        ...     dtype=pd.ArrowDtype(pa.struct(
        ...         [("version", pa.int64()), ("project", pa.string())]
        ...     ))
        ... )

        >>> s.struct.explode()
           version project
        0        1  pandas
        1        2  pandas
        2        1   numpy
        """
        from pandas import concat

        pa_type = self._parent.dtype.pyarrow_dtype
        # One Series per child field, in declaration order, joined column-wise.
        return concat(
            [self.field(i) for i in range(pa_type.num_fields)], axis="columns"
        )

pandas/core/series.py

+2
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,7 @@
101101
from pandas.core.accessor import CachedAccessor
102102
from pandas.core.apply import SeriesApply
103103
from pandas.core.arrays import ExtensionArray
104+
from pandas.core.arrays.arrow import StructAccessor
104105
from pandas.core.arrays.categorical import CategoricalAccessor
105106
from pandas.core.arrays.sparse import SparseAccessor
106107
from pandas.core.construction import (
@@ -5787,6 +5788,7 @@ def to_period(self, freq: str | None = None, copy: bool | None = None) -> Series
57875788
cat = CachedAccessor("cat", CategoricalAccessor)
57885789
plot = CachedAccessor("plot", pandas.plotting.PlotAccessor)
57895790
sparse = CachedAccessor("sparse", SparseAccessor)
5791+
struct = CachedAccessor("struct", StructAccessor)
57905792

57915793
# ----------------------------------------------------------------------
57925794
# Add plotting methods to Series

0 commit comments

Comments
 (0)