Skip to content

Commit bbf9c72

Browse files
committed
ENH: add Series.struct accessor for ArrowDtype[struct]
Features: * Series.struct.dtypes -- see dtypes and field names * Series.struct.field(name_or_index) -- extract a field as a Series * Series.struct.explode() -- convert all fields into a DataFrame
1 parent 9aa3f95 commit bbf9c72

File tree

6 files changed

+397
-4
lines changed

6 files changed

+397
-4
lines changed

doc/source/reference/series.rst

+23
Original file line numberDiff line numberDiff line change
@@ -525,6 +525,29 @@ Sparse-dtype specific methods and attributes are provided under the
525525
Series.sparse.from_coo
526526
Series.sparse.to_coo
527527

528+
529+
.. _api.series.struct:
530+
531+
Struct accessor
532+
~~~~~~~~~~~~~~~
533+
534+
Arrow struct-dtype specific methods and attributes are provided under the
535+
``Series.struct`` accessor.
536+
537+
.. autosummary::
538+
:toctree: api/
539+
:template: autosummary/accessor_attribute.rst
540+
541+
Series.struct.dtypes
542+
543+
.. autosummary::
544+
:toctree: api/
545+
:template: autosummary/accessor_method.rst
546+
547+
Series.struct.field
548+
Series.struct.explode
549+
550+
528551
.. _api.series.flags:
529552

530553
Flags

doc/source/whatsnew/v2.2.0.rst

+28-3
Original file line numberDiff line numberDiff line change
@@ -14,10 +14,35 @@ including other versions of pandas.
1414
Enhancements
1515
~~~~~~~~~~~~
1616

17-
.. _whatsnew_220.enhancements.enhancement1:
17+
.. _whatsnew_220.enhancements.struct_accessor:
1818

19-
enhancement1
20-
^^^^^^^^^^^^
19+
Series.struct accessor for PyArrow structured data
20+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
21+
22+
The ``Series.struct`` accessor provides attributes and methods for processing
23+
data in a Series with ``struct[pyarrow]`` dtype. For example,
24+
:meth:`Series.struct.explode` converts PyArrow structured data to a pandas
25+
DataFrame. (:issue:`54938`)
26+
27+
.. code-block:: ipython
28+
29+
In [1]: import pyarrow as pa
30+
...: struct_type = pa.struct([
31+
...: ("int_col", pa.int64()),
32+
...: ("string_col", pa.string()),
33+
...: ])
34+
...: struct_array = pa.array([
35+
...: {"int_col": 1, "string_col": "a"},
36+
...: {"int_col": 2, "string_col": "b"},
37+
...: {"int_col": 3, "string_col": "c"},
38+
...: ], type=struct_type)
39+
...: series = pd.Series(struct_array, dtype=pd.ArrowDtype(struct_type))
40+
In [2]: series.struct.explode()
41+
Out[2]:
42+
int_col string_col
43+
0 1 a
44+
1 2 b
45+
2 3 c
2146
2247
.. _whatsnew_220.enhancements.enhancement2:
2348

pandas/core/arrays/arrow/__init__.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
from pandas.core.arrays.arrow.accessors import StructAccessor
12
from pandas.core.arrays.arrow.array import ArrowExtensionArray
23

3-
__all__ = ["ArrowExtensionArray"]
4+
__all__ = ["ArrowExtensionArray", "StructAccessor"]

pandas/core/arrays/arrow/accessors.py

+195
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,195 @@
1+
"""Accessors for arrow-backed data."""
2+
3+
from __future__ import annotations
4+
5+
from typing import TYPE_CHECKING
6+
7+
from pandas.compat import pa_version_under7p0
8+
9+
if not pa_version_under7p0:
10+
import pyarrow as pa
11+
import pyarrow.compute as pc
12+
13+
from pandas.core.dtypes.dtypes import ArrowDtype
14+
15+
if TYPE_CHECKING:
16+
from pandas import (
17+
DataFrame,
18+
Series,
19+
)
20+
21+
22+
class StructAccessor:
    """
    Accessor object for structured data properties of the Series values.

    The accessor exposes the child fields of a Series whose dtype is an
    ``ArrowDtype`` wrapping a PyArrow struct type: their dtypes
    (:attr:`dtypes`), a single field as a Series (:meth:`field`), or all
    fields as a DataFrame (:meth:`explode`).

    Parameters
    ----------
    data : Series
        Series containing Arrow struct data.

    Raises
    ------
    AttributeError
        If the dtype of ``data`` is not an ``ArrowDtype`` holding a struct
        type.
    """

    _validation_msg = (
        "Can only use the '.struct' accessor with 'struct[pyarrow]' dtype, not {dtype}."
    )

    def __init__(self, data=None) -> None:
        self._parent = data
        self._validate(data)

    def _validate(self, data):
        """Raise AttributeError unless ``data`` has an Arrow struct dtype."""
        dtype = data.dtype
        # The isinstance check must come first: ``dtype.pyarrow_dtype`` is only
        # read once the dtype is known to be an ArrowDtype.
        if not isinstance(dtype, ArrowDtype) or not pa.types.is_struct(
            dtype.pyarrow_dtype
        ):
            # Raise AttributeError so that inspect can handle non-struct Series.
            raise AttributeError(self._validation_msg.format(dtype=dtype))

    @property
    def dtypes(self) -> Series:
        """
        Return the dtype object of each child field of the struct.

        Returns
        -------
        pandas.Series
            The data type of each child field.

        Examples
        --------
        >>> import pyarrow as pa
        >>> s = pd.Series(
        ...     [
        ...         {"version": 1, "project": "pandas"},
        ...         {"version": 2, "project": "pandas"},
        ...         {"version": 1, "project": "numpy"},
        ...     ],
        ...     dtype=pd.ArrowDtype(pa.struct(
        ...         [("version", pa.int64()), ("project", pa.string())]
        ...     ))
        ... )
        >>> s.struct.dtypes
        version     int64[pyarrow]
        project    string[pyarrow]
        dtype: object
        """
        from pandas import (
            Index,
            Series,
        )

        pa_type = self._parent.dtype.pyarrow_dtype
        # Walk the struct's fields once, collecting names and dtypes together.
        names = []
        types = []
        for child in pa_type:
            names.append(child.name)
            types.append(ArrowDtype(child.type))
        return Series(types, index=Index(names))

    def field(self, name_or_index: str | int) -> Series:
        """
        Extract a child field of a struct as a Series.

        Parameters
        ----------
        name_or_index : str | int
            Name or index of the child field to extract.

        Returns
        -------
        pandas.Series
            The data corresponding to the selected child field.

        Raises
        ------
        ValueError
            If ``name_or_index`` is neither an int nor a str, or if it is a
            str that does not identify exactly one child field.

        See Also
        --------
        Series.struct.explode : Return all child fields as a DataFrame.

        Examples
        --------
        >>> import pyarrow as pa
        >>> s = pd.Series(
        ...     [
        ...         {"version": 1, "project": "pandas"},
        ...         {"version": 2, "project": "pandas"},
        ...         {"version": 1, "project": "numpy"},
        ...     ],
        ...     dtype=pd.ArrowDtype(pa.struct(
        ...         [("version", pa.int64()), ("project", pa.string())]
        ...     ))
        ... )

        Extract by field name.

        >>> s.struct.field("project")
        0    pandas
        1    pandas
        2     numpy
        Name: project, dtype: string[pyarrow]

        Extract by field index.

        >>> s.struct.field(0)
        0    1
        1    2
        2    1
        Name: version, dtype: int64[pyarrow]
        """
        from pandas import Series

        pa_arr = self._parent.array._pa_array
        if isinstance(name_or_index, int):
            index = name_or_index
        elif isinstance(name_or_index, str):
            index = pa_arr.type.get_field_index(name_or_index)
            if index == -1:
                # get_field_index signals "missing or ambiguous name" with -1.
                # Fail here with a message naming the requested field instead
                # of letting the sentinel be used as a positional index below.
                raise ValueError(
                    f"No unique field named {name_or_index!r} in {pa_arr.type}"
                )
        else:
            raise ValueError(
                f"name_or_index must be an int or str, got {type(name_or_index)}"
            )

        pa_field = pa_arr.type[index]
        field_arr = pc.struct_field(pa_arr, [index])
        return Series(
            field_arr,
            dtype=ArrowDtype(field_arr.type),
            index=self._parent.index,
            name=pa_field.name,
        )

    def explode(self) -> DataFrame:
        """
        Extract all child fields of a struct as a DataFrame.

        Each child field becomes one column, in field order; columns are
        produced by :meth:`field` and so share the parent Series' index.

        Returns
        -------
        pandas.DataFrame
            The data corresponding to all child fields.

        See Also
        --------
        Series.struct.field : Return a single child field as a Series.

        Examples
        --------
        >>> import pyarrow as pa
        >>> s = pd.Series(
        ...     [
        ...         {"version": 1, "project": "pandas"},
        ...         {"version": 2, "project": "pandas"},
        ...         {"version": 1, "project": "numpy"},
        ...     ],
        ...     dtype=pd.ArrowDtype(pa.struct(
        ...         [("version", pa.int64()), ("project", pa.string())]
        ...     ))
        ... )

        >>> s.struct.explode()
           version project
        0        1  pandas
        1        2  pandas
        2        1   numpy
        """
        from pandas import concat

        pa_type = self._parent.dtype.pyarrow_dtype
        return concat(
            [self.field(i) for i in range(pa_type.num_fields)], axis="columns"
        )

pandas/core/series.py

+2
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,7 @@
101101
from pandas.core.accessor import CachedAccessor
102102
from pandas.core.apply import SeriesApply
103103
from pandas.core.arrays import ExtensionArray
104+
from pandas.core.arrays.arrow import StructAccessor
104105
from pandas.core.arrays.categorical import CategoricalAccessor
105106
from pandas.core.arrays.sparse import SparseAccessor
106107
from pandas.core.construction import (
@@ -5787,6 +5788,7 @@ def to_period(self, freq: str | None = None, copy: bool | None = None) -> Series
57875788
cat = CachedAccessor("cat", CategoricalAccessor)
57885789
plot = CachedAccessor("plot", pandas.plotting.PlotAccessor)
57895790
sparse = CachedAccessor("sparse", SparseAccessor)
5791+
struct = CachedAccessor("struct", StructAccessor)
57905792

57915793
# ----------------------------------------------------------------------
57925794
# Add plotting methods to Series

0 commit comments

Comments
 (0)