Skip to content

Commit 2f64ded

Browse files
committed
ENH: add Series.struct accessor for ArrowDtype[struct]
Features: * Series.struct.dtypes -- see dtypes and field names * Series.struct.field(name_or_index) -- extract a field as a Series * Series.struct.explode() -- convert all fields into a DataFrame
1 parent 9aa3f95 commit 2f64ded

File tree

6 files changed

+395
-4
lines changed

6 files changed

+395
-4
lines changed

doc/source/reference/series.rst

+23
Original file line numberDiff line numberDiff line change
@@ -525,6 +525,29 @@ Sparse-dtype specific methods and attributes are provided under the
525525
Series.sparse.from_coo
526526
Series.sparse.to_coo
527527

528+
529+
.. _api.series.struct:
530+
531+
Struct accessor
532+
~~~~~~~~~~~~~~~
533+
534+
Arrow struct-dtype specific methods and attributes are provided under the
535+
``Series.struct`` accessor.
536+
537+
.. autosummary::
538+
:toctree: api/
539+
:template: autosummary/accessor_attribute.rst
540+
541+
Series.struct.dtypes
542+
543+
.. autosummary::
544+
:toctree: api/
545+
:template: autosummary/accessor_method.rst
546+
547+
Series.struct.field
548+
Series.struct.explode
549+
550+
528551
.. _api.series.flags:
529552

530553
Flags

doc/source/whatsnew/v2.2.0.rst

+28-3
Original file line numberDiff line numberDiff line change
@@ -14,10 +14,35 @@ including other versions of pandas.
1414
Enhancements
1515
~~~~~~~~~~~~
1616

17-
.. _whatsnew_220.enhancements.enhancement1:
17+
.. _whatsnew_220.enhancements.struct_accessor:
1818

19-
enhancement1
20-
^^^^^^^^^^^^
19+
Series.struct accessor for PyArrow structured data
20+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
21+
22+
The ``Series.struct`` accessor provides attributes and methods for processing
23+
data with ``struct[pyarrow]`` dtype Series. For example,
24+
:meth:`Series.struct.explode` converts PyArrow structured data to a pandas
25+
DataFrame. (:issue:`54938`)
26+
27+
.. code-block:: ipython
28+
29+
In [1]: import pyarrow as pa
30+
...: struct_type = pa.struct([
31+
...: ("int_col", pa.int64()),
32+
...: ("string_col", pa.string()),
33+
...: ])
34+
...: struct_array = pa.array([
35+
...: {"int_col": 1, "string_col": "a"},
36+
...: {"int_col": 2, "string_col": "b"},
37+
...: {"int_col": 3, "string_col": "c"},
38+
...: ], type=struct_type)
39+
...: series = pd.Series(struct_array, dtype=pd.ArrowDtype(struct_type))
40+
In [2]: series.struct.explode()
41+
Out[2]:
42+
int_col string_col
43+
0 1 a
44+
1 2 b
45+
2 3 c
2146
2247
.. _whatsnew_220.enhancements.enhancement2:
2348

pandas/core/arrays/arrow/__init__.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
from pandas.core.arrays.arrow.accessors import StructAccessor
12
from pandas.core.arrays.arrow.array import ArrowExtensionArray
23

3-
__all__ = ["ArrowExtensionArray"]
4+
__all__ = ["ArrowExtensionArray", "StructAccessor"]

pandas/core/arrays/arrow/accessors.py

+193
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,193 @@
1+
"""Accessors for arrow-backed data."""
2+
3+
from __future__ import annotations
4+
5+
from typing import TYPE_CHECKING
6+
7+
from pandas.compat import pa_version_under7p0
8+
9+
if not pa_version_under7p0:
10+
import pyarrow as pa
11+
import pyarrow.compute as pc
12+
13+
from pandas.core.dtypes.dtypes import ArrowDtype
14+
15+
if TYPE_CHECKING:
16+
from pandas import (
17+
DataFrame,
18+
Series,
19+
)
20+
21+
22+
class StructAccessor:
23+
"""
24+
Accessor object for structured data properties of the Series values.
25+
26+
Parameters
27+
----------
28+
data : Series
29+
Series containing Arrow struct data.
30+
"""
31+
32+
_validation_msg = (
33+
"Can only use the '.struct' accessor with 'struct[pyarrow]' dtype, not {dtype}."
34+
)
35+
36+
def __init__(self, data=None) -> None:
37+
self._parent = data
38+
self._validate(data)
39+
40+
def _validate(self, data):
41+
dtype = data.dtype
42+
if not isinstance(dtype, ArrowDtype):
43+
raise TypeError(self._validation_msg.format(dtype=dtype))
44+
45+
if not pa.types.is_struct(dtype.pyarrow_dtype):
46+
raise TypeError(self._validation_msg.format(dtype=dtype))
47+
48+
@property
49+
def dtypes(self) -> Series:
50+
"""
51+
Return the dtype object of each child field of the struct.
52+
53+
Returns
54+
-------
55+
pandas.Series
56+
The data type of each child field.
57+
58+
Examples
59+
--------
60+
>>> import pyarrow as pa
61+
>>> s = pd.Series(
62+
... [
63+
... {"version": 1, "project": "pandas"},
64+
... {"version": 2, "project": "pandas"},
65+
... {"version": 1, "project": "numpy"},
66+
... ],
67+
... dtype=pd.ArrowDtype(pa.struct(
68+
... [("version", pa.int64()), ("project", pa.string())]
69+
... ))
70+
... )
71+
>>> s.struct.dtypes
72+
version int64[pyarrow]
73+
project string[pyarrow]
74+
dtype: object
75+
"""
76+
from pandas import (
77+
Index,
78+
Series,
79+
)
80+
81+
pa_type = self._parent.dtype.pyarrow_dtype
82+
types = [ArrowDtype(struct.type) for struct in pa_type]
83+
names = [struct.name for struct in pa_type]
84+
return Series(types, index=Index(names))
85+
86+
def field(self, name_or_index: str | int) -> Series:
87+
"""
88+
Extract a child field of a struct as a Series.
89+
90+
Parameters
91+
----------
92+
name_or_index : str | int
93+
Name or index of the child field to extract.
94+
95+
Returns
96+
-------
97+
pandas.Series
98+
The data corresponding to the selected child field.
99+
100+
See Also
101+
--------
102+
Series.struct.explode : Return all child fields as a DataFrame.
103+
104+
Examples
105+
--------
106+
>>> import pyarrow as pa
107+
>>> s = pd.Series(
108+
... [
109+
... {"version": 1, "project": "pandas"},
110+
... {"version": 2, "project": "pandas"},
111+
... {"version": 1, "project": "numpy"},
112+
... ],
113+
... dtype=pd.ArrowDtype(pa.struct(
114+
... [("version", pa.int64()), ("project", pa.string())]
115+
... ))
116+
... )
117+
118+
Extract by field name.
119+
120+
>>> s.struct.field("project")
121+
0 pandas
122+
1 pandas
123+
2 numpy
124+
Name: project, dtype: string[pyarrow]
125+
126+
Extract by field index.
127+
128+
>>> s.struct.field(0)
129+
0 1
130+
1 2
131+
2 1
132+
Name: version, dtype: int64[pyarrow]
133+
"""
134+
from pandas import Series
135+
136+
pa_arr = self._parent.array._pa_array
137+
if isinstance(name_or_index, int):
138+
index = name_or_index
139+
elif isinstance(name_or_index, str):
140+
index = pa_arr.type.get_field_index(name_or_index)
141+
else:
142+
raise ValueError(
143+
f"name_or_index must be an int or str, got {type(name_or_index)}"
144+
)
145+
146+
pa_field = pa_arr.type[index]
147+
field_arr = pc.struct_field(pa_arr, [index])
148+
return Series(
149+
field_arr,
150+
dtype=ArrowDtype(field_arr.type),
151+
index=self._parent.index,
152+
name=pa_field.name,
153+
)
154+
155+
def explode(self) -> DataFrame:
156+
"""
157+
Extract all child fields of a struct as a DataFrame.
158+
159+
Returns
160+
-------
161+
pandas.DataFrame
162+
The data corresponding to all child fields.
163+
164+
See Also
165+
--------
166+
Series.struct.field : Return a single child field as a Series.
167+
168+
Examples
169+
--------
170+
>>> import pyarrow as pa
171+
>>> s = pd.Series(
172+
... [
173+
... {"version": 1, "project": "pandas"},
174+
... {"version": 2, "project": "pandas"},
175+
... {"version": 1, "project": "numpy"},
176+
... ],
177+
... dtype=pd.ArrowDtype(pa.struct(
178+
... [("version", pa.int64()), ("project", pa.string())]
179+
... ))
180+
... )
181+
182+
>>> s.struct.explode()
183+
version project
184+
0 1 pandas
185+
1 2 pandas
186+
2 1 numpy
187+
"""
188+
from pandas import concat
189+
190+
pa_type = self._parent.dtype.pyarrow_dtype
191+
return concat(
192+
[self.field(i) for i in range(pa_type.num_fields)], axis="columns"
193+
)

pandas/core/series.py

+2
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,7 @@
101101
from pandas.core.accessor import CachedAccessor
102102
from pandas.core.apply import SeriesApply
103103
from pandas.core.arrays import ExtensionArray
104+
from pandas.core.arrays.arrow import StructAccessor
104105
from pandas.core.arrays.categorical import CategoricalAccessor
105106
from pandas.core.arrays.sparse import SparseAccessor
106107
from pandas.core.construction import (
@@ -5787,6 +5788,7 @@ def to_period(self, freq: str | None = None, copy: bool | None = None) -> Series
57875788
cat = CachedAccessor("cat", CategoricalAccessor)
57885789
plot = CachedAccessor("plot", pandas.plotting.PlotAccessor)
57895790
sparse = CachedAccessor("sparse", SparseAccessor)
5791+
struct = CachedAccessor("struct", StructAccessor)
57905792

57915793
# ----------------------------------------------------------------------
57925794
# Add plotting methods to Series

0 commit comments

Comments
 (0)