Skip to content

Commit 00dff2c

Browse files
committed
ENH: add Series.struct accessor for ArrowDtype[struct]
Features: * Series.struct.dtypes -- see dtypes and field names * Series.struct.field(name_or_index) -- extract a field as a Series * Series.struct.to_frame() -- convert all fields into a DataFrame
1 parent 982d619 commit 00dff2c

File tree

6 files changed

+356
-4
lines changed

6 files changed

+356
-4
lines changed

doc/source/reference/series.rst

+23
Original file line numberDiff line numberDiff line change
@@ -525,6 +525,29 @@ Sparse-dtype specific methods and attributes are provided under the
525525
Series.sparse.from_coo
526526
Series.sparse.to_coo
527527

528+
529+
.. _api.series.struct:
530+
531+
Struct accessor
532+
~~~~~~~~~~~~~~~
533+
534+
Arrow struct-dtype specific methods and attributes are provided under the
535+
``Series.struct`` accessor.
536+
537+
.. autosummary::
538+
:toctree: api/
539+
:template: autosummary/accessor_attribute.rst
540+
541+
Series.struct.dtypes
542+
543+
.. autosummary::
544+
:toctree: api/
545+
:template: autosummary/accessor_method.rst
546+
547+
Series.struct.field
548+
Series.struct.to_frame
549+
550+
528551
.. _api.series.flags:
529552

530553
Flags

doc/source/whatsnew/v2.2.0.rst

+28-3
Original file line numberDiff line numberDiff line change
@@ -14,10 +14,35 @@ including other versions of pandas.
1414
Enhancements
1515
~~~~~~~~~~~~
1616

17-
.. _whatsnew_220.enhancements.enhancement1:
17+
.. _whatsnew_220.enhancements.struct_accessor:
1818

19-
enhancement1
20-
^^^^^^^^^^^^
19+
Series.struct accessor for PyArrow structured data
20+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
21+
22+
The ``Series.struct`` accessor provides attributes and methods for processing
23+
data with ``struct[pyarrow]`` dtype Series. For example,
24+
:meth:`Series.struct.to_frame` converts PyArrow structured data to a pandas
25+
DataFrame. (:issue:`54938`)
26+
27+
.. code-block:: ipython
28+
29+
In [1]: import pyarrow as pa
30+
...: struct_type = pa.struct([
31+
...: ("int_col", pa.int64()),
32+
...: ("string_col", pa.string()),
33+
...: ])
34+
...: struct_array = pa.array([
35+
...: {"int_col": 1, "string_col": "a"},
36+
...: {"int_col": 2, "string_col": "b"},
37+
...: {"int_col": 3, "string_col": "c"},
38+
...: ], type=struct_type)
39+
...: series = pd.Series(struct_array, dtype=pd.ArrowDtype(struct_type))
40+
In [2]: series.struct.to_frame()
41+
Out[2]:
42+
int_col string_col
43+
0 1 a
44+
1 2 b
45+
2 3 c
2146
2247
.. _whatsnew_220.enhancements.enhancement2:
2348

pandas/core/arrays/arrow/__init__.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
from pandas.core.arrays.arrow.accessors import StructAccessor
12
from pandas.core.arrays.arrow.array import ArrowExtensionArray
23

3-
__all__ = ["ArrowExtensionArray"]
4+
__all__ = ["ArrowExtensionArray", "StructAccessor"]

pandas/core/arrays/arrow/accessors.py

+187
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,187 @@
1+
"""Accessors for arrow-backed data."""
2+
3+
from __future__ import annotations
4+
5+
from typing import TYPE_CHECKING
6+
7+
from pandas.compat import pa_version_under7p0
8+
9+
if not pa_version_under7p0:
10+
import pyarrow as pa
11+
import pyarrow.compute as pc
12+
13+
from pandas.core.dtypes.dtypes import ArrowDtype
14+
15+
if TYPE_CHECKING:
16+
from pandas import (
17+
DataFrame,
18+
Series,
19+
)
20+
21+
22+
class StructAccessor:
    """
    Accessor object for structured data properties of the Series values.

    Parameters
    ----------
    data : Series
        Series containing Arrow struct data.

    Raises
    ------
    AttributeError
        If ``data`` does not have an ``ArrowDtype`` wrapping a PyArrow struct
        type.
    """

    _validation_msg = "Can only use the '.struct' accessor with 'struct[pyarrow]' data."

    def __init__(self, data=None) -> None:
        self._parent = data
        self._validate(data)

    def _validate(self, data):
        """Raise ``AttributeError`` unless ``data`` holds Arrow struct values."""
        # ``getattr`` keeps the documented message even when ``data`` has no
        # ``dtype`` at all (e.g. the ``None`` default of ``__init__``).
        dtype = getattr(data, "dtype", None)
        if not isinstance(dtype, ArrowDtype):
            raise AttributeError(self._validation_msg)

        if not pa.types.is_struct(dtype.pyarrow_dtype):
            raise AttributeError(self._validation_msg)

    @property
    def dtypes(self) -> Series:
        """
        Return the dtype object of each child field of the struct.

        Returns
        -------
        pandas.Series
            The data type of each child field.

        Examples
        --------
        >>> import pyarrow as pa
        >>> s = pd.Series(
        ...     [
        ...         {"version": 1, "project": "pandas"},
        ...         {"version": 2, "project": "pandas"},
        ...         {"version": 1, "project": "numpy"},
        ...     ],
        ...     dtype=pd.ArrowDtype(pa.struct(
        ...         [("version", pa.int64()), ("project", pa.string())]
        ...     ))
        ... )
        >>> s.struct.dtypes
        version     int64[pyarrow]
        project    string[pyarrow]
        dtype: object
        """
        from pandas import (
            Index,
            Series,
        )

        pa_type = self._parent.dtype.pyarrow_dtype
        # Look every child field up once and derive both names and dtypes
        # from the same objects.
        children = [pa_type[i] for i in range(pa_type.num_fields)]
        types = [ArrowDtype(child.type) for child in children]
        names = [child.name for child in children]
        return Series(types, index=Index(names))

    def field(self, name_or_index: str | int) -> Series:
        """
        Extract a child field of a struct as a Series.

        Parameters
        ----------
        name_or_index : str | int
            Name or index of the child field to extract.

        Returns
        -------
        pandas.Series
            The data corresponding to the selected child field.

        Raises
        ------
        KeyError
            If ``name_or_index`` is a name that does not identify exactly one
            child field of the struct.

        See Also
        --------
        Series.struct.to_frame : Return all child fields as a DataFrame.

        Examples
        --------
        >>> import pyarrow as pa
        >>> s = pd.Series(
        ...     [
        ...         {"version": 1, "project": "pandas"},
        ...         {"version": 2, "project": "pandas"},
        ...         {"version": 1, "project": "numpy"},
        ...     ],
        ...     dtype=pd.ArrowDtype(pa.struct(
        ...         [("version", pa.int64()), ("project", pa.string())]
        ...     ))
        ... )

        Extract by field name.

        >>> s.struct.field("project")
        0    pandas
        1    pandas
        2     numpy
        Name: project, dtype: string[pyarrow]

        Extract by field index.

        >>> s.struct.field(0)
        0    1
        1    2
        2    1
        Name: version, dtype: int64[pyarrow]
        """
        from pandas import Series

        pa_arr = self._parent.array._pa_array
        if isinstance(name_or_index, int):
            index = name_or_index
        else:
            index = pa_arr.type.get_field_index(name_or_index)
            # ``get_field_index`` signals "not found or duplicated" with -1;
            # fail here with a clear message instead of forwarding -1 as if
            # it were a real field position.
            if index == -1:
                raise KeyError(
                    f"No unique field named {name_or_index!r} in {pa_arr.type}"
                )

        pa_field = pa_arr.type[index]
        field_arr = pc.struct_field(pa_arr, [index])
        return Series(
            field_arr,
            dtype=ArrowDtype(field_arr.type),
            index=self._parent.index,
            name=pa_field.name,
        )

    def to_frame(self) -> DataFrame:
        """
        Extract all child fields of a struct as a DataFrame.

        Returns
        -------
        pandas.DataFrame
            The data corresponding to all child fields. A struct without any
            child fields yields a DataFrame with no columns that keeps the
            index of the original Series.

        See Also
        --------
        Series.struct.field : Return a single child field as a Series.

        Examples
        --------
        >>> import pyarrow as pa
        >>> s = pd.Series(
        ...     [
        ...         {"version": 1, "project": "pandas"},
        ...         {"version": 2, "project": "pandas"},
        ...         {"version": 1, "project": "numpy"},
        ...     ],
        ...     dtype=pd.ArrowDtype(pa.struct(
        ...         [("version", pa.int64()), ("project", pa.string())]
        ...     ))
        ... )

        >>> s.struct.to_frame()
           version project
        0        1  pandas
        1        2  pandas
        2        1   numpy
        """
        from pandas import (
            DataFrame,
            concat,
        )

        pa_type = self._parent.dtype.pyarrow_dtype
        if pa_type.num_fields == 0:
            # ``concat`` raises ValueError when given nothing to concatenate.
            return DataFrame(index=self._parent.index)

        return concat(
            [self.field(i) for i in range(pa_type.num_fields)], axis="columns"
        )

pandas/core/series.py

+2
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,7 @@
101101
from pandas.core.accessor import CachedAccessor
102102
from pandas.core.apply import SeriesApply
103103
from pandas.core.arrays import ExtensionArray
104+
from pandas.core.arrays.arrow import StructAccessor
104105
from pandas.core.arrays.categorical import CategoricalAccessor
105106
from pandas.core.arrays.sparse import SparseAccessor
106107
from pandas.core.construction import (
@@ -5787,6 +5788,7 @@ def to_period(self, freq: str | None = None, copy: bool | None = None) -> Series
57875788
cat = CachedAccessor("cat", CategoricalAccessor)
57885789
plot = CachedAccessor("plot", pandas.plotting.PlotAccessor)
57895790
sparse = CachedAccessor("sparse", SparseAccessor)
5791+
struct = CachedAccessor("struct", StructAccessor)
57905792

57915793
# ----------------------------------------------------------------------
57925794
# Add plotting methods to Series
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,114 @@
1+
import re
2+
3+
import pytest
4+
5+
from pandas import (
6+
ArrowDtype,
7+
DataFrame,
8+
Index,
9+
Series,
10+
)
11+
import pandas._testing as tm
12+
13+
pa = pytest.importorskip("pyarrow")
14+
15+
16+
class TestStructAccessor:
    """Tests for the ``Series.struct`` accessor on Arrow struct data."""

    def test_struct_accessor_dtypes(self):
        """``dtypes`` reports one ArrowDtype per child field, keyed by name."""
        struct_type = pa.struct(
            [("int_col", pa.int64()), ("string_col", pa.string())]
        )
        ser = Series([], dtype=ArrowDtype(struct_type))

        expected = Series(
            [ArrowDtype(pa.int64()), ArrowDtype(pa.string())],
            index=Index(["int_col", "string_col"]),
        )
        tm.assert_series_equal(ser.struct.dtypes, expected)

    def test_struct_accessor_field(self):
        """``field`` selects a child by name or by position."""
        index = Index([-100, 42, 123])
        struct_type = pa.struct(
            [
                ("rice", pa.float64()),
                ("maize", pa.int64()),
                ("wheat", pa.string()),
            ]
        )
        rows = [
            {"rice": 1.0, "maize": -1, "wheat": "a"},
            {"rice": 2.0, "maize": 0, "wheat": "b"},
            {"rice": 3.0, "maize": 1, "wheat": "c"},
        ]
        ser = Series(rows, dtype=ArrowDtype(struct_type), index=index)

        # Lookup by field name.
        maize_expected = Series(
            [-1, 0, 1],
            dtype=ArrowDtype(pa.int64()),
            index=index,
            name="maize",
        )
        tm.assert_series_equal(ser.struct.field("maize"), maize_expected)

        # Lookup by field position.
        wheat_expected = Series(
            ["a", "b", "c"],
            dtype=ArrowDtype(pa.string()),
            index=index,
            name="wheat",
        )
        tm.assert_series_equal(ser.struct.field(2), wheat_expected)

    def test_struct_accessor_to_frame(self):
        """``to_frame`` expands each child field, nested structs included."""
        index = Index([-100, 42, 123])
        nested_type = pa.struct([("sea", pa.string())])
        outer_type = pa.struct(
            [
                ("painted", pa.int64()),
                ("snapping", nested_type),
            ]
        )
        ser = Series(
            [
                {"painted": 1, "snapping": {"sea": "green"}},
                {"painted": 2, "snapping": {"sea": "leatherback"}},
                {"painted": 3, "snapping": {"sea": "hawksbill"}},
            ],
            dtype=ArrowDtype(outer_type),
            index=index,
        )

        painted = Series([1, 2, 3], index=index, dtype=ArrowDtype(pa.int64()))
        snapping = Series(
            [{"sea": "green"}, {"sea": "leatherback"}, {"sea": "hawksbill"}],
            index=index,
            dtype=ArrowDtype(nested_type),
        )
        expected = DataFrame({"painted": painted, "snapping": snapping})

        tm.assert_frame_equal(ser.struct.to_frame(), expected)

    @pytest.mark.parametrize(
        "invalid",
        [
            pytest.param(Series([1, 2, 3], dtype="int64"), id="int64"),
            pytest.param(
                Series(["a", "b", "c"], dtype="string[pyarrow]"), id="string-pyarrow"
            ),
        ],
    )
    def test_struct_accessor_api_for_invalid(self, invalid):
        """Accessing ``.struct`` on non-struct data raises ``AttributeError``."""
        expected_msg = re.escape(
            "Can only use the '.struct' accessor with 'struct[pyarrow]' data."
        )

        with pytest.raises(AttributeError, match=expected_msg):
            invalid.struct

0 commit comments

Comments
 (0)