Skip to content

Commit ae4088a

Browse files
committed
ENH: add Series.struct accessor for ArrowDtype[struct]
Features: * Series.struct.dtypes -- see dtypes and field names * Series.struct.field(name_or_index) -- extract a field as a Series * Series.struct.to_frame() -- convert all fields into a DataFrame
1 parent 31d4d8b commit ae4088a

File tree

6 files changed

+344
-4
lines changed

6 files changed

+344
-4
lines changed

doc/source/reference/series.rst

+23
Original file line numberDiff line numberDiff line change
@@ -525,6 +525,29 @@ Sparse-dtype specific methods and attributes are provided under the
525525
Series.sparse.from_coo
526526
Series.sparse.to_coo
527527

528+
529+
.. _api.series.struct:
530+
531+
Struct accessor
532+
~~~~~~~~~~~~~~~
533+
534+
Arrow struct-dtype specific methods and attributes are provided under the
535+
``Series.struct`` accessor.
536+
537+
.. autosummary::
538+
:toctree: api/
539+
:template: autosummary/accessor_attribute.rst
540+
541+
Series.struct.dtypes
542+
543+
.. autosummary::
544+
:toctree: api/
545+
:template: autosummary/accessor_method.rst
546+
547+
Series.struct.field
548+
Series.struct.to_frame
549+
550+
528551
.. _api.series.flags:
529552

530553
Flags

doc/source/whatsnew/v2.2.0.rst

+28-3
Original file line numberDiff line numberDiff line change
@@ -14,10 +14,35 @@ including other versions of pandas.
1414
Enhancements
1515
~~~~~~~~~~~~
1616

17-
.. _whatsnew_220.enhancements.enhancement1:
17+
.. _whatsnew_220.enhancements.struct_accessor:
1818

19-
enhancement1
20-
^^^^^^^^^^^^
19+
Series.struct accessor for PyArrow structured data
20+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
21+
22+
The ``Series.struct`` accessor provides attributes and methods for processing
23+
data in a Series with ``struct[pyarrow]`` dtype. For example,
24+
:meth:`Series.struct.to_frame` converts PyArrow structured data to a pandas
25+
DataFrame. (:issue:`54938`)
26+
27+
.. code-block:: ipython
28+
29+
In [1]: import pyarrow as pa
30+
...: struct_type = pa.struct([
31+
...: ("int_col", pa.int64()),
32+
...: ("string_col", pa.string()),
33+
...: ])
34+
...: struct_array = pa.array([
35+
...: {"int_col": 1, "string_col": "a"},
36+
...: {"int_col": 2, "string_col": "b"},
37+
...: {"int_col": 3, "string_col": "c"},
38+
...: ], type=struct_type)
39+
...: series = pd.Series(struct_array, dtype=pd.ArrowDtype(struct_type))
40+
In [2]: series.struct.to_frame()
41+
Out[2]:
42+
int_col string_col
43+
0 1 a
44+
1 2 b
45+
2 3 c
2146
2247
.. _whatsnew_220.enhancements.enhancement2:
2348

pandas/core/arrays/arrow/__init__.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
from pandas.core.arrays.arrow.accessors import StructAccessor
12
from pandas.core.arrays.arrow.array import ArrowExtensionArray
23

3-
__all__ = ["ArrowExtensionArray"]
4+
__all__ = ["ArrowExtensionArray", "StructAccessor"]

pandas/core/arrays/arrow/accessors.py

+182
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,182 @@
1+
"""Accessors for arrow-backed data."""
2+
3+
from __future__ import annotations
4+
5+
from typing import TYPE_CHECKING
6+
7+
from pandas.compat import pa_version_under7p0
8+
9+
if not pa_version_under7p0:
10+
import pyarrow as pa
11+
import pyarrow.compute as pc
12+
13+
from pandas.core.dtypes.dtypes import ArrowDtype
14+
15+
if TYPE_CHECKING:
16+
from pandas import (
17+
DataFrame,
18+
Series,
19+
)
20+
21+
22+
class StructAccessor:
    """
    Accessor exposing the child fields of Arrow struct data held in a Series.

    Parameters
    ----------
    data : Series
        Series containing Arrow struct data.
    """

    _validation_msg = "Can only use the '.struct' accessor with 'struct[pyarrow]' data."

    def __init__(self, data=None) -> None:
        self._parent = data
        self._validate(data)

    def _validate(self, data):
        """Raise ``AttributeError`` unless ``data`` has an Arrow struct dtype."""
        dtype = data.dtype
        # ``and`` short-circuits, so pyarrow is only consulted once the dtype
        # is known to be an ArrowDtype.
        is_arrow_struct = isinstance(dtype, ArrowDtype) and pa.types.is_struct(
            dtype.pyarrow_dtype
        )
        if not is_arrow_struct:
            raise AttributeError(self._validation_msg)

    @property
    def dtypes(self) -> Series:
        """
        Return the dtype of every child field of the struct.

        Returns
        -------
        pandas.Series
            One ``ArrowDtype`` per child field, indexed by field name.

        Examples
        --------
        >>> import pyarrow as pa
        >>> s = pd.Series(
        ...     [
        ...         {"version": 1, "project": "pandas"},
        ...         {"version": 2, "project": "pandas"},
        ...         {"version": 1, "project": "numpy"},
        ...     ],
        ...     dtype=pd.ArrowDtype(pa.struct(
        ...         [("version", pa.int64()), ("project", pa.string())]
        ...     ))
        ... )
        >>> s.struct.dtypes
        version     int64[pyarrow]
        project    string[pyarrow]
        dtype: object
        """
        from pandas import (
            Index,
            Series,
        )

        struct_type = self._parent.dtype.pyarrow_dtype
        # Look each child field up once, then derive dtypes and labels from it.
        children = [struct_type[pos] for pos in range(struct_type.num_fields)]
        child_dtypes = [ArrowDtype(child.type) for child in children]
        child_names = [child.name for child in children]
        return Series(child_dtypes, index=Index(child_names))

    def field(self, name_or_index: str | int) -> Series:
        """
        Pull a single child field out of the struct as a Series.

        Parameters
        ----------
        name_or_index : str | int
            Name or index of the child field to extract.

        Returns
        -------
        pandas.Series
            The selected child field's data, named after that field.

        See Also
        --------
        Series.struct.to_frame : Return all child fields as a DataFrame.

        Examples
        --------
        >>> import pyarrow as pa
        >>> s = pd.Series(
        ...     [
        ...         {"version": 1, "project": "pandas"},
        ...         {"version": 2, "project": "pandas"},
        ...         {"version": 1, "project": "numpy"},
        ...     ],
        ...     dtype=pd.ArrowDtype(pa.struct(
        ...         [("version", pa.int64()), ("project", pa.string())]
        ...     ))
        ... )

        Extract by field name.

        >>> s.struct.field("project")
        0    pandas
        1    pandas
        2     numpy
        Name: project, dtype: string[pyarrow]

        Extract by field index.

        >>> s.struct.field(0)
        0    1
        1    2
        2    1
        Name: version, dtype: int64[pyarrow]
        """
        from pandas import Series

        struct_arr = self._parent.array._pa_array
        struct_type = struct_arr.type
        # Integers are used as positions directly; anything else is resolved
        # to a position through the struct type's name lookup.
        position = (
            name_or_index
            if isinstance(name_or_index, int)
            else struct_type.get_field_index(name_or_index)
        )

        child = struct_type[position]
        child_values = pc.struct_field(struct_arr, [position])
        return Series(
            child_values, dtype=ArrowDtype(child_values.type), name=child.name
        )

    def to_frame(self) -> DataFrame:
        """
        Expand every child field of the struct into a DataFrame column.

        Returns
        -------
        pandas.DataFrame
            One column per child field, in field order.

        See Also
        --------
        Series.struct.field : Return a single child field as a Series.

        Examples
        --------
        >>> import pyarrow as pa
        >>> s = pd.Series(
        ...     [
        ...         {"version": 1, "project": "pandas"},
        ...         {"version": 2, "project": "pandas"},
        ...         {"version": 1, "project": "numpy"},
        ...     ],
        ...     dtype=pd.ArrowDtype(pa.struct(
        ...         [("version", pa.int64()), ("project", pa.string())]
        ...     ))
        ... )

        >>> s.struct.to_frame()
           version project
        0        1  pandas
        1        2  pandas
        2        1   numpy
        """
        from pandas import concat

        n_children = self._parent.dtype.pyarrow_dtype.num_fields
        # Each child is extracted by position and joined side by side.
        columns = [self.field(pos) for pos in range(n_children)]
        return concat(columns, axis="columns")

pandas/core/series.py

+2
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,7 @@
101101
from pandas.core.accessor import CachedAccessor
102102
from pandas.core.apply import SeriesApply
103103
from pandas.core.arrays import ExtensionArray
104+
from pandas.core.arrays.arrow import StructAccessor
104105
from pandas.core.arrays.categorical import CategoricalAccessor
105106
from pandas.core.arrays.sparse import SparseAccessor
106107
from pandas.core.construction import (
@@ -5787,6 +5788,7 @@ def to_period(self, freq: str | None = None, copy: bool | None = None) -> Series
57875788
cat = CachedAccessor("cat", CategoricalAccessor)
57885789
plot = CachedAccessor("plot", pandas.plotting.PlotAccessor)
57895790
sparse = CachedAccessor("sparse", SparseAccessor)
5791+
struct = CachedAccessor("struct", StructAccessor)
57905792

57915793
# ----------------------------------------------------------------------
57925794
# Add plotting methods to Series
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,107 @@
1+
import re
2+
3+
import pytest
4+
5+
from pandas import (
6+
ArrowDtype,
7+
DataFrame,
8+
Index,
9+
Series,
10+
)
11+
import pandas._testing as tm
12+
13+
pa = pytest.importorskip("pyarrow")
14+
15+
16+
class TestStructAccessor:
    """Tests for the ``Series.struct`` accessor on Arrow struct-typed Series."""

    def test_struct_accessor_dtypes(self):
        """``dtypes`` reports one ArrowDtype per child, indexed by field name."""
        struct_dtype = ArrowDtype(
            pa.struct([("int_col", pa.int64()), ("string_col", pa.string())])
        )
        ser = Series([], dtype=struct_dtype)

        actual = ser.struct.dtypes

        expected = Series(
            [ArrowDtype(pa.int64()), ArrowDtype(pa.string())],
            index=Index(["int_col", "string_col"]),
        )
        tm.assert_series_equal(actual, expected)

    def test_struct_accessor_field(self):
        """``field`` extracts a child either by name or by position."""
        struct_dtype = ArrowDtype(
            pa.struct(
                [
                    ("rice", pa.float64()),
                    ("maize", pa.int64()),
                    ("wheat", pa.string()),
                ]
            )
        )
        rows = [
            {"rice": 1.0, "maize": -1, "wheat": "a"},
            {"rice": 2.0, "maize": 0, "wheat": "b"},
            {"rice": 3.0, "maize": 1, "wheat": "c"},
        ]
        ser = Series(rows, dtype=struct_dtype)

        # Lookup by field name.
        by_name = ser.struct.field("maize")
        by_name_expected = Series(
            [-1, 0, 1],
            dtype=ArrowDtype(pa.int64()),
            name="maize",
        )
        tm.assert_series_equal(by_name, by_name_expected)

        # Lookup by field position.
        by_index = ser.struct.field(2)
        by_index_expected = Series(
            ["a", "b", "c"],
            dtype=ArrowDtype(pa.string()),
            name="wheat",
        )
        tm.assert_series_equal(by_index, by_index_expected)

    def test_struct_accessor_to_frame(self):
        """``to_frame`` expands only one level; nested structs stay structs."""
        nested_type = pa.struct([("sea", pa.string())])
        outer_dtype = ArrowDtype(
            pa.struct(
                [
                    ("painted", pa.int64()),
                    ("snapping", nested_type),
                ]
            )
        )
        ser = Series(
            [
                {"painted": 1, "snapping": {"sea": "green"}},
                {"painted": 2, "snapping": {"sea": "leatherback"}},
                {"painted": 3, "snapping": {"sea": "hawksbill"}},
            ],
            dtype=outer_dtype,
        )

        actual = ser.struct.to_frame()

        painted_expected = Series([1, 2, 3], dtype=ArrowDtype(pa.int64()))
        snapping_expected = Series(
            [{"sea": "green"}, {"sea": "leatherback"}, {"sea": "hawksbill"}],
            dtype=ArrowDtype(nested_type),
        )
        expected = DataFrame(
            {
                "painted": painted_expected,
                "snapping": snapping_expected,
            }
        )
        tm.assert_frame_equal(actual, expected)

    @pytest.mark.parametrize(
        "invalid",
        [
            pytest.param(Series([1, 2, 3], dtype="int64"), id="int64"),
            pytest.param(
                Series(["a", "b", "c"], dtype="string[pyarrow]"), id="string-pyarrow"
            ),
        ],
    )
    def test_struct_accessor_api_for_invalid(self, invalid):
        """Accessing ``.struct`` on non-struct data raises AttributeError."""
        expected_msg = re.escape(
            "Can only use the '.struct' accessor with 'struct[pyarrow]' data."
        )

        with pytest.raises(AttributeError, match=expected_msg):
            # Attribute access alone triggers the accessor's validation.
            invalid.struct

0 commit comments

Comments
 (0)