Skip to content

Commit 1fd8d43

Browse files
committed
ENH: add Series.struct accessor for ArrowDtype[struct]
Features: * Series.struct.dtypes -- see dtypes and field names * Series.struct.field(name_or_index) -- extract a field as a Series * Series.struct.to_frame() -- convert all fields into a DataFrame
1 parent 31d4d8b commit 1fd8d43

File tree

6 files changed

+335
-4
lines changed

6 files changed

+335
-4
lines changed

doc/source/reference/series.rst

+23
Original file line numberDiff line numberDiff line change
@@ -525,6 +525,29 @@ Sparse-dtype specific methods and attributes are provided under the
525525
Series.sparse.from_coo
526526
Series.sparse.to_coo
527527

528+
529+
.. _api.series.struct:
530+
531+
Struct accessor
532+
~~~~~~~~~~~~~~~
533+
534+
Arrow struct-dtype specific methods and attributes are provided under the
535+
``Series.struct`` accessor.
536+
537+
.. autosummary::
538+
:toctree: api/
539+
:template: autosummary/accessor_attribute.rst
540+
541+
Series.struct.dtypes
542+
543+
.. autosummary::
544+
:toctree: api/
545+
:template: autosummary/accessor_method.rst
546+
547+
Series.struct.field
548+
Series.struct.to_frame
549+
550+
528551
.. _api.series.flags:
529552

530553
Flags

doc/source/whatsnew/v2.2.0.rst

+28-3
Original file line numberDiff line numberDiff line change
@@ -14,10 +14,35 @@ including other versions of pandas.
1414
Enhancements
1515
~~~~~~~~~~~~
1616

17-
.. _whatsnew_220.enhancements.enhancement1:
17+
.. _whatsnew_220.enhancements.struct_accessor:
1818

19-
enhancement1
20-
^^^^^^^^^^^^
19+
Series.struct accessor for PyArrow structured data
20+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
21+
22+
The ``Series.struct`` accessor provides attributes and methods for processing
23+
Series with ``struct[pyarrow]`` dtype. For example,
24+
:meth:`Series.struct.to_frame` converts PyArrow structured data to a pandas
25+
DataFrame. (:issue:`54938`)
26+
27+
.. code-block:: ipython
28+
29+
In [1]: import pyarrow as pa
30+
...: struct_type = pa.struct([
31+
...: ("int_col", pa.int64()),
32+
...: ("string_col", pa.string()),
33+
...: ])
34+
...: struct_array = pa.array([
35+
...: {"int_col": 1, "string_col": "a"},
36+
...: {"int_col": 2, "string_col": "b"},
37+
...: {"int_col": 3, "string_col": "c"},
38+
...: ], type=struct_type)
39+
...: series = pd.Series(struct_array, dtype=pd.ArrowDtype(struct_type))
40+
In [2]: series.struct.to_frame()
41+
Out[2]:
42+
int_col string_col
43+
0 1 a
44+
1 2 b
45+
2 3 c
2146
2247
.. _whatsnew_220.enhancements.enhancement2:
2348

pandas/core/arrays/arrow/__init__.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
from pandas.core.arrays.arrow.accessors import StructAccessor
12
from pandas.core.arrays.arrow.array import ArrowExtensionArray
23

3-
__all__ = ["ArrowExtensionArray"]
4+
__all__ = ["ArrowExtensionArray", "StructAccessor"]

pandas/core/arrays/arrow/accessors.py

+173
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,173 @@
1+
"""Accessors for arrow-backed data."""
2+
3+
from __future__ import annotations
4+
5+
from typing import TYPE_CHECKING
6+
7+
from pandas.compat import pa_version_under7p0
8+
9+
if not pa_version_under7p0:
10+
import pyarrow as pa
11+
import pyarrow.compute as pc
12+
13+
from pandas.core.dtypes.dtypes import ArrowDtype
14+
15+
if TYPE_CHECKING:
16+
from pandas import (
17+
DataFrame,
18+
Series,
19+
)
20+
21+
22+
class StructAccessor:
    """
    Accessor object for structured data properties of the Series values.

    Parameters
    ----------
    data : Series
        Series with Arrow struct dtype.

    Raises
    ------
    AttributeError
        If ``data`` does not have an ``ArrowDtype`` wrapping a pyarrow
        struct type.
    """

    _validation_msg = "Can only use the '.struct' accessor with 'struct[pyarrow]' data."

    def __init__(self, data=None) -> None:
        self._parent = data
        self._validate(data)

    def _validate(self, data):
        """Raise ``AttributeError`` unless ``data`` has an Arrow struct dtype."""
        dtype = data.dtype
        if not isinstance(dtype, ArrowDtype):
            raise AttributeError(self._validation_msg)

        if not pa.types.is_struct(dtype.pyarrow_dtype):
            raise AttributeError(self._validation_msg)

    @property
    def dtypes(self) -> Series:
        """
        Return the dtype object of each child field of the struct.

        Returns
        -------
        pandas.Series
            The data type of each child field.

        Examples
        --------
        >>> import pyarrow as pa
        >>> s = pd.Series(
        ...     [
        ...         {"version": 1, "project": "pandas"},
        ...         {"version": 2, "project": "pandas"},
        ...         {"version": 1, "project": "numpy"},
        ...     ],
        ...     dtype=pd.ArrowDtype(pa.struct(
        ...         [("version", pa.int64()), ("project", pa.string())]
        ...     ))
        ... )
        >>> s.struct.dtypes
        version     int64[pyarrow]
        project    string[pyarrow]
        dtype: object
        """
        from pandas import (
            Index,
            Series,
        )

        pa_type = self._parent.dtype.pyarrow_dtype
        types = [ArrowDtype(pa_type[i].type) for i in range(pa_type.num_fields)]
        names = [pa_type[i].name for i in range(pa_type.num_fields)]
        return Series(types, index=Index(names))

    def field(self, name_or_index: str | int) -> Series:
        """
        Extract a child field of a struct as a Series.

        Parameters
        ----------
        name_or_index : str | int
            Name or index of the child field to extract.

        Returns
        -------
        pandas.Series
            The data corresponding to the selected child field.

        Raises
        ------
        pyarrow.ArrowInvalid
            If ``name_or_index`` is a name that does not identify exactly
            one child field of the struct.

        Examples
        --------
        >>> import pyarrow as pa
        >>> s = pd.Series(
        ...     [
        ...         {"version": 1, "project": "pandas"},
        ...         {"version": 2, "project": "pandas"},
        ...         {"version": 1, "project": "numpy"},
        ...     ],
        ...     dtype=pd.ArrowDtype(pa.struct(
        ...         [("version", pa.int64()), ("project", pa.string())]
        ...     ))
        ... )

        Extract by field name.

        >>> s.struct.field("project")
        0    pandas
        1    pandas
        2     numpy
        Name: project, dtype: string[pyarrow]

        Extract by field index.

        >>> s.struct.field(0)
        0    1
        1    2
        2    1
        Name: version, dtype: int64[pyarrow]
        """
        from pandas import Series

        pa_arr = self._parent.array._pa_array
        if isinstance(name_or_index, int):
            index = name_or_index
        else:
            index = pa_arr.type.get_field_index(name_or_index)
            if index < 0:
                # pyarrow documents -1 as the "not found or duplicated name"
                # sentinel. Passing it on would surface as a confusing
                # out-of-bounds error about field -1, so fail here with a
                # message that names the requested field instead.
                raise pa.ArrowInvalid(
                    f"No unique field named {name_or_index!r} in {pa_arr.type}"
                )

        pa_field = pa_arr.type[index]
        field_arr = pc.struct_field(pa_arr, [index])
        return Series(field_arr, dtype=ArrowDtype(field_arr.type), name=pa_field.name)

    def to_frame(self) -> DataFrame:
        """
        Extract all child fields of a struct as a DataFrame.

        Returns
        -------
        pandas.DataFrame
            The data corresponding to all child fields.

        Examples
        --------
        >>> import pyarrow as pa
        >>> s = pd.Series(
        ...     [
        ...         {"version": 1, "project": "pandas"},
        ...         {"version": 2, "project": "pandas"},
        ...         {"version": 1, "project": "numpy"},
        ...     ],
        ...     dtype=pd.ArrowDtype(pa.struct(
        ...         [("version", pa.int64()), ("project", pa.string())]
        ...     ))
        ... )

        >>> s.struct.to_frame()
           version project
        0        1  pandas
        1        2  pandas
        2        1   numpy
        """
        from pandas import concat

        pa_type = self._parent.dtype.pyarrow_dtype
        # One column per child field, in declaration order.
        return concat(
            [self.field(i) for i in range(pa_type.num_fields)], axis="columns"
        )

pandas/core/series.py

+2
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,7 @@
101101
from pandas.core.accessor import CachedAccessor
102102
from pandas.core.apply import SeriesApply
103103
from pandas.core.arrays import ExtensionArray
104+
from pandas.core.arrays.arrow import StructAccessor
104105
from pandas.core.arrays.categorical import CategoricalAccessor
105106
from pandas.core.arrays.sparse import SparseAccessor
106107
from pandas.core.construction import (
@@ -5787,6 +5788,7 @@ def to_period(self, freq: str | None = None, copy: bool | None = None) -> Series
57875788
cat = CachedAccessor("cat", CategoricalAccessor)
57885789
plot = CachedAccessor("plot", pandas.plotting.PlotAccessor)
57895790
sparse = CachedAccessor("sparse", SparseAccessor)
5791+
struct = CachedAccessor("struct", StructAccessor)
57905792

57915793
# ----------------------------------------------------------------------
57925794
# Add plotting methods to Series
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,107 @@
1+
import re
2+
3+
import pytest
4+
5+
from pandas import (
6+
ArrowDtype,
7+
DataFrame,
8+
Index,
9+
Series,
10+
)
11+
import pandas._testing as tm
12+
13+
pa = pytest.importorskip("pyarrow")
14+
15+
16+
class TestStructAccessor:
    """Tests for the ``Series.struct`` accessor on Arrow struct data."""

    def test_struct_accessor_dtypes(self):
        # An empty Series still carries the full struct type.
        struct_type = pa.struct(
            [("int_col", pa.int64()), ("string_col", pa.string())]
        )
        ser = Series([], dtype=ArrowDtype(struct_type))

        result = ser.struct.dtypes

        expected = Series(
            [ArrowDtype(pa.int64()), ArrowDtype(pa.string())],
            index=Index(["int_col", "string_col"]),
        )
        tm.assert_series_equal(result, expected)

    def test_struct_accessor_field(self):
        struct_type = pa.struct(
            [
                ("rice", pa.float64()),
                ("maize", pa.int64()),
                ("wheat", pa.string()),
            ]
        )
        rows = [
            {"rice": 1.0, "maize": -1, "wheat": "a"},
            {"rice": 2.0, "maize": 0, "wheat": "b"},
            {"rice": 3.0, "maize": 1, "wheat": "c"},
        ]
        ser = Series(rows, dtype=ArrowDtype(struct_type))

        # Lookup by field name.
        maize = ser.struct.field("maize")
        maize_expected = Series([-1, 0, 1], dtype=ArrowDtype(pa.int64()), name="maize")
        tm.assert_series_equal(maize, maize_expected)

        # Lookup by positional index.
        wheat = ser.struct.field(2)
        wheat_expected = Series(
            ["a", "b", "c"], dtype=ArrowDtype(pa.string()), name="wheat"
        )
        tm.assert_series_equal(wheat, wheat_expected)

    def test_struct_accessor_to_frame(self):
        # The nested struct column is expected to stay a struct column.
        inner_type = pa.struct([("sea", pa.string())])
        outer_type = pa.struct([("painted", pa.int64()), ("snapping", inner_type)])
        ser = Series(
            [
                {"painted": 1, "snapping": {"sea": "green"}},
                {"painted": 2, "snapping": {"sea": "leatherback"}},
                {"painted": 3, "snapping": {"sea": "hawksbill"}},
            ],
            dtype=ArrowDtype(outer_type),
        )

        result = ser.struct.to_frame()

        painted = Series([1, 2, 3], dtype=ArrowDtype(pa.int64()))
        snapping = Series(
            [{"sea": "green"}, {"sea": "leatherback"}, {"sea": "hawksbill"}],
            dtype=ArrowDtype(pa.struct([("sea", pa.string())])),
        )
        expected = DataFrame({"painted": painted, "snapping": snapping})
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize(
        "invalid",
        [
            pytest.param(Series([1, 2, 3], dtype="int64"), id="int64"),
            pytest.param(
                Series(["a", "b", "c"], dtype="string[pyarrow]"), id="string-pyarrow"
            ),
        ],
    )
    def test_struct_accessor_api_for_invalid(self, invalid):
        # Attribute access alone must raise for non-struct data.
        expected_msg = re.escape(
            "Can only use the '.struct' accessor with 'struct[pyarrow]' data."
        )
        with pytest.raises(AttributeError, match=expected_msg):
            invalid.struct

0 commit comments

Comments
 (0)