Skip to content

Commit 3996cb3

Browse files
committed
ENH: add Series.struct accessor for ArrowDtype[struct]
Features: * Series.struct.dtypes -- see dtypes and field names * Series.struct.field(name_or_index) -- extract a field as a Series * Series.struct.to_frame() -- convert all fields into a DataFrame
1 parent 4b456e2 commit 3996cb3

File tree

6 files changed

+302
-4
lines changed

6 files changed

+302
-4
lines changed

doc/source/reference/series.rst

+23
Original file line numberDiff line numberDiff line change
@@ -525,6 +525,29 @@ Sparse-dtype specific methods and attributes are provided under the
525525
Series.sparse.from_coo
526526
Series.sparse.to_coo
527527

528+
529+
.. _api.series.struct:
530+
531+
Struct accessor
532+
~~~~~~~~~~~~~~~
533+
534+
Arrow struct-dtype specific methods and attributes are provided under the
535+
``Series.struct`` accessor.
536+
537+
.. autosummary::
538+
:toctree: api/
539+
:template: autosummary/accessor_attribute.rst
540+
541+
Series.struct.dtypes
542+
543+
.. autosummary::
544+
:toctree: api/
545+
:template: autosummary/accessor_method.rst
546+
547+
Series.struct.field
548+
Series.struct.to_frame
549+
550+
528551
.. _api.series.flags:
529552

530553
Flags

doc/source/whatsnew/v2.2.0.rst

+28-3
Original file line numberDiff line numberDiff line change
@@ -14,10 +14,35 @@ including other versions of pandas.
1414
Enhancements
1515
~~~~~~~~~~~~
1616

17-
.. _whatsnew_220.enhancements.enhancement1:
17+
.. _whatsnew_220.enhancements.struct_accessor:
1818

19-
enhancement1
20-
^^^^^^^^^^^^
19+
Series.struct accessor to work with PyArrow structured data
20+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
21+
22+
The ``Series.struct`` accessor provides attributes and methods for processing
23+
data with ``struct[pyarrow]`` dtype Series. For example,
24+
:meth:`Series.struct.to_frame` converts PyArrow structured data to a pandas
25+
DataFrame. (:issue:`54938`)
26+
27+
.. code-block:: ipython
28+
29+
In [1]: import pyarrow as pa
30+
...: struct_type = pa.struct([
31+
...: ("int_col", pa.int64()),
32+
...: ("string_col", pa.string()),
33+
...: ])
34+
...: struct_array = pa.array([
35+
...: {"int_col": 1, "string_col": "a"},
36+
...: {"int_col": 2, "string_col": "b"},
37+
...: {"int_col": 3, "string_col": "c"},
38+
...: ], type=struct_type)
39+
...: series = pd.Series(struct_array, dtype=pd.ArrowDtype(struct_type))
40+
In [2]: series.struct.to_frame()
41+
Out[2]:
42+
int_col string_col
43+
0 1 a
44+
1 2 b
45+
2 3 c
2146
2247
.. _whatsnew_220.enhancements.enhancement2:
2348

pandas/core/arrays/arrow/__init__.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
from pandas.core.arrays.arrow.accessors import StructAccessor
12
from pandas.core.arrays.arrow.array import ArrowExtensionArray
23

3-
__all__ = ["ArrowExtensionArray"]
4+
__all__ = ["ArrowExtensionArray", "StructAccessor"]

pandas/core/arrays/arrow/accessors.py

+140
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,140 @@
1+
"""Accessors for arrow-backed data."""
2+
3+
from __future__ import annotations
4+
5+
from typing import TYPE_CHECKING
6+
7+
from pandas.compat import pa_version_under7p0
8+
9+
if not pa_version_under7p0:
10+
import pyarrow as pa
11+
import pyarrow.compute as pc
12+
13+
from pandas.core.dtypes.dtypes import ArrowDtype
14+
15+
if TYPE_CHECKING:
16+
from pandas import (
17+
DataFrame,
18+
Series,
19+
)
20+
21+
22+
class StructAccessor:
23+
"""
24+
Accessor object for categorical properties of the Series values.
25+
26+
Parameters
27+
----------
28+
data : Series or CategoricalIndex
29+
"""
30+
31+
_validation_msg = "Can only use the '.struct' accessor with 'struct[pyarrow]' data."
32+
33+
def __init__(self, data=None) -> None:
34+
self._parent = data
35+
self._validate(data)
36+
37+
def _validate(self, data):
38+
dtype = data.dtype
39+
if not isinstance(dtype, ArrowDtype):
40+
raise AttributeError(self._validation_msg)
41+
42+
if not pa.types.is_struct(dtype.pyarrow_dtype):
43+
raise AttributeError(self._validation_msg)
44+
45+
@property
46+
def dtypes(self) -> Series:
47+
"""
48+
Return the dtype object of each child field of the struct.
49+
50+
Examples
51+
--------
52+
>>> import pyarrow as pa
53+
>>> s = pd.Series(
54+
... [{"i": 1, "j": "a"}, {"i": 2, "j": "b"}, {"i": 3, "j": "c"}],
55+
... dtype=pd.ArrowDtype(pa.struct([("i", pa.int64()), ("j", pa.string())]))
56+
... )
57+
>>> s.struct.dtypes
58+
i int64[pyarrow]
59+
j string[pyarrow]
60+
dtype: object
61+
"""
62+
from pandas import (
63+
Index,
64+
Series,
65+
)
66+
67+
pa_type = self._parent.dtype.pyarrow_dtype
68+
types = [ArrowDtype(pa_type[i].type) for i in range(pa_type.num_fields)]
69+
names = [pa_type[i].name for i in range(pa_type.num_fields)]
70+
return Series(types, index=Index(names))
71+
72+
def field(self, name_or_index: str | int) -> Series:
73+
"""
74+
Extract a child field of a struct as a Series.
75+
76+
Parameters
77+
----------
78+
name_or_index : str | int
79+
Name or index of the child field to extract.
80+
81+
Examples
82+
--------
83+
>>> import pyarrow as pa
84+
>>> s = pd.Series(
85+
... [{"i": 1, "j": "a"}, {"i": 2, "j": "b"}, {"i": 3, "j": "c"}],
86+
... dtype=pd.ArrowDtype(pa.struct([("i", pa.int64()), ("j", pa.string())]))
87+
... )
88+
89+
Extract by field name.
90+
91+
>>> s.struct.field("j")
92+
0 a
93+
1 b
94+
2 c
95+
Name: j, dtype: string[pyarrow]
96+
97+
Extract by field index.
98+
99+
>>> s.struct.field(0)
100+
0 1
101+
1 2
102+
2 3
103+
Name: i, dtype: int64[pyarrow]
104+
"""
105+
from pandas import Series
106+
107+
pa_arr = self._parent.array._pa_array
108+
if isinstance(name_or_index, int):
109+
index = name_or_index
110+
else:
111+
index = pa_arr.type.get_field_index(name_or_index)
112+
113+
pa_field = pa_arr.type[index]
114+
field_arr = pc.struct_field(pa_arr, [index])
115+
return Series(field_arr, dtype=ArrowDtype(field_arr.type), name=pa_field.name)
116+
117+
def to_frame(self) -> DataFrame:
118+
"""
119+
Extract all child fields of a struct as a DataFrame.
120+
121+
Examples
122+
--------
123+
>>> import pyarrow as pa
124+
>>> s = pd.Series(
125+
... [{"i": 1, "j": "a"}, {"i": 2, "j": "b"}, {"i": 3, "j": "c"}],
126+
... dtype=pd.ArrowDtype(pa.struct([("i", pa.int64()), ("j", pa.string())]))
127+
... )
128+
129+
>>> s.struct.to_frame()
130+
i j
131+
0 1 a
132+
1 2 b
133+
2 3 c
134+
"""
135+
from pandas import concat
136+
137+
pa_type = self._parent.dtype.pyarrow_dtype
138+
return concat(
139+
[self.field(i) for i in range(pa_type.num_fields)], axis="columns"
140+
)

pandas/core/series.py

+2
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,7 @@
101101
from pandas.core.accessor import CachedAccessor
102102
from pandas.core.apply import SeriesApply
103103
from pandas.core.arrays import ExtensionArray
104+
from pandas.core.arrays.arrow import StructAccessor
104105
from pandas.core.arrays.categorical import CategoricalAccessor
105106
from pandas.core.arrays.sparse import SparseAccessor
106107
from pandas.core.construction import (
@@ -5787,6 +5788,7 @@ def to_period(self, freq: str | None = None, copy: bool | None = None) -> Series
57875788
cat = CachedAccessor("cat", CategoricalAccessor)
57885789
plot = CachedAccessor("plot", pandas.plotting.PlotAccessor)
57895790
sparse = CachedAccessor("sparse", SparseAccessor)
5791+
struct = CachedAccessor("struct", StructAccessor)
57905792

57915793
# ----------------------------------------------------------------------
57925794
# Add plotting methods to Series
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,107 @@
1+
import re
2+
3+
import pytest
4+
5+
from pandas import (
6+
ArrowDtype,
7+
DataFrame,
8+
Index,
9+
Series,
10+
)
11+
import pandas._testing as tm
12+
13+
pa = pytest.importorskip("pyarrow")
14+
15+
16+
class TestStructAccessor:
    """Tests for the ``Series.struct`` accessor on Arrow struct data."""

    def test_struct_accessor_dtypes(self):
        # An empty Series is enough: dtypes are read from the struct type.
        ser = Series(
            [],
            dtype=ArrowDtype(
                pa.struct([("int_col", pa.int64()), ("string_col", pa.string())])
            ),
        )
        actual = ser.struct.dtypes
        expected = Series(
            [ArrowDtype(pa.int64()), ArrowDtype(pa.string())],
            index=Index(["int_col", "string_col"]),
        )
        tm.assert_series_equal(actual, expected)

    def test_struct_accessor_field(self):
        ser = Series(
            [
                {"rice": 1.0, "maize": -1, "wheat": "a"},
                {"rice": 2.0, "maize": 0, "wheat": "b"},
                {"rice": 3.0, "maize": 1, "wheat": "c"},
            ],
            dtype=ArrowDtype(
                pa.struct(
                    [
                        ("rice", pa.float64()),
                        ("maize", pa.int64()),
                        ("wheat", pa.string()),
                    ]
                )
            ),
        )
        # Lookup by field name.
        by_name = ser.struct.field("maize")
        by_name_expected = Series(
            [-1, 0, 1],
            dtype=ArrowDtype(pa.int64()),
            name="maize",
        )
        tm.assert_series_equal(by_name, by_name_expected)

        # Lookup by positional field index.
        by_index = ser.struct.field(2)
        by_index_expected = Series(
            ["a", "b", "c"],
            dtype=ArrowDtype(pa.string()),
            name="wheat",
        )
        tm.assert_series_equal(by_index, by_index_expected)

    def test_struct_accessor_to_frame(self):
        # The nested struct field is expected to stay a struct column.
        ser = Series(
            [
                {"painted": 1, "snapping": {"sea": "green"}},
                {"painted": 2, "snapping": {"sea": "leatherback"}},
                {"painted": 3, "snapping": {"sea": "hawksbill"}},
            ],
            dtype=ArrowDtype(
                pa.struct(
                    [
                        ("painted", pa.int64()),
                        ("snapping", pa.struct([("sea", pa.string())])),
                    ]
                )
            ),
        )
        actual = ser.struct.to_frame()
        expected = DataFrame(
            {
                "painted": Series([1, 2, 3], dtype=ArrowDtype(pa.int64())),
                "snapping": Series(
                    [{"sea": "green"}, {"sea": "leatherback"}, {"sea": "hawksbill"}],
                    dtype=ArrowDtype(pa.struct([("sea", pa.string())])),
                ),
            }
        )
        tm.assert_frame_equal(actual, expected)

    @pytest.mark.parametrize(
        "invalid",
        [
            pytest.param(Series([1, 2, 3], dtype="int64"), id="int64"),
            pytest.param(
                Series(["a", "b", "c"], dtype="string[pyarrow]"), id="string-pyarrow"
            ),
            # ArrowDtype that is not a struct: exercises the second
            # validation branch, which the two cases above never reach.
            pytest.param(
                Series([1, 2, 3], dtype=ArrowDtype(pa.int64())), id="int64-pyarrow"
            ),
        ],
    )
    def test_struct_accessor_api_for_invalid(self, invalid):
        msg = re.escape(
            "Can only use the '.struct' accessor with 'struct[pyarrow]' data."
        )

        with pytest.raises(AttributeError, match=msg):
            invalid.struct

0 commit comments

Comments
 (0)