
Commit 2198f51

ENH: add and register Arrow extension types for Period and Interval (#28371)
1 parent d7d96b1 · commit 2198f51

File tree

11 files changed: +469 -23 lines


doc/source/user_guide/io.rst

+3 -3

@@ -4652,10 +4652,10 @@ Several caveats.
 * Index level names, if specified, must be strings.
 * In the ``pyarrow`` engine, categorical dtypes for non-string types can be serialized to parquet, but will de-serialize as their primitive dtype.
 * The ``pyarrow`` engine preserves the ``ordered`` flag of categorical dtypes with string types. ``fastparquet`` does not preserve the ``ordered`` flag.
-* Non supported types include ``Period`` and actual Python object types. These will raise a helpful error message
-  on an attempt at serialization.
+* Non supported types include ``Interval`` and actual Python object types. These will raise a helpful error message
+  on an attempt at serialization. ``Period`` type is supported with pyarrow >= 0.16.0.
 * The ``pyarrow`` engine preserves extension data types such as the nullable integer and string data
-  type (requiring pyarrow >= 1.0.0, and requiring the extension type to implement the needed protocols,
+  type (requiring pyarrow >= 0.16.0, and requiring the extension type to implement the needed protocols,
   see the :ref:`extension types documentation <extending.extension.arrow>`).

 You can specify an ``engine`` to direct the serialization. This can be one of ``pyarrow``, or ``fastparquet``, or ``auto``.
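
For context, a minimal roundtrip sketch of the behaviour described in the updated caveat (assumes pandas with this change and pyarrow >= 0.16.0 installed; the file name is arbitrary):

import pandas as pd

df = pd.DataFrame({"p": pd.period_range("2020-01", periods=3, freq="M")})
df.to_parquet("periods.parquet", engine="pyarrow")
result = pd.read_parquet("periods.parquet", engine="pyarrow")
print(result.dtypes)  # "p" should come back as period[M] rather than int64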

doc/source/whatsnew/v1.0.0.rst

+2 -3

@@ -204,9 +204,9 @@ Other enhancements
 - Added ``encoding`` argument to :func:`DataFrame.to_html` for non-ascii text (:issue:`28663`)
 - :meth:`Styler.background_gradient` now accepts ``vmin`` and ``vmax`` arguments (:issue:`12145`)
 - :meth:`Styler.format` added the ``na_rep`` parameter to help format the missing values (:issue:`21527`, :issue:`28358`)
-- Roundtripping DataFrames with nullable integer or string data types to parquet
+- Roundtripping DataFrames with nullable integer, string and period data types to parquet
   (:meth:`~DataFrame.to_parquet` / :func:`read_parquet`) using the `'pyarrow'` engine
-  now preserve those data types with pyarrow >= 1.0.0 (:issue:`20612`).
+  now preserve those data types with pyarrow >= 0.16.0 (:issue:`20612`, :issue:`28371`).
 - The ``partition_cols`` argument in :meth:`DataFrame.to_parquet` now accepts a string (:issue:`27117`)
 - :func:`pandas.read_json` now parses ``NaN``, ``Infinity`` and ``-Infinity`` (:issue:`12213`)
 - The ``pandas.np`` submodule is now deprecated. Import numpy directly instead (:issue:`30296`)

@@ -221,7 +221,6 @@ Other enhancements
 - Added an experimental :attr:`~DataFrame.attrs` for storing global metadata about a dataset (:issue:`29062`)
 - :meth:`Timestamp.fromisocalendar` is now compatible with python 3.8 and above (:issue:`28115`)

-
 Build Changes
 ^^^^^^^^^^^^^

pandas/core/arrays/_arrow_utils.py

+124

@@ -0,0 +1,124 @@
+from distutils.version import LooseVersion
+import json
+
+import numpy as np
+import pyarrow
+
+from pandas.core.arrays.interval import _VALID_CLOSED
+
+_pyarrow_version_ge_015 = LooseVersion(pyarrow.__version__) >= LooseVersion("0.15")
+
+
+def pyarrow_array_to_numpy_and_mask(arr, dtype):
+    """
+    Convert a primitive pyarrow.Array to a numpy array and boolean mask based
+    on the buffers of the Array.
+
+    Parameters
+    ----------
+    arr : pyarrow.Array
+    dtype : numpy.dtype
+
+    Returns
+    -------
+    (data, mask)
+        Tuple of two numpy arrays with the raw data (with specified dtype) and
+        a boolean mask (validity mask, so False means missing)
+    """
+    buflist = arr.buffers()
+    data = np.frombuffer(buflist[1], dtype=dtype)[arr.offset : arr.offset + len(arr)]
+    bitmask = buflist[0]
+    if bitmask is not None:
+        mask = pyarrow.BooleanArray.from_buffers(
+            pyarrow.bool_(), len(arr), [None, bitmask]
+        )
+        mask = np.asarray(mask)
+    else:
+        mask = np.ones(len(arr), dtype=bool)
+    return data, mask
+
+
+if _pyarrow_version_ge_015:
+    # the pyarrow extension types are only available for pyarrow 0.15+
+
+    class ArrowPeriodType(pyarrow.ExtensionType):
+        def __init__(self, freq):
+            # attributes need to be set first before calling
+            # super init (as that calls serialize)
+            self._freq = freq
+            pyarrow.ExtensionType.__init__(self, pyarrow.int64(), "pandas.period")
+
+        @property
+        def freq(self):
+            return self._freq
+
+        def __arrow_ext_serialize__(self):
+            metadata = {"freq": self.freq}
+            return json.dumps(metadata).encode()
+
+        @classmethod
+        def __arrow_ext_deserialize__(cls, storage_type, serialized):
+            metadata = json.loads(serialized.decode())
+            return ArrowPeriodType(metadata["freq"])
+
+        def __eq__(self, other):
+            if isinstance(other, pyarrow.BaseExtensionType):
+                return type(self) == type(other) and self.freq == other.freq
+            else:
+                return NotImplemented
+
+        def __hash__(self):
+            return hash((str(self), self.freq))
+
+    # register the type with a dummy instance
+    _period_type = ArrowPeriodType("D")
+    pyarrow.register_extension_type(_period_type)
+
+    class ArrowIntervalType(pyarrow.ExtensionType):
+        def __init__(self, subtype, closed):
+            # attributes need to be set first before calling
+            # super init (as that calls serialize)
+            assert closed in _VALID_CLOSED
+            self._closed = closed
+            if not isinstance(subtype, pyarrow.DataType):
+                subtype = pyarrow.type_for_alias(str(subtype))
+            self._subtype = subtype
+
+            storage_type = pyarrow.struct([("left", subtype), ("right", subtype)])
+            pyarrow.ExtensionType.__init__(self, storage_type, "pandas.interval")
+
+        @property
+        def subtype(self):
+            return self._subtype
+
+        @property
+        def closed(self):
+            return self._closed
+
+        def __arrow_ext_serialize__(self):
+            metadata = {"subtype": str(self.subtype), "closed": self.closed}
+            return json.dumps(metadata).encode()
+
+        @classmethod
+        def __arrow_ext_deserialize__(cls, storage_type, serialized):
+            metadata = json.loads(serialized.decode())
+            subtype = pyarrow.type_for_alias(metadata["subtype"])
+            closed = metadata["closed"]
+            return ArrowIntervalType(subtype, closed)
+
+        def __eq__(self, other):
+            if isinstance(other, pyarrow.BaseExtensionType):
+                return (
+                    type(self) == type(other)
+                    and self.subtype == other.subtype
+                    and self.closed == other.closed
+                )
+            else:
+                return NotImplemented
+
+        def __hash__(self):
+            return hash((str(self), str(self.subtype), self.closed))
+
+    # register the type with a dummy instance
+    _interval_type = ArrowIntervalType(pyarrow.int64(), "left")
+    pyarrow.register_extension_type(_interval_type)
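
For illustration (not part of the commit), the new extension types can be exercised directly; this sketch assumes pyarrow >= 0.15 so that the ExtensionType API and the registrations above are available:

import pyarrow as pa
from pandas.core.arrays._arrow_utils import ArrowIntervalType, ArrowPeriodType

period_type = ArrowPeriodType("M")
print(period_type.freq)          # "M"
print(period_type.storage_type)  # int64 -- period ordinals are stored as int64

interval_type = ArrowIntervalType(pa.int64(), "right")
print(interval_type.storage_type)  # struct<left: int64, right: int64>

# The freq metadata survives the extension serialize/deserialize protocol.
restored = ArrowPeriodType.__arrow_ext_deserialize__(
    pa.int64(), period_type.__arrow_ext_serialize__()
)
assert restored == period_type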

pandas/core/arrays/integer.py

+2 -12

@@ -90,6 +90,7 @@ def construct_array_type(cls):
     def __from_arrow__(self, array):
         """Construct IntegerArray from passed pyarrow Array/ChunkedArray"""
         import pyarrow
+        from pandas.core.arrays._arrow_utils import pyarrow_array_to_numpy_and_mask

         if isinstance(array, pyarrow.Array):
             chunks = [array]
@@ -99,18 +100,7 @@ def __from_arrow__(self, array):

         results = []
         for arr in chunks:
-            buflist = arr.buffers()
-            data = np.frombuffer(buflist[1], dtype=self.type)[
-                arr.offset : arr.offset + len(arr)
-            ]
-            bitmask = buflist[0]
-            if bitmask is not None:
-                mask = pyarrow.BooleanArray.from_buffers(
-                    pyarrow.bool_(), len(arr), [None, bitmask]
-                )
-                mask = np.asarray(mask)
-            else:
-                mask = np.ones(len(arr), dtype=bool)
+            data, mask = pyarrow_array_to_numpy_and_mask(arr, dtype=self.type)
             int_arr = IntegerArray(data.copy(), ~mask, copy=False)
             results.append(int_arr)
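
A quick sketch of what the shared helper returns for a nullable integer column (illustrative values only; assumes pyarrow is installed):

import numpy as np
import pyarrow as pa
from pandas.core.arrays._arrow_utils import pyarrow_array_to_numpy_and_mask

arr = pa.array([1, 2, None, 4], type=pa.int64())
data, mask = pyarrow_array_to_numpy_and_mask(arr, dtype=np.dtype("int64"))
print(mask)  # [ True  True False  True] -- False marks the missing slot
print(data)  # raw int64 values; the slot behind the null is unspecified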

pandas/core/arrays/interval.py

+53

@@ -1081,6 +1081,59 @@ def __array__(self, dtype=None):
             result[i] = Interval(left[i], right[i], closed)
         return result

+    def __arrow_array__(self, type=None):
+        """
+        Convert myself into a pyarrow Array.
+        """
+        import pyarrow
+        from pandas.core.arrays._arrow_utils import ArrowIntervalType
+
+        try:
+            subtype = pyarrow.from_numpy_dtype(self.dtype.subtype)
+        except TypeError:
+            raise TypeError(
+                "Conversion to arrow with subtype '{}' "
+                "is not supported".format(self.dtype.subtype)
+            )
+        interval_type = ArrowIntervalType(subtype, self.closed)
+        storage_array = pyarrow.StructArray.from_arrays(
+            [
+                pyarrow.array(self.left, type=subtype, from_pandas=True),
+                pyarrow.array(self.right, type=subtype, from_pandas=True),
+            ],
+            names=["left", "right"],
+        )
+        mask = self.isna()
+        if mask.any():
+            # if there are missing values, set validity bitmap also on the array level
+            null_bitmap = pyarrow.array(~mask).buffers()[1]
+            storage_array = pyarrow.StructArray.from_buffers(
+                storage_array.type,
+                len(storage_array),
+                [null_bitmap],
+                children=[storage_array.field(0), storage_array.field(1)],
+            )
+
+        if type is not None:
+            if type.equals(interval_type.storage_type):
+                return storage_array
+            elif isinstance(type, ArrowIntervalType):
+                # ensure we have the same subtype and closed attributes
+                if not type.equals(interval_type):
+                    raise TypeError(
+                        "Not supported to convert IntervalArray to type with "
+                        "different 'subtype' ({0} vs {1}) and 'closed' ({2} vs {3}) "
+                        "attributes".format(
+                            self.dtype.subtype, type.subtype, self.closed, type.closed
+                        )
+                    )
+            else:
+                raise TypeError(
+                    "Not supported to convert IntervalArray to '{0}' type".format(type)
+                )
+
+        return pyarrow.ExtensionArray.from_storage(interval_type, storage_array)
+
     _interval_shared_docs[
         "to_tuples"
     ] = """
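
To see the protocol in action, a sketch assuming pyarrow >= 0.15 (pyarrow.array() detects __arrow_array__ and dispatches to it):

import pandas as pd
import pyarrow as pa

intervals = pd.arrays.IntervalArray.from_breaks([0, 1, 2, 3], closed="right")
arrow_arr = pa.array(intervals)
print(arrow_arr.type)     # the "pandas.interval" extension type defined in _arrow_utils
print(arrow_arr.storage)  # struct array with "left" and "right" children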

pandas/core/arrays/period.py

+26

@@ -283,6 +283,32 @@ def __array__(self, dtype=None):
         # overriding DatetimelikeArray
         return np.array(list(self), dtype=object)

+    def __arrow_array__(self, type=None):
+        """
+        Convert myself into a pyarrow Array.
+        """
+        import pyarrow
+        from pandas.core.arrays._arrow_utils import ArrowPeriodType
+
+        if type is not None:
+            if pyarrow.types.is_integer(type):
+                return pyarrow.array(self._data, mask=self.isna(), type=type)
+            elif isinstance(type, ArrowPeriodType):
+                # ensure we have the same freq
+                if self.freqstr != type.freq:
+                    raise TypeError(
+                        "Not supported to convert PeriodArray to array with different"
+                        " 'freq' ({0} vs {1})".format(self.freqstr, type.freq)
+                    )
+            else:
+                raise TypeError(
+                    "Not supported to convert PeriodArray to '{0}' type".format(type)
+                )
+
+        period_type = ArrowPeriodType(self.freqstr)
+        storage_array = pyarrow.array(self._data, mask=self.isna(), type="int64")
+        return pyarrow.ExtensionArray.from_storage(period_type, storage_array)
+
     # --------------------------------------------------------------------
     # Vectorized analogues of Period properties
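
The same protocol for periods, as a sketch under the same pyarrow >= 0.15 assumption:

import pandas as pd
import pyarrow as pa

periods = pd.period_range("2020-01-01", periods=3, freq="D").array
arrow_arr = pa.array(periods)
print(arrow_arr.type)     # the "pandas.period" extension type with freq "D"
print(arrow_arr.storage)  # the underlying int64 ordinals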

pandas/core/dtypes/dtypes.py

+39

@@ -949,6 +949,26 @@ def construct_array_type(cls):

         return PeriodArray

+    def __from_arrow__(self, array):
+        """Construct PeriodArray from pyarrow Array/ChunkedArray."""
+        import pyarrow
+        from pandas.core.arrays import PeriodArray
+        from pandas.core.arrays._arrow_utils import pyarrow_array_to_numpy_and_mask
+
+        if isinstance(array, pyarrow.Array):
+            chunks = [array]
+        else:
+            chunks = array.chunks
+
+        results = []
+        for arr in chunks:
+            data, mask = pyarrow_array_to_numpy_and_mask(arr, dtype="int64")
+            parr = PeriodArray(data.copy(), freq=self.freq, copy=False)
+            parr[~mask] = NaT
+            results.append(parr)
+
+        return PeriodArray._concat_same_type(results)
+

 @register_extension_dtype
 class IntervalDtype(PandasExtensionDtype):
@@ -1120,3 +1140,22 @@ def is_dtype(cls, dtype) -> bool:
             else:
                 return False
         return super().is_dtype(dtype)
+
+    def __from_arrow__(self, array):
+        """Construct IntervalArray from pyarrow Array/ChunkedArray."""
+        import pyarrow
+        from pandas.core.arrays import IntervalArray
+
+        if isinstance(array, pyarrow.Array):
+            chunks = [array]
+        else:
+            chunks = array.chunks
+
+        results = []
+        for arr in chunks:
+            left = np.asarray(arr.storage.field("left"), dtype=self.subtype)
+            right = np.asarray(arr.storage.field("right"), dtype=self.subtype)
+            iarr = IntervalArray.from_arrays(left, right, closed=array.type.closed)
+            results.append(iarr)
+
+        return IntervalArray._concat_same_type(results)
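
The reverse direction can be sketched by feeding Arrow storage back into the dtype's __from_arrow__ hook (illustrative only; in practice pyarrow calls this hook itself when converting a Table that carries pandas metadata, e.g. during read_parquet):

import pandas as pd
import pyarrow as pa

periods = pd.period_range("2020-01-01", periods=3, freq="D").array
storage = pa.array(periods).storage               # plain int64 ordinals
roundtripped = pd.PeriodDtype("D").__from_arrow__(storage)
print(roundtripped)                               # PeriodArray with freq "D"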

pandas/io/parquet.py

+3

@@ -76,6 +76,9 @@ def __init__(self):
         )
         import pyarrow.parquet

+        # import utils to register the pyarrow extension types
+        import pandas.core.arrays._arrow_utils  # noqa
+
         self.api = pyarrow

     def write(
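
A short sketch of the side effect this import provides (illustrative): importing the module registers the "pandas.period" and "pandas.interval" extension types with pyarrow, so their metadata can be reattached to the plain storage when data comes back from parquet:

import pyarrow as pa
# Importing anything from _arrow_utils registers the extension types as a side effect.
from pandas.core.arrays._arrow_utils import ArrowPeriodType

storage = pa.array([0, 1, 2], type=pa.int64())
ext_arr = pa.ExtensionArray.from_storage(ArrowPeriodType("D"), storage)
print(ext_arr.type)  # "pandas.period" extension type rather than plain int64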
