Skip to content

Commit e3983d2

Browse files
BUG: fix parquet roundtrip for Interval dtype with datetime64[ns] subtype (pandas-dev#46034)
1 parent 9ac59dc commit e3983d2

File tree

4 files changed

+33
-7
lines changed

4 files changed

+33
-7
lines changed

doc/source/whatsnew/v1.5.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -364,6 +364,7 @@ I/O
364364
- Bug in :func:`read_csv` not respecting a specified converter to index columns in all cases (:issue:`40589`)
365365
- Bug in :func:`read_parquet` when ``engine="pyarrow"`` which caused partial write to disk when column of unsupported datatype was passed (:issue:`44914`)
366366
- Bug in :func:`DataFrame.to_excel` and :class:`ExcelWriter` would raise when writing an empty DataFrame to a ``.ods`` file (:issue:`45793`)
367+
- Bug in Parquet roundtrip for Interval dtype with ``datetime64[ns]`` subtype (:issue:`45881`)
367368

368369
Period
369370
^^^^^^

pandas/core/dtypes/dtypes.py

+6-4
Original file line numberDiff line numberDiff line change
@@ -1253,16 +1253,18 @@ def __from_arrow__(
12531253

12541254
results = []
12551255
for arr in chunks:
1256-
left = np.asarray(arr.storage.field("left"), dtype=self.subtype)
1257-
right = np.asarray(arr.storage.field("right"), dtype=self.subtype)
1258-
iarr = IntervalArray.from_arrays(left, right, closed=array.type.closed)
1256+
if isinstance(arr, pyarrow.ExtensionArray):
1257+
arr = arr.storage
1258+
left = np.asarray(arr.field("left"), dtype=self.subtype)
1259+
right = np.asarray(arr.field("right"), dtype=self.subtype)
1260+
iarr = IntervalArray.from_arrays(left, right, closed=self.closed)
12591261
results.append(iarr)
12601262

12611263
if not results:
12621264
return IntervalArray.from_arrays(
12631265
np.array([], dtype=self.subtype),
12641266
np.array([], dtype=self.subtype),
1265-
closed=array.type.closed,
1267+
closed=self.closed,
12661268
)
12671269
return IntervalArray._concat_same_type(results)
12681270

pandas/tests/arrays/interval/test_interval.py

+20
Original file line numberDiff line numberDiff line change
@@ -376,3 +376,23 @@ def test_arrow_table_roundtrip_without_metadata(breaks):
376376
result = table.to_pandas()
377377
assert isinstance(result["a"].dtype, pd.IntervalDtype)
378378
tm.assert_frame_equal(result, df)
379+
380+
381+
@pyarrow_skip
382+
def test_from_arrow_from_raw_struct_array():
383+
# in case pyarrow lost the Interval extension type (eg on parquet roundtrip
384+
# with datetime64[ns] subtype, see GH-45881), still allow conversion
385+
# from arrow to IntervalArray
386+
import pyarrow as pa
387+
388+
arr = pa.array([{"left": 0, "right": 1}, {"left": 1, "right": 2}])
389+
dtype = pd.IntervalDtype(np.dtype("int64"), closed="neither")
390+
391+
result = dtype.__from_arrow__(arr)
392+
expected = IntervalArray.from_breaks(
393+
np.array([0, 1, 2], dtype="int64"), closed="neither"
394+
)
395+
tm.assert_extension_array_equal(result, expected)
396+
397+
result = dtype.__from_arrow__(pa.chunked_array([arr]))
398+
tm.assert_extension_array_equal(result, expected)

pandas/tests/io/test_parquet.py

+6-3
Original file line numberDiff line numberDiff line change
@@ -928,15 +928,18 @@ def test_pyarrow_backed_string_array(self, pa, string_storage):
928928
with pd.option_context("string_storage", string_storage):
929929
check_round_trip(df, pa, expected=df.astype(f"string[{string_storage}]"))
930930

931-
@td.skip_if_no("pyarrow")
931+
@td.skip_if_no("pyarrow", min_version="2.0.0")
932932
def test_additional_extension_types(self, pa):
933933
# test additional ExtensionArrays that are supported through the
934934
# __arrow_array__ protocol + by defining a custom ExtensionType
935935
df = pd.DataFrame(
936936
{
937-
# Arrow does not yet support struct in writing to Parquet (ARROW-1644)
938-
# "c": pd.arrays.IntervalArray.from_tuples([(0, 1), (1, 2), (3, 4)]),
937+
"c": pd.IntervalIndex.from_tuples([(0, 1), (1, 2), (3, 4)]),
939938
"d": pd.period_range("2012-01-01", periods=3, freq="D"),
939+
# GH-45881 issue with interval with datetime64[ns] subtype
940+
"e": pd.IntervalIndex.from_breaks(
941+
pd.date_range("2012-01-01", periods=4, freq="D")
942+
),
940943
}
941944
)
942945
check_round_trip(df, pa)

0 commit comments

Comments (0)