Skip to content

Commit d3ca332

Browse files
lukemanley authored and im-vinicius committed
ENH/PERF: pyarrow timestamp & duration conversion consistency/performance (pandas-dev#53326)
* ENH/PERF: pyarrow timestamp & duration conversion consistency
* gh refs
* typo
* whatsnew
1 parent a3f219e commit d3ca332

File tree

4 files changed

+133
-7
lines changed

4 files changed

+133
-7
lines changed

doc/source/whatsnew/v2.1.0.rst

+2-1
Original file line numberDiff line numberDiff line change
@@ -298,9 +298,9 @@ Performance improvements
298298
- Performance improvement in :meth:`Series.corr` and :meth:`Series.cov` for extension dtypes (:issue:`52502`)
299299
- Performance improvement in :meth:`Series.str.get` for pyarrow-backed strings (:issue:`53152`)
300300
- Performance improvement in :meth:`Series.to_numpy` when dtype is a numpy float dtype and ``na_value`` is ``np.nan`` (:issue:`52430`)
301+
- Performance improvement in :meth:`~arrays.ArrowExtensionArray.astype` when converting from a pyarrow timestamp or duration dtype to numpy (:issue:`53326`)
301302
- Performance improvement in :meth:`~arrays.ArrowExtensionArray.to_numpy` (:issue:`52525`)
302303
- Performance improvement when doing various reshaping operations on :class:`arrays.IntegerArray` & :class:`arrays.FloatingArray` by avoiding unnecessary validation (:issue:`53013`)
303-
-
304304

305305
.. ---------------------------------------------------------------------------
306306
.. _whatsnew_210.bug_fixes:
@@ -449,6 +449,7 @@ ExtensionArray
449449
- Bug in :class:`~arrays.ArrowExtensionArray` converting pandas non-nanosecond temporal objects from non-zero values to zero values (:issue:`53171`)
450450
- Bug in :meth:`Series.quantile` for pyarrow temporal types raising ArrowInvalid (:issue:`52678`)
451451
- Bug in :meth:`Series.rank` returning wrong order for small values with ``Float64`` dtype (:issue:`52471`)
452+
- Bug in :meth:`~arrays.ArrowExtensionArray.__iter__` and :meth:`~arrays.ArrowExtensionArray.__getitem__` returning Python ``datetime`` and ``timedelta`` objects instead of :class:`Timestamp` and :class:`Timedelta` for non-nanosecond dtypes (:issue:`53326`)
452453
- Bug where the ``__from_arrow__`` method of masked ExtensionDtypes (e.g. :class:`Float64Dtype`, :class:`BooleanDtype`) would not accept pyarrow arrays of type ``pyarrow.null()`` (:issue:`52223`)
453454
-
454455

pandas/core/arrays/arrow/array.py

+51-6
Original file line numberDiff line numberDiff line change
@@ -533,9 +533,16 @@ def __getitem__(self, item: PositionalIndexer):
533533
if isinstance(value, pa.ChunkedArray):
534534
return type(self)(value)
535535
else:
536+
pa_type = self._pa_array.type
536537
scalar = value.as_py()
537538
if scalar is None:
538539
return self._dtype.na_value
540+
elif pa.types.is_timestamp(pa_type) and pa_type.unit != "ns":
541+
# GH 53326
542+
return Timestamp(scalar).as_unit(pa_type.unit)
543+
elif pa.types.is_duration(pa_type) and pa_type.unit != "ns":
544+
# GH 53326
545+
return Timedelta(scalar).as_unit(pa_type.unit)
539546
else:
540547
return scalar
541548

@@ -544,10 +551,18 @@ def __iter__(self) -> Iterator[Any]:
544551
Iterate over elements of the array.
545552
"""
546553
na_value = self._dtype.na_value
554+
# GH 53326
555+
pa_type = self._pa_array.type
556+
box_timestamp = pa.types.is_timestamp(pa_type) and pa_type.unit != "ns"
557+
box_timedelta = pa.types.is_duration(pa_type) and pa_type.unit != "ns"
547558
for value in self._pa_array:
548559
val = value.as_py()
549560
if val is None:
550561
yield na_value
562+
elif box_timestamp:
563+
yield Timestamp(val).as_unit(pa_type.unit)
564+
elif box_timedelta:
565+
yield Timedelta(val).as_unit(pa_type.unit)
551566
else:
552567
yield val
553568

@@ -1157,16 +1172,46 @@ def to_numpy(
11571172
copy: bool = False,
11581173
na_value: object = lib.no_default,
11591174
) -> np.ndarray:
1160-
if dtype is None and self._hasna:
1161-
dtype = object
1175+
if dtype is not None:
1176+
dtype = np.dtype(dtype)
1177+
elif self._hasna:
1178+
dtype = np.dtype(object)
1179+
11621180
if na_value is lib.no_default:
11631181
na_value = self.dtype.na_value
11641182

11651183
pa_type = self._pa_array.type
1166-
if pa.types.is_temporal(pa_type) and not pa.types.is_date(pa_type):
1167-
# temporal types with units and/or timezones currently
1168-
# require pandas/python scalars to pass all tests
1169-
# TODO: improve performance (this is slow)
1184+
if pa.types.is_timestamp(pa_type):
1185+
from pandas.core.arrays.datetimes import (
1186+
DatetimeArray,
1187+
tz_to_dtype,
1188+
)
1189+
1190+
np_dtype = np.dtype(f"M8[{pa_type.unit}]")
1191+
result = self._pa_array.to_numpy()
1192+
result = result.astype(np_dtype, copy=copy)
1193+
if dtype is None or dtype.kind == "O":
1194+
dta_dtype = tz_to_dtype(pa_type.tz, pa_type.unit)
1195+
result = DatetimeArray._simple_new(result, dtype=dta_dtype)
1196+
result = result.to_numpy(dtype=object, na_value=na_value)
1197+
elif result.dtype != dtype:
1198+
result = result.astype(dtype, copy=False)
1199+
return result
1200+
elif pa.types.is_duration(pa_type):
1201+
from pandas.core.arrays.timedeltas import TimedeltaArray
1202+
1203+
np_dtype = np.dtype(f"m8[{pa_type.unit}]")
1204+
result = self._pa_array.to_numpy()
1205+
result = result.astype(np_dtype, copy=copy)
1206+
if dtype is None or dtype.kind == "O":
1207+
result = TimedeltaArray._simple_new(result, dtype=np_dtype)
1208+
result = result.to_numpy(dtype=object, na_value=na_value)
1209+
elif result.dtype != dtype:
1210+
result = result.astype(dtype, copy=False)
1211+
return result
1212+
elif pa.types.is_time(pa_type):
1213+
# convert to list of python datetime.time objects before
1214+
# wrapping in ndarray
11701215
result = np.array(list(self), dtype=dtype)
11711216
elif is_object_dtype(dtype) and self._hasna:
11721217
result = np.empty(len(self), dtype=object)

pandas/core/arrays/datetimelike.py

+14
Original file line numberDiff line numberDiff line change
@@ -2204,6 +2204,20 @@ def ensure_arraylike_for_datetimelike(data, copy: bool, cls_name: str):
22042204
):
22052205
data = data.to_numpy("int64", na_value=iNaT)
22062206
copy = False
2207+
elif isinstance(data, ArrowExtensionArray) and data.dtype.kind == "M":
2208+
from pandas.core.arrays import DatetimeArray
2209+
from pandas.core.arrays.datetimes import tz_to_dtype
2210+
2211+
pa_type = data._pa_array.type
2212+
dtype = tz_to_dtype(tz=pa_type.tz, unit=pa_type.unit)
2213+
data = data.to_numpy(f"M8[{pa_type.unit}]", na_value=iNaT)
2214+
data = DatetimeArray._simple_new(data, dtype=dtype)
2215+
copy = False
2216+
elif isinstance(data, ArrowExtensionArray) and data.dtype.kind == "m":
2217+
pa_type = data._pa_array.type
2218+
dtype = np.dtype(f"m8[{pa_type.unit}]")
2219+
data = data.to_numpy(dtype, na_value=iNaT)
2220+
copy = False
22072221
elif not isinstance(data, (np.ndarray, ExtensionArray)) or isinstance(
22082222
data, ArrowExtensionArray
22092223
):

pandas/tests/extension/test_arrow.py

+66
Original file line numberDiff line numberDiff line change
@@ -3008,3 +3008,69 @@ def test_comparison_temporal(pa_type):
30083008
result = arr > val
30093009
expected = ArrowExtensionArray(pa.array([False, True, True], type=pa.bool_()))
30103010
tm.assert_extension_array_equal(result, expected)
3011+
3012+
3013+
@pytest.mark.parametrize(
3014+
"pa_type", tm.DATETIME_PYARROW_DTYPES + tm.TIMEDELTA_PYARROW_DTYPES
3015+
)
3016+
def test_getitem_temporal(pa_type):
3017+
# GH 53326
3018+
arr = ArrowExtensionArray(pa.array([1, 2, 3], type=pa_type))
3019+
result = arr[1]
3020+
if pa.types.is_duration(pa_type):
3021+
expected = pd.Timedelta(2, unit=pa_type.unit).as_unit(pa_type.unit)
3022+
assert isinstance(result, pd.Timedelta)
3023+
else:
3024+
expected = pd.Timestamp(2, unit=pa_type.unit, tz=pa_type.tz).as_unit(
3025+
pa_type.unit
3026+
)
3027+
assert isinstance(result, pd.Timestamp)
3028+
assert result.unit == expected.unit
3029+
assert result == expected
3030+
3031+
3032+
@pytest.mark.parametrize(
3033+
"pa_type", tm.DATETIME_PYARROW_DTYPES + tm.TIMEDELTA_PYARROW_DTYPES
3034+
)
3035+
def test_iter_temporal(pa_type):
3036+
# GH 53326
3037+
arr = ArrowExtensionArray(pa.array([1, None], type=pa_type))
3038+
result = list(arr)
3039+
if pa.types.is_duration(pa_type):
3040+
expected = [
3041+
pd.Timedelta(1, unit=pa_type.unit).as_unit(pa_type.unit),
3042+
pd.NA,
3043+
]
3044+
assert isinstance(result[0], pd.Timedelta)
3045+
else:
3046+
expected = [
3047+
pd.Timestamp(1, unit=pa_type.unit, tz=pa_type.tz).as_unit(pa_type.unit),
3048+
pd.NA,
3049+
]
3050+
assert isinstance(result[0], pd.Timestamp)
3051+
assert result[0].unit == expected[0].unit
3052+
assert result == expected
3053+
3054+
3055+
@pytest.mark.parametrize(
3056+
"pa_type", tm.DATETIME_PYARROW_DTYPES + tm.TIMEDELTA_PYARROW_DTYPES
3057+
)
3058+
def test_to_numpy_temporal(pa_type):
3059+
# GH 53326
3060+
arr = ArrowExtensionArray(pa.array([1, None], type=pa_type))
3061+
result = arr.to_numpy()
3062+
if pa.types.is_duration(pa_type):
3063+
expected = [
3064+
pd.Timedelta(1, unit=pa_type.unit).as_unit(pa_type.unit),
3065+
pd.NA,
3066+
]
3067+
assert isinstance(result[0], pd.Timedelta)
3068+
else:
3069+
expected = [
3070+
pd.Timestamp(1, unit=pa_type.unit, tz=pa_type.tz).as_unit(pa_type.unit),
3071+
pd.NA,
3072+
]
3073+
assert isinstance(result[0], pd.Timestamp)
3074+
expected = np.array(expected, dtype=object)
3075+
assert result[0].unit == expected[0].unit
3076+
tm.assert_numpy_array_equal(result, expected)

0 commit comments

Comments
 (0)