Skip to content

Commit 3732cc4

Browse files
Backport PR pandas-dev#56650 on branch 2.2.x (ENH: Implement dt methods for pyarrow duration types) (pandas-dev#56656)
Backport PR pandas-dev#56650: ENH: Implement dt methods for pyarrow duration types Co-authored-by: Matthew Roeschke <[email protected]>
1 parent f8e9892 commit 3732cc4

File tree

4 files changed

+231
-1
lines changed

4 files changed

+231
-1
lines changed

doc/source/whatsnew/v2.2.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -316,6 +316,7 @@ Other enhancements
316316
- :meth:`Series.ffill`, :meth:`Series.bfill`, :meth:`DataFrame.ffill`, and :meth:`DataFrame.bfill` have gained the argument ``limit_area`` (:issue:`56492`)
317317
- Allow passing ``read_only``, ``data_only`` and ``keep_links`` arguments to openpyxl using ``engine_kwargs`` of :func:`read_excel` (:issue:`55027`)
318318
- Implement masked algorithms for :meth:`Series.value_counts` (:issue:`54984`)
319+
- Implemented :meth:`Series.dt` methods and attributes for :class:`ArrowDtype` with ``pyarrow.duration`` type (:issue:`52284`)
319320
- Implemented :meth:`Series.str.extract` for :class:`ArrowDtype` (:issue:`56268`)
320321
- Improved error message that appears in :meth:`DatetimeIndex.to_period` with frequencies which are not supported as period frequencies, such as ``"BMS"`` (:issue:`56243`)
321322
- Improved error message when constructing :class:`Period` with invalid offsets such as ``"QS"`` (:issue:`55785`)

pandas/core/arrays/arrow/array.py

+87
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717

1818
from pandas._libs import lib
1919
from pandas._libs.tslibs import (
20+
NaT,
2021
Timedelta,
2122
Timestamp,
2223
timezones,
@@ -2498,6 +2499,92 @@ def _str_wrap(self, width: int, **kwargs):
24982499
result = self._apply_elementwise(predicate)
24992500
return type(self)(pa.chunked_array(result))
25002501

2502+
@property
def _dt_days(self):
    """Return the ``days`` attribute of the equivalent TimedeltaArray as int32."""
    day_values = self._to_timedeltaarray().days
    # from_pandas=True: convert using pandas' missing-value semantics
    pa_days = pa.array(day_values, type=pa.int32(), from_pandas=True)
    return type(self)(pa_days)
2507+
2508+
@property
def _dt_hours(self):
    """Return the ``components.hours`` of each element as int32 (null for NaT)."""
    # Built element by element from the TimedeltaArray; NaT entries map to None
    hour_values = [
        None if td is NaT else td.components.hours
        for td in self._to_timedeltaarray()
    ]
    return type(self)(pa.array(hour_values, type=pa.int32()))
2519+
2520+
@property
def _dt_minutes(self):
    """Return the ``components.minutes`` of each element as int32 (null for NaT)."""
    # Built element by element from the TimedeltaArray; NaT entries map to None
    minute_values = [
        None if td is NaT else td.components.minutes
        for td in self._to_timedeltaarray()
    ]
    return type(self)(pa.array(minute_values, type=pa.int32()))
2531+
2532+
@property
def _dt_seconds(self):
    """Return the ``seconds`` attribute of the equivalent TimedeltaArray as int32."""
    second_values = self._to_timedeltaarray().seconds
    # from_pandas=True: convert using pandas' missing-value semantics
    pa_seconds = pa.array(second_values, type=pa.int32(), from_pandas=True)
    return type(self)(pa_seconds)
2539+
2540+
@property
def _dt_milliseconds(self):
    """Return ``components.milliseconds`` of each element as int32 (null for NaT)."""
    # Built element by element from the TimedeltaArray; NaT entries map to None
    millisecond_values = [
        None if td is NaT else td.components.milliseconds
        for td in self._to_timedeltaarray()
    ]
    return type(self)(pa.array(millisecond_values, type=pa.int32()))
2551+
2552+
@property
def _dt_microseconds(self):
    """Return the ``microseconds`` attribute of the TimedeltaArray as int32."""
    microsecond_values = self._to_timedeltaarray().microseconds
    # from_pandas=True: convert using pandas' missing-value semantics
    pa_microseconds = pa.array(
        microsecond_values, type=pa.int32(), from_pandas=True
    )
    return type(self)(pa_microseconds)
2561+
2562+
@property
def _dt_nanoseconds(self):
    """Return the ``nanoseconds`` attribute of the TimedeltaArray as int32."""
    nanosecond_values = self._to_timedeltaarray().nanoseconds
    # from_pandas=True: convert using pandas' missing-value semantics
    pa_nanoseconds = pa.array(nanosecond_values, type=pa.int32(), from_pandas=True)
    return type(self)(pa_nanoseconds)
2569+
2570+
def _dt_to_pytimedelta(self):
    """Return the elements as an object-dtype ndarray built from ``to_pylist()``."""
    values = self._pa_array.to_pylist()
    if self._dtype.pyarrow_dtype.unit == "ns":
        # For the "ns" unit each non-null element is converted through its own
        # ``to_pytimedelta()``; nulls stay as None.
        values = [ts.to_pytimedelta() if ts is not None else None for ts in values]
    return np.array(values, dtype=object)
2575+
2576+
def _dt_total_seconds(self):
    """Return ``TimedeltaArray.total_seconds()`` re-wrapped as a pyarrow-backed array."""
    totals = self._to_timedeltaarray().total_seconds()
    # No explicit pyarrow type is passed here; pyarrow infers it from ``totals``
    return type(self)(pa.array(totals, from_pandas=True))
2580+
2581+
def _dt_as_unit(self, unit: str):
    """
    Convert to the given resolution ``unit`` via the pandas datetime-like array.

    Raises
    ------
    NotImplementedError
        If the underlying pyarrow type is a date type.
    """
    pa_type = self.dtype.pyarrow_dtype
    if pa.types.is_date(pa_type):
        raise NotImplementedError("as_unit not implemented for date types")
    # Go through the pandas array instead of casting _pa_array directly so
    # that pandas' unit conversion rules are followed.
    converted = self._maybe_convert_datelike_array().as_unit(unit)
    return type(self)(pa.array(converted, from_pandas=True))
2587+
25012588
@property
def _dt_year(self):
    """Apply ``pc.year`` to the underlying pyarrow array and re-wrap the result."""
    years = pc.year(self._pa_array)
    return type(self)(years)

pandas/core/indexes/accessors.py

+38-1
Original file line numberDiff line numberDiff line change
@@ -148,6 +148,20 @@ def _delegate_method(self, name: str, *args, **kwargs):
148148
return result
149149

150150

151+
@delegate_names(
152+
delegate=ArrowExtensionArray,
153+
accessors=TimedeltaArray._datetimelike_ops,
154+
typ="property",
155+
accessor_mapping=lambda x: f"_dt_{x}",
156+
raise_on_missing=False,
157+
)
158+
@delegate_names(
159+
delegate=ArrowExtensionArray,
160+
accessors=TimedeltaArray._datetimelike_methods,
161+
typ="method",
162+
accessor_mapping=lambda x: f"_dt_{x}",
163+
raise_on_missing=False,
164+
)
151165
@delegate_names(
152166
delegate=ArrowExtensionArray,
153167
accessors=DatetimeArray._datetimelike_ops,
@@ -213,6 +227,9 @@ def _delegate_method(self, name: str, *args, **kwargs):
213227

214228
return result
215229

230+
def to_pytimedelta(self):
    """Delegate to ``_dt_to_pytimedelta`` on the parent Series' backing array."""
    arrow_values = cast(ArrowExtensionArray, self._parent.array)
    return arrow_values._dt_to_pytimedelta()
232+
216233
def to_pydatetime(self):
217234
# GH#20306
218235
warnings.warn(
@@ -241,6 +258,26 @@ def isocalendar(self) -> DataFrame:
241258
)
242259
return iso_calendar_df
243260

261+
@property
def components(self) -> DataFrame:
    """
    Return a DataFrame of the per-unit components of each duration.

    The columns are ``days``, ``hours``, ``minutes``, ``seconds``,
    ``milliseconds``, ``microseconds`` and ``nanoseconds``, each with dtype
    ``int32[pyarrow]``; missing durations produce a row of nulls.

    Returns
    -------
    DataFrame
    """
    arrow_values = cast(ArrowExtensionArray, self._parent.array)
    # Take every column from TimedeltaArray.components rather than from the
    # individual ``_dt_<unit>`` accessors: ``_dt_seconds`` and
    # ``_dt_microseconds`` wrap ``TimedeltaArray.seconds`` / ``.microseconds``,
    # which report the whole within-day / within-second remainder (e.g. 3602
    # for 1h 0m 2s) and therefore do not agree with the hours / minutes /
    # milliseconds columns of a component breakdown.
    td_components = arrow_values._to_timedeltaarray().components
    # Column-wise cast; NaN rows (from NaT) become pyarrow nulls.
    return td_components.astype("int32[pyarrow]")
280+
244281

245282
@delegate_names(
246283
delegate=DatetimeArray,
@@ -592,7 +629,7 @@ def __new__(cls, data: Series): # pyright: ignore[reportInconsistentConstructor
592629
index=orig.index,
593630
)
594631

595-
if isinstance(data.dtype, ArrowDtype) and data.dtype.kind == "M":
632+
if isinstance(data.dtype, ArrowDtype) and data.dtype.kind in "Mm":
596633
return ArrowTemporalProperties(data, orig)
597634
if lib.is_np_dtype(data.dtype, "M"):
598635
return DatetimeProperties(data, orig)

pandas/tests/extension/test_arrow.py

+105
Original file line numberDiff line numberDiff line change
@@ -2723,6 +2723,111 @@ def test_dt_tz_convert(unit):
27232723
tm.assert_series_equal(result, expected)
27242724

27252725

2726+
@pytest.mark.parametrize("dtype", ["timestamp[ms][pyarrow]", "duration[ms][pyarrow]"])
def test_as_unit(dtype):
    # GH 52284
    # dt.as_unit("ns") is compared against a plain astype to the "ns" dtype
    ms_series = pd.Series([1000, None], dtype=dtype)
    ns_dtype = dtype.replace("ms", "ns")
    tm.assert_series_equal(ms_series.dt.as_unit("ns"), ms_series.astype(ns_dtype))
2733+
2734+
2735+
@pytest.mark.parametrize(
    "prop, expected",
    [
        ["days", 1],
        ["seconds", 2],
        ["microseconds", 3],
        ["nanoseconds", 4],
    ],
)
def test_dt_timedelta_properties(prop, expected):
    # GH 52284
    # One real duration plus one missing value, stored as duration[ns]
    delta = pd.Timedelta(days=1, seconds=2, microseconds=3, nanoseconds=4)
    ser = pd.Series([delta, None], dtype=ArrowDtype(pa.duration("ns")))

    expected_values = pa.array([expected, None], type=pa.int32())
    tm.assert_series_equal(
        getattr(ser.dt, prop),
        pd.Series(ArrowExtensionArray(expected_values)),
    )
2763+
2764+
2765+
def test_dt_timedelta_total_seconds():
    # GH 52284
    # One real duration plus one missing value, stored as duration[ns]
    delta = pd.Timedelta(days=1, seconds=2, microseconds=3, nanoseconds=4)
    ser = pd.Series([delta, None], dtype=ArrowDtype(pa.duration("ns")))

    expected_values = pa.array([86402.000003, None], type=pa.float64())
    tm.assert_series_equal(
        ser.dt.total_seconds(),
        pd.Series(ArrowExtensionArray(expected_values)),
    )
2784+
2785+
2786+
def test_dt_to_pytimedelta():
    # GH 52284
    py_deltas = [timedelta(1, 2, 3), timedelta(1, 2, 4)]
    ser = pd.Series(py_deltas, dtype=ArrowDtype(pa.duration("ns")))
    result = ser.dt.to_pytimedelta()

    # Round-trips to the original values, as plain datetime.timedelta objects
    tm.assert_numpy_array_equal(result, np.array(py_deltas, dtype=object))
    for res in result:
        assert type(res) is timedelta

    # Agrees with the NumPy-backed timedelta64[ns] accessor
    numpy_backed = ser.astype("timedelta64[ns]").dt.to_pytimedelta()
    tm.assert_numpy_array_equal(result, numpy_backed)
2798+
2799+
2800+
def test_dt_components():
    # GH 52284
    component_names = [
        "days",
        "hours",
        "minutes",
        "seconds",
        "milliseconds",
        "microseconds",
        "nanoseconds",
    ]
    # One real duration plus one missing value, stored as duration[ns]
    delta = pd.Timedelta(days=1, seconds=2, microseconds=3, nanoseconds=4)
    ser = pd.Series([delta, None], dtype=ArrowDtype(pa.duration("ns")))

    expected = pd.DataFrame(
        [[1, 0, 0, 2, 0, 3, 4], [None] * len(component_names)],
        columns=component_names,
        dtype="int32[pyarrow]",
    )
    tm.assert_frame_equal(ser.dt.components, expected)
2829+
2830+
27262831
@pytest.mark.parametrize("skipna", [True, False])
27272832
def test_boolean_reduce_series_all_null(all_boolean_reductions, skipna):
27282833
# GH51624

0 commit comments

Comments
 (0)