diff --git a/asv_bench/benchmarks/tslibs/fields.py b/asv_bench/benchmarks/tslibs/fields.py index 23ae73811204c..3a2baec54109a 100644 --- a/asv_bench/benchmarks/tslibs/fields.py +++ b/asv_bench/benchmarks/tslibs/fields.py @@ -12,7 +12,7 @@ class TimeGetTimedeltaField: params = [ _sizes, - ["days", "seconds", "microseconds", "nanoseconds"], + ["seconds", "microseconds", "nanoseconds"], ] param_names = ["size", "field"] diff --git a/doc/source/whatsnew/v2.0.1.rst b/doc/source/whatsnew/v2.0.1.rst index 838fd4fa28442..c64f7a46d3058 100644 --- a/doc/source/whatsnew/v2.0.1.rst +++ b/doc/source/whatsnew/v2.0.1.rst @@ -24,6 +24,7 @@ Fixed regressions Bug fixes ~~~~~~~~~ +- Bug in :attr:`Series.dt.days` that would overflow ``int32`` number of days (:issue:`52391`) - Bug in :class:`arrays.DatetimeArray` constructor returning an incorrect unit when passed a non-nanosecond numpy datetime array (:issue:`52555`) - Bug in :func:`pandas.testing.assert_series_equal` where ``check_dtype=False`` would still raise for datetime or timedelta types with different resolutions (:issue:`52449`) - Bug in :func:`read_csv` casting PyArrow datetimes to NumPy when ``dtype_backend="pyarrow"`` and ``parse_dates`` is set causing a performance bottleneck in the process (:issue:`52546`) diff --git a/pandas/_libs/tslibs/fields.pyi b/pandas/_libs/tslibs/fields.pyi index 8b4bc1a31a1aa..c6cfd44e9f6ab 100644 --- a/pandas/_libs/tslibs/fields.pyi +++ b/pandas/_libs/tslibs/fields.pyi @@ -30,6 +30,10 @@ def get_timedelta_field( field: str, reso: int = ..., # NPY_DATETIMEUNIT ) -> npt.NDArray[np.int32]: ... +def get_timedelta_days( + tdindex: npt.NDArray[np.int64], # const int64_t[:] + reso: int = ..., # NPY_DATETIMEUNIT +) -> npt.NDArray[np.int64]: ... def isleapyear_arr( years: np.ndarray, ) -> npt.NDArray[np.bool_]: ... 
diff --git a/pandas/_libs/tslibs/fields.pyx b/pandas/_libs/tslibs/fields.pyx index b162f278fcbec..1c75b47dd12ac 100644 --- a/pandas/_libs/tslibs/fields.pyx +++ b/pandas/_libs/tslibs/fields.pyx @@ -512,18 +512,7 @@ def get_timedelta_field( out = np.empty(count, dtype="i4") - if field == "days": - with nogil: - for i in range(count): - if tdindex[i] == NPY_NAT: - out[i] = -1 - continue - - pandas_timedelta_to_timedeltastruct(tdindex[i], reso, &tds) - out[i] = tds.days - return out - - elif field == "seconds": + if field == "seconds": with nogil: for i in range(count): if tdindex[i] == NPY_NAT: @@ -559,6 +548,34 @@ def get_timedelta_field( raise ValueError(f"Field {field} not supported") +@cython.wraparound(False) +@cython.boundscheck(False) +def get_timedelta_days( + const int64_t[:] tdindex, + NPY_DATETIMEUNIT reso=NPY_FR_ns, +): + """ + Given an int64-based timedelta index, extract the days + field and return an array of these values. + """ + cdef: + Py_ssize_t i, count = len(tdindex) + ndarray[int64_t] out + pandas_timedeltastruct tds + + out = np.empty(count, dtype="i8") + + with nogil: + for i in range(count): + if tdindex[i] == NPY_NAT: + out[i] = -1 + continue + + pandas_timedelta_to_timedeltastruct(tdindex[i], reso, &tds) + out[i] = tds.days + return out + + cpdef isleapyear_arr(ndarray years): """vectorized version of isleapyear; NaT evaluates as False""" cdef: diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index e182ee08f1d58..d7e413ccec293 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -30,7 +30,10 @@ to_offset, ) from pandas._libs.tslibs.conversion import precision_from_unit -from pandas._libs.tslibs.fields import get_timedelta_field +from pandas._libs.tslibs.fields import ( + get_timedelta_days, + get_timedelta_field, +) from pandas._libs.tslibs.timedeltas import ( array_to_timedelta64, floordiv_object_array, @@ -81,7 +84,13 @@ def _field_accessor(name: str, alias: str, docstring: 
str): def f(self) -> np.ndarray: values = self.asi8 - result = get_timedelta_field(values, alias, reso=self._creso) + if alias == "days": + result = get_timedelta_days(values, reso=self._creso) + else: + # error: Incompatible types in assignment ( + # expression has type "ndarray[Any, dtype[signedinteger[_32Bit]]]", + # variable has type "ndarray[Any, dtype[signedinteger[_64Bit]]]") + result = get_timedelta_field(values, alias, reso=self._creso) # type: ignore[assignment] # noqa: E501 if self._hasna: result = self._maybe_mask_results( result, fill_value=None, convert="float64" diff --git a/pandas/tests/indexes/timedeltas/test_timedelta.py b/pandas/tests/indexes/timedeltas/test_timedelta.py index 74f75eb9337e6..0cbc4bde4b07f 100644 --- a/pandas/tests/indexes/timedeltas/test_timedelta.py +++ b/pandas/tests/indexes/timedeltas/test_timedelta.py @@ -67,7 +67,7 @@ def test_pass_TimedeltaIndex_to_index(self): def test_fields(self): rng = timedelta_range("1 days, 10:11:12.100123456", periods=2, freq="s") - tm.assert_index_equal(rng.days, Index([1, 1], dtype=np.int32)) + tm.assert_index_equal(rng.days, Index([1, 1], dtype=np.int64)) tm.assert_index_equal( rng.seconds, Index([10 * 3600 + 11 * 60 + 12, 10 * 3600 + 11 * 60 + 13], dtype=np.int32), diff --git a/pandas/tests/series/accessors/test_dt_accessor.py b/pandas/tests/series/accessors/test_dt_accessor.py index 3f957130b6020..21c1e9ca84a35 100644 --- a/pandas/tests/series/accessors/test_dt_accessor.py +++ b/pandas/tests/series/accessors/test_dt_accessor.py @@ -796,3 +796,23 @@ def test_normalize_pre_epoch_dates(): result = ser.dt.normalize() expected = pd.to_datetime(Series(["1969-01-01", "2016-01-01"])) tm.assert_series_equal(result, expected) + + +def test_day_attribute_non_nano_beyond_int32(): + # GH 52386 + data = np.array( + [ + 136457654736252, + 134736784364431, + 245345345545332, + 223432411, + 2343241, + 3634548734, + 23234, + ], + dtype="timedelta64[s]", + ) + ser = Series(data) + result = ser.dt.days + expected = 
Series([1579371003, 1559453522, 2839645203, 2586, 27, 42066, 0]) + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/tslibs/test_fields.py b/pandas/tests/tslibs/test_fields.py index 9e6464f7727bd..da67c093b8f4d 100644 --- a/pandas/tests/tslibs/test_fields.py +++ b/pandas/tests/tslibs/test_fields.py @@ -35,6 +35,6 @@ def test_get_start_end_field_readonly(dtindex): def test_get_timedelta_field_readonly(dtindex): # treat dtindex as timedeltas for this next one - result = fields.get_timedelta_field(dtindex, "days") - expected = np.arange(5, dtype=np.int32) * 32 + result = fields.get_timedelta_field(dtindex, "seconds") + expected = np.array([0] * 5, dtype=np.int32) tm.assert_numpy_array_equal(result, expected)