From ddab89c26e195905ecf8ccb3f31405211e21fb00 Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 15 Nov 2023 15:24:21 -0800 Subject: [PATCH 1/2] BUG: dt64 astype silent overflows --- doc/source/whatsnew/v2.2.0.rst | 1 + pandas/_libs/tslibs/conversion.pyx | 2 +- pandas/_libs/tslibs/np_datetime.pyx | 26 ++++++++++++++------------ pandas/tests/arrays/test_datetimes.py | 15 +++++++++++++++ 4 files changed, 31 insertions(+), 13 deletions(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 48aee18c90456..262191639b1a3 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -353,6 +353,7 @@ Datetimelike - Bug in addition or subtraction of very large :class:`Tick` objects with :class:`Timestamp` or :class:`Timedelta` objects raising ``OverflowError`` instead of ``OutOfBoundsTimedelta`` (:issue:`55503`) - Bug in creating a :class:`Index`, :class:`Series`, or :class:`DataFrame` with a non-nanosecond :class:`DatetimeTZDtype` and inputs that would be out of bounds with nanosecond resolution incorrectly raising ``OutOfBoundsDatetime`` (:issue:`54620`) - Bug in creating a :class:`Index`, :class:`Series`, or :class:`DataFrame` with a non-nanosecond ``datetime64`` dtype and inputs that would be out of bounds for a ``datetime64[ns]`` incorrectly raising ``OutOfBoundsDatetime`` (:issue:`55756`) +- Bug in ``.astype`` converting from a higher-resolution ``datetime64`` dtype to a lower-resolution ``datetime64`` dtype (e.g. 
``datetime64[us]->datetim64[ms]``) silently overflowing with values near the lower implementation bound (:issue:`??`) - Timedelta diff --git a/pandas/_libs/tslibs/conversion.pyx b/pandas/_libs/tslibs/conversion.pyx index 4f14782d9efbb..89b420b18a980 100644 --- a/pandas/_libs/tslibs/conversion.pyx +++ b/pandas/_libs/tslibs/conversion.pyx @@ -716,7 +716,7 @@ cdef int64_t parse_pydatetime( result = _ts.value else: if isinstance(val, _Timestamp): - result = (<_Timestamp>val)._as_creso(creso, round_ok=False)._value + result = (<_Timestamp>val)._as_creso(creso, round_ok=True)._value else: result = pydatetime_to_dt64(val, dts, reso=creso) return result diff --git a/pandas/_libs/tslibs/np_datetime.pyx b/pandas/_libs/tslibs/np_datetime.pyx index 71a194177bf82..c69baeaf05bc9 100644 --- a/pandas/_libs/tslibs/np_datetime.pyx +++ b/pandas/_libs/tslibs/np_datetime.pyx @@ -365,13 +365,10 @@ cpdef ndarray astype_overflowsafe( return values elif from_unit > to_unit: - if round_ok: - # e.g. ns -> us, so there is no risk of overflow, so we can use - # numpy's astype safely. Note there _is_ risk of truncation. - return values.astype(dtype) - else: - iresult2 = astype_round_check(values.view("i8"), from_unit, to_unit) - return iresult2.view(dtype) + iresult2 = _astype_overflowsafe_to_smaller_unit( + values.view("i8"), from_unit, to_unit, round_ok=round_ok + ) + return iresult2.view(dtype) if (values).dtype.byteorder == ">": # GH#29684 we incorrectly get OutOfBoundsDatetime if we dont swap @@ -502,13 +499,18 @@ cdef int op_to_op_code(op): return Py_GT -cdef ndarray astype_round_check( +cdef ndarray _astype_overflowsafe_to_smaller_unit( ndarray i8values, NPY_DATETIMEUNIT from_unit, - NPY_DATETIMEUNIT to_unit + NPY_DATETIMEUNIT to_unit, + bint round_ok, ): - # cases with from_unit > to_unit, e.g. ns->us, raise if the conversion - # involves truncation, e.g. 1500ns->1us + """ + Overflow-safe conversion for cases with from_unit > to_unit, e.g. ns->us. 
+ In addition to checking for overflows (which can occur near the lower + implementation bound, see numpy#22346), this checks for truncation, + e.g. 1500ns->1us. + """ cdef: Py_ssize_t i, N = i8values.size @@ -531,7 +533,7 @@ cdef ndarray astype_round_check( new_value = NPY_DATETIME_NAT else: new_value, mod = divmod(value, mult) - if mod != 0: + if not round_ok and mod != 0: # TODO: avoid runtime import from pandas._libs.tslibs.dtypes import npy_unit_to_abbrev from_abbrev = npy_unit_to_abbrev(from_unit) diff --git a/pandas/tests/arrays/test_datetimes.py b/pandas/tests/arrays/test_datetimes.py index 566117899cfc5..af6051c99c988 100644 --- a/pandas/tests/arrays/test_datetimes.py +++ b/pandas/tests/arrays/test_datetimes.py @@ -309,6 +309,21 @@ def test_cmp_dt64_arraylike_tznaive(self, comparison_op): class TestDatetimeArray: + def test_astype_ns_to_ms_near_bounds(self): + ts = pd.Timestamp("1677-09-21 00:12:43.145225") + target = ts.as_unit("ms") + + dta = DatetimeArray._from_sequence([ts], dtype="M8[ns]") + assert (dta.view("i8") == ts.as_unit("ns").value).all() + + result = dta.astype("M8[ms]") + assert result[0] == target + + expected = DatetimeArray._from_sequence([ts], dtype="M8[ms]") + assert (expected.view("i8") == target._value).all() + + tm.assert_datetime_array_equal(result, expected) + def test_astype_non_nano_tznaive(self): dti = pd.date_range("2016-01-01", periods=3) From e44a0fa921c676e883327d6dc0bfaff679f835df Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 15 Nov 2023 15:27:00 -0800 Subject: [PATCH 2/2] GH ref --- doc/source/whatsnew/v2.2.0.rst | 2 +- pandas/_libs/tslibs/np_datetime.pyx | 2 ++ pandas/tests/arrays/test_datetimes.py | 1 + 3 files changed, 4 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 262191639b1a3..90534174a739d 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -347,13 +347,13 @@ Datetimelike - Bug in 
:meth:`Index.is_monotonic_increasing` and :meth:`Index.is_monotonic_decreasing` always caching :meth:`Index.is_unique` as ``True`` when first value in index is ``NaT`` (:issue:`55755`) - Bug in :meth:`Index.view` to a datetime64 dtype with non-supported resolution incorrectly raising (:issue:`55710`) - Bug in :meth:`Tick.delta` with very large ticks raising ``OverflowError`` instead of ``OutOfBoundsTimedelta`` (:issue:`55503`) +- Bug in ``.astype`` converting from a higher-resolution ``datetime64`` dtype to a lower-resolution ``datetime64`` dtype (e.g. ``datetime64[us]->datetim64[ms]``) silently overflowing with values near the lower implementation bound (:issue:`55979`) - Bug in adding or subtracting a :class:`Week` offset to a ``datetime64`` :class:`Series`, :class:`Index`, or :class:`DataFrame` column with non-nanosecond resolution returning incorrect results (:issue:`55583`) - Bug in addition or subtraction of :class:`BusinessDay` offset with ``offset`` attribute to non-nanosecond :class:`Index`, :class:`Series`, or :class:`DataFrame` column giving incorrect results (:issue:`55608`) - Bug in addition or subtraction of :class:`DateOffset` objects with microsecond components to ``datetime64`` :class:`Index`, :class:`Series`, or :class:`DataFrame` columns with non-nanosecond resolution (:issue:`55595`) - Bug in addition or subtraction of very large :class:`Tick` objects with :class:`Timestamp` or :class:`Timedelta` objects raising ``OverflowError`` instead of ``OutOfBoundsTimedelta`` (:issue:`55503`) - Bug in creating a :class:`Index`, :class:`Series`, or :class:`DataFrame` with a non-nanosecond :class:`DatetimeTZDtype` and inputs that would be out of bounds with nanosecond resolution incorrectly raising ``OutOfBoundsDatetime`` (:issue:`54620`) - Bug in creating a :class:`Index`, :class:`Series`, or :class:`DataFrame` with a non-nanosecond ``datetime64`` dtype and inputs that would be out of bounds for a ``datetime64[ns]`` incorrectly raising 
``OutOfBoundsDatetime`` (:issue:`55756`) -- Bug in ``.astype`` converting from a higher-resolution ``datetime64`` dtype to a lower-resolution ``datetime64`` dtype (e.g. ``datetime64[us]->datetim64[ms]``) silently overflowing with values near the lower implementation bound (:issue:`??`) - Timedelta diff --git a/pandas/_libs/tslibs/np_datetime.pyx b/pandas/_libs/tslibs/np_datetime.pyx index c69baeaf05bc9..fa9b82fe46634 100644 --- a/pandas/_libs/tslibs/np_datetime.pyx +++ b/pandas/_libs/tslibs/np_datetime.pyx @@ -511,6 +511,8 @@ cdef ndarray _astype_overflowsafe_to_smaller_unit( implementation bound, see numpy#22346), this checks for truncation, e.g. 1500ns->1us. """ + # e.g. test_astype_ns_to_ms_near_bounds is a case with round_ok=True where + # just using numpy's astype silently fails cdef: Py_ssize_t i, N = i8values.size diff --git a/pandas/tests/arrays/test_datetimes.py b/pandas/tests/arrays/test_datetimes.py index af6051c99c988..770f19d9f13ce 100644 --- a/pandas/tests/arrays/test_datetimes.py +++ b/pandas/tests/arrays/test_datetimes.py @@ -310,6 +310,7 @@ def test_cmp_dt64_arraylike_tznaive(self, comparison_op): class TestDatetimeArray: def test_astype_ns_to_ms_near_bounds(self): + # GH#55979 ts = pd.Timestamp("1677-09-21 00:12:43.145225") target = ts.as_unit("ms")