From f3d0d15f484427c419a89c3cb211d28931684536 Mon Sep 17 00:00:00 2001 From: "Rebecca N. Palmer" Date: Sun, 11 Dec 2022 14:53:42 +0000 Subject: [PATCH 01/12] add test for float to_datetime near overflow bounds --- pandas/tests/tools/test_to_datetime.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index 374db17714b06..7cbdd5f0a8467 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -1629,6 +1629,27 @@ def test_to_timestamp_unit_coerce(self, bad_val): result = to_datetime([1, 2, bad_val], unit="D", errors="coerce") tm.assert_index_equal(result, expected) + def test_float_to_datetime_raise_near_bounds(self): + mag="cannot convert input with unit 'Y'" + oneyear_in_ns = 1e9 * 60 * 60 * 24 * 365.2425 + tsmax_in_years = 2**63 / oneyear_in_ns # 2**63 ns, in years + should_succeed = Series( + [0, tsmax_in_years - 0.05, -tsmax_in_years + 0.05], + dtype=float + ) + should_fail1 = Series([0, tsmax_in_years + 0.05], dtype=float) + should_fail2 = Series([0, -tsmax_in_years - 0.05], dtype=float) + + result1=to_datetime(should_succeed, unit="Y", errors='raise') + result2=to_datetime(should_succeed, unit="Y", errors='coerce') + result3=to_datetime(should_succeed, unit="Y", errors='ignore') + tm.assert_series_equal(result1, result2) + tm.assert_series_equal(result1, result3) + with pytest.raises(OutOfBoundsDatetime, match=msg): + to_datetime(should_fail1, unit="Y", errors='raise') + with pytest.raises(OutOfBoundsDatetime, match=msg): + to_datetime(should_fail2, unit="Y", errors='raise') + class TestToDatetimeDataFrame: @pytest.fixture From dff5a3ecc066e1116ba5876ac674797d9e5d37de Mon Sep 17 00:00:00 2001 From: "Rebecca N. Palmer" Date: Sun, 11 Dec 2022 14:54:11 +0000 Subject: [PATCH 02/12] fix float to_datetime near overflow bounds --- pandas/_libs/tslib.pyx | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index 6f58fecd1ac81..33e0ef8a745c1 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -294,14 +294,22 @@ def array_with_unit_to_datetime( # if we have nulls that are not type-compat # then need to iterate - if values.dtype.kind in ["i", "f", "u"]: + if values.dtype.kind in ["i", "u"]: iresult = values.astype("i8", copy=False) # fill missing values by comparing to NPY_NAT mask = iresult == NPY_NAT iresult[mask] = 0 + # for bounds checking, which can't use (integer) iresult * mult + # because it needs arithmetic overflow to not wrap around fvalues = iresult.astype("f8") * mult need_to_iterate = False + if values.dtype.kind in ["f",]: + mask = (values != values) | (values == NPY_NAT) # first is NaNs + fvalues = (values * mult).astype("f8") + fvalues[mask] = 0 + need_to_iterate = False + if not need_to_iterate: # check the bounds if (fvalues < Timestamp.min.value).any() or ( @@ -313,11 +321,9 @@ def array_with_unit_to_datetime( result = (iresult * mult).astype("M8[ns]") elif values.dtype.kind == "f": - fresult = (values * mult).astype("f8") - fresult[mask] = 0 if prec: - fresult = round(fresult, prec) - result = fresult.astype("M8[ns]", copy=False) + fvalues = round(fvalues, prec) + result = fvalues.astype("M8[ns]", copy=False) iresult = result.view("i8") iresult[mask] = NPY_NAT From 9fe509193b4ea9b6f4e5f1d1c9656fd3f543d6e5 Mon Sep 17 00:00:00 2001 From: "Rebecca N. Palmer" Date: Sun, 11 Dec 2022 16:01:17 +0000 Subject: [PATCH 03/12] fix typo and formatting --- pandas/tests/tools/test_to_datetime.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index 7cbdd5f0a8467..651428f9afe77 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -1630,25 +1630,24 @@ def test_to_timestamp_unit_coerce(self, bad_val): tm.assert_index_equal(result, expected) def test_float_to_datetime_raise_near_bounds(self): - mag="cannot convert input with unit 'Y'" + msg = "cannot convert input with unit 'Y'" oneyear_in_ns = 1e9 * 60 * 60 * 24 * 365.2425 - tsmax_in_years = 2**63 / oneyear_in_ns # 2**63 ns, in years + tsmax_in_years = 2**63 / oneyear_in_ns # 2**63 ns, in years should_succeed = Series( - [0, tsmax_in_years - 0.05, -tsmax_in_years + 0.05], - dtype=float + [0, tsmax_in_years - 0.05, -tsmax_in_years + 0.05], dtype=float ) should_fail1 = Series([0, tsmax_in_years + 0.05], dtype=float) should_fail2 = Series([0, -tsmax_in_years - 0.05], dtype=float) - result1=to_datetime(should_succeed, unit="Y", errors='raise') - result2=to_datetime(should_succeed, unit="Y", errors='coerce') - result3=to_datetime(should_succeed, unit="Y", errors='ignore') + result1 = to_datetime(should_succeed, unit="Y", errors="raise") + result2 = to_datetime(should_succeed, unit="Y", errors="coerce") + result3 = to_datetime(should_succeed, unit="Y", errors="ignore") tm.assert_series_equal(result1, result2) tm.assert_series_equal(result1, result3) with pytest.raises(OutOfBoundsDatetime, match=msg): - to_datetime(should_fail1, unit="Y", errors='raise') + to_datetime(should_fail1, unit="Y", errors="raise") with pytest.raises(OutOfBoundsDatetime, match=msg): - to_datetime(should_fail2, unit="Y", errors='raise') + to_datetime(should_fail2, unit="Y", errors="raise") class TestToDatetimeDataFrame: From bac5f68f4cd2f96361ec3f7fc11d6756aa7348f5 Mon Sep 17 00:00:00 2001 From: "Rebecca N. Palmer" Date: Sun, 11 Dec 2022 16:12:43 +0000 Subject: [PATCH 04/12] fix formatting --- pandas/_libs/tslib.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index 33e0ef8a745c1..ff68d4a6457ce 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -305,7 +305,7 @@ def array_with_unit_to_datetime( need_to_iterate = False if values.dtype.kind in ["f",]: - mask = (values != values) | (values == NPY_NAT) # first is NaNs + mask = (values != values) | (values == NPY_NAT) # first is NaNs fvalues = (values * mult).astype("f8") fvalues[mask] = 0 need_to_iterate = False From 535a4ecd0756e498e958d0cf9e92b23dd9c26438 Mon Sep 17 00:00:00 2001 From: "Rebecca N. Palmer" Date: Sun, 11 Dec 2022 17:27:53 +0000 Subject: [PATCH 05/12] fix test to not fail on rounding differences --- pandas/tests/tools/test_to_datetime.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index 651428f9afe77..b5dec2a1f9766 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -1633,17 +1633,17 @@ def test_float_to_datetime_raise_near_bounds(self): msg = "cannot convert input with unit 'Y'" oneyear_in_ns = 1e9 * 60 * 60 * 24 * 365.2425 tsmax_in_years = 2**63 / oneyear_in_ns # 2**63 ns, in years + # just in bounds should_succeed = Series( [0, tsmax_in_years - 0.05, -tsmax_in_years + 0.05], dtype=float ) + expected = (should_succeed * oneyear_in_ns).astype("M8[ns]") + for error_mode in ["raise", "coerce", "ignore"]: + result1 = to_datetime(should_succeed, unit="Y", errors=error_mode) + tm.assert_almost_equal(result1, expected, rtol=1e-10) + # just out of bounds should_fail1 = Series([0, tsmax_in_years + 0.05], dtype=float) should_fail2 = Series([0, -tsmax_in_years - 0.05], dtype=float) - - result1 = to_datetime(should_succeed, unit="Y", errors="raise") - result2 = to_datetime(should_succeed, unit="Y", errors="coerce") - result3 = to_datetime(should_succeed, unit="Y", errors="ignore") - tm.assert_series_equal(result1, result2) - tm.assert_series_equal(result1, result3) with pytest.raises(OutOfBoundsDatetime, match=msg): to_datetime(should_fail1, unit="Y", errors="raise") with pytest.raises(OutOfBoundsDatetime, match=msg): From a5012a9214b0fda65cabdf696531a5b751a11550 Mon Sep 17 00:00:00 2001 From: "Rebecca N. Palmer" Date: Sun, 11 Dec 2022 20:09:33 +0000 Subject: [PATCH 06/12] don't use approximate comparison on datetimes, it doesn't work --- pandas/tests/tools/test_to_datetime.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index b5dec2a1f9766..08a6d83e710fd 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -1637,10 +1637,10 @@ def test_float_to_datetime_raise_near_bounds(self): should_succeed = Series( [0, tsmax_in_years - 0.05, -tsmax_in_years + 0.05], dtype=float ) - expected = (should_succeed * oneyear_in_ns).astype("M8[ns]") + expected = should_succeed * oneyear_in_ns for error_mode in ["raise", "coerce", "ignore"]: result1 = to_datetime(should_succeed, unit="Y", errors=error_mode) - tm.assert_almost_equal(result1, expected, rtol=1e-10) + tm.assert_almost_equal(result1.astype(float), expected, rtol=1e-10) # just out of bounds should_fail1 = Series([0, tsmax_in_years + 0.05], dtype=float) should_fail2 = Series([0, -tsmax_in_years - 0.05], dtype=float) From 088eda5eb4785bf273b76f1d6c0534a5bf0cc777 Mon Sep 17 00:00:00 2001 From: "Rebecca N. Palmer" Date: Sun, 11 Dec 2022 21:54:03 +0000 Subject: [PATCH 07/12] also can't convert datetime to float --- pandas/tests/tools/test_to_datetime.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index 08a6d83e710fd..e0de2cc38ed51 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -1640,7 +1640,7 @@ def test_float_to_datetime_raise_near_bounds(self): expected = should_succeed * oneyear_in_ns for error_mode in ["raise", "coerce", "ignore"]: result1 = to_datetime(should_succeed, unit="Y", errors=error_mode) - tm.assert_almost_equal(result1.astype(float), expected, rtol=1e-10) + tm.assert_almost_equal(result1.astype(np.int64), expected, rtol=1e-10) # just out of bounds should_fail1 = Series([0, tsmax_in_years + 0.05], dtype=float) should_fail2 = Series([0, -tsmax_in_years - 0.05], dtype=float) From 8b9c050d434e11f117f484e5a50ab96f76e6b16e Mon Sep 17 00:00:00 2001 From: "Rebecca N. Palmer" Date: Sun, 11 Dec 2022 22:45:40 +0000 Subject: [PATCH 08/12] match dtypes --- pandas/tests/tools/test_to_datetime.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index e0de2cc38ed51..d4cdc4ddebae3 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -1637,7 +1637,7 @@ def test_float_to_datetime_raise_near_bounds(self): should_succeed = Series( [0, tsmax_in_years - 0.05, -tsmax_in_years + 0.05], dtype=float ) - expected = should_succeed * oneyear_in_ns + expected = (should_succeed * oneyear_in_ns).astype(np.int64) for error_mode in ["raise", "coerce", "ignore"]: result1 = to_datetime(should_succeed, unit="Y", errors=error_mode) tm.assert_almost_equal(result1.astype(np.int64), expected, rtol=1e-10) From 8ebc9100b8ea59b0703850967accc49dca3b633c Mon Sep 17 00:00:00 2001 From: "Rebecca N. Palmer" Date: Wed, 18 Jan 2023 22:16:19 +0000 Subject: [PATCH 09/12] TST: don't try to use non-integer years (see #50301) --- pandas/tests/tools/test_to_datetime.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index e30b4e70cef95..2e4d74d95ea69 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -1936,24 +1936,24 @@ def test_to_timestamp_unit_coerce(self, bad_val): tm.assert_index_equal(result, expected) def test_float_to_datetime_raise_near_bounds(self): - msg = "cannot convert input with unit 'Y'" - oneyear_in_ns = 1e9 * 60 * 60 * 24 * 365.2425 - tsmax_in_years = 2**63 / oneyear_in_ns # 2**63 ns, in years + msg = "cannot convert input with unit 'D'" + oneday_in_ns = 1e9 * 60 * 60 * 24 + tsmax_in_days = 2**63 / oneday_in_ns # 2**63 ns, in days # just in bounds should_succeed = Series( - [0, tsmax_in_years - 0.05, -tsmax_in_years + 0.05], dtype=float + [0, tsmax_in_days - 0.05, -tsmax_in_days + 0.05], dtype=float ) - expected = (should_succeed * oneyear_in_ns).astype(np.int64) + expected = (should_succeed * oneday_in_ns).astype(np.int64) for error_mode in ["raise", "coerce", "ignore"]: - result1 = to_datetime(should_succeed, unit="Y", errors=error_mode) + result1 = to_datetime(should_succeed, unit="D", errors=error_mode) tm.assert_almost_equal(result1.astype(np.int64), expected, rtol=1e-10) # just out of bounds - should_fail1 = Series([0, tsmax_in_years + 0.05], dtype=float) - should_fail2 = Series([0, -tsmax_in_years - 0.05], dtype=float) + should_fail1 = Series([0, tsmax_in_days + 0.05], dtype=float) + should_fail2 = Series([0, -tsmax_in_days - 0.05], dtype=float) with pytest.raises(OutOfBoundsDatetime, match=msg): - to_datetime(should_fail1, unit="Y", errors="raise") + to_datetime(should_fail1, unit="D", errors="raise") with pytest.raises(OutOfBoundsDatetime, match=msg): - to_datetime(should_fail2, unit="Y", errors="raise") + to_datetime(should_fail2, unit="D", errors="raise") class TestToDatetimeDataFrame: From c17883ec495be286621b8e8fc46a0de8a8663ba3 Mon Sep 17 00:00:00 2001 From: "Rebecca N. Palmer" Date: Thu, 19 Jan 2023 21:01:53 +0000 Subject: [PATCH 10/12] TST: don't cross an integer (tsmax_in_days happens to be close to an integer, and this is a test of rounding) --- pandas/tests/tools/test_to_datetime.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index 2e4d74d95ea69..6a6a659b80dd9 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -1936,20 +1936,21 @@ def test_to_timestamp_unit_coerce(self, bad_val): tm.assert_index_equal(result, expected) def test_float_to_datetime_raise_near_bounds(self): + # GH50183 msg = "cannot convert input with unit 'D'" oneday_in_ns = 1e9 * 60 * 60 * 24 tsmax_in_days = 2**63 / oneday_in_ns # 2**63 ns, in days # just in bounds should_succeed = Series( - [0, tsmax_in_days - 0.05, -tsmax_in_days + 0.05], dtype=float + [0, tsmax_in_days - 0.005, -tsmax_in_days + 0.005], dtype=float ) expected = (should_succeed * oneday_in_ns).astype(np.int64) for error_mode in ["raise", "coerce", "ignore"]: result1 = to_datetime(should_succeed, unit="D", errors=error_mode) tm.assert_almost_equal(result1.astype(np.int64), expected, rtol=1e-10) # just out of bounds - should_fail1 = Series([0, tsmax_in_days + 0.05], dtype=float) - should_fail2 = Series([0, -tsmax_in_days - 0.05], dtype=float) + should_fail1 = Series([0, tsmax_in_days + 0.005], dtype=float) + should_fail2 = Series([0, -tsmax_in_days - 0.005], dtype=float) with pytest.raises(OutOfBoundsDatetime, match=msg): to_datetime(should_fail1, unit="D", errors="raise") with pytest.raises(OutOfBoundsDatetime, match=msg): From 4bd1e2a690f864d67a0bf317b502c212331700e0 Mon Sep 17 00:00:00 2001 From: "Rebecca N. Palmer" Date: Thu, 19 Jan 2023 21:05:48 +0000 Subject: [PATCH 11/12] PERF: remove unnecessary copy --- pandas/core/tools/datetimes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index d8cf0f46452f3..152bfcb8822a4 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -515,7 +515,7 @@ def _to_datetime_with_unit(arg, unit, name, utc: bool, errors: str) -> Index: mult, _ = precision_from_unit(unit) mask = np.isnan(arg) | (arg == iNaT) - fvalues = (arg * mult).astype("f8") + fvalues = (arg * mult).astype("f8", copy=False) fvalues[mask] = 0 if (fvalues < Timestamp.min.value).any() or ( From bf5542e99a5b0dbc87e369b8cfe520a99e7b5624 Mon Sep 17 00:00:00 2001 From: "Rebecca N. Palmer" Date: Thu, 19 Jan 2023 21:25:25 +0000 Subject: [PATCH 12/12] add whatsnew --- doc/source/whatsnew/v2.0.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 7054d93457264..e0c2062df7586 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -915,6 +915,7 @@ Datetimelike - Bug in :func:`to_datetime` was failing to parse date strings ``'today'`` and ``'now'`` if ``format`` was not ISO8601 (:issue:`50359`) - Bug in :func:`Timestamp.utctimetuple` raising a ``TypeError`` (:issue:`32174`) - Bug in :func:`to_datetime` was raising ``ValueError`` when parsing mixed-offset :class:`Timestamp` with ``errors='ignore'`` (:issue:`50585`) +- Bug in :func:`to_datetime` was incorrectly handling floating-point inputs within 1 ``unit`` of the overflow boundaries (:issue:`50183`) Timedelta ^^^^^^^^^