From 7e1f1355668aefd38b2b239d280cd0d8e82ef709 Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 17 Nov 2023 17:22:49 -0800 Subject: [PATCH 1/6] BUG: to_datetime with floats and unit not matching Timestamp --- doc/source/whatsnew/v2.2.0.rst | 1 + pandas/_libs/tslibs/conversion.pxd | 2 +- pandas/_libs/tslibs/conversion.pyi | 4 -- pandas/_libs/tslibs/conversion.pyx | 79 +++++++++++++++++++++++++- pandas/core/arrays/timedeltas.py | 23 ++------ pandas/core/tools/datetimes.py | 34 +++++------ pandas/tests/io/sas/test_sas7bdat.py | 13 ++++- pandas/tests/tools/test_to_datetime.py | 22 ++++--- 8 files changed, 123 insertions(+), 55 deletions(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 3d34955a3baa5..c012cbb7d6e29 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -357,6 +357,7 @@ Datetimelike - Bug in creating a :class:`Index`, :class:`Series`, or :class:`DataFrame` with a non-nanosecond :class:`DatetimeTZDtype` and inputs that would be out of bounds with nanosecond resolution incorrectly raising ``OutOfBoundsDatetime`` (:issue:`54620`) - Bug in creating a :class:`Index`, :class:`Series`, or :class:`DataFrame` with a non-nanosecond ``datetime64`` (or :class:`DatetimeTZDtype`) from mixed-numeric inputs treating those as nanoseconds instead of as multiples of the dtype's unit (which would happen with non-mixed numeric inputs) (:issue:`56004`) - Bug in creating a :class:`Index`, :class:`Series`, or :class:`DataFrame` with a non-nanosecond ``datetime64`` dtype and inputs that would be out of bounds for a ``datetime64[ns]`` incorrectly raising ``OutOfBoundsDatetime`` (:issue:`55756`) +- Bug in the results of :func:`pd.to_datetime` with an floating-dtype argument with ``unit`` not matching the pointwise results of :class:`Timestamp` (:issue:`??`) - Timedelta diff --git a/pandas/_libs/tslibs/conversion.pxd b/pandas/_libs/tslibs/conversion.pxd index 7ffd630d6d8e1..6b9f41b1bb06f 100644 --- a/pandas/_libs/tslibs/conversion.pxd +++ b/pandas/_libs/tslibs/conversion.pxd @@ -45,7 +45,7 @@ cdef int64_t get_datetime64_nanos(object val, NPY_DATETIMEUNIT reso) except? -1 cpdef datetime localize_pydatetime(datetime dt, tzinfo tz) cdef int64_t cast_from_unit(object ts, str unit, NPY_DATETIMEUNIT out_reso=*) except? -1 -cpdef (int64_t, int) precision_from_unit( +cdef (int64_t, int) precision_from_unit( NPY_DATETIMEUNIT in_reso, NPY_DATETIMEUNIT out_reso=* ) diff --git a/pandas/_libs/tslibs/conversion.pyi b/pandas/_libs/tslibs/conversion.pyi index a5f5d04efeb76..93935789abc80 100644 --- a/pandas/_libs/tslibs/conversion.pyi +++ b/pandas/_libs/tslibs/conversion.pyi @@ -8,8 +8,4 @@ import numpy as np DT64NS_DTYPE: np.dtype TD64NS_DTYPE: np.dtype -def precision_from_unit( - in_reso: int, - out_reso: int = ..., -) -> tuple[int, int]: ... # (int64_t, _) def localize_pydatetime(dt: datetime, tz: tzinfo | None) -> datetime: ... diff --git a/pandas/_libs/tslibs/conversion.pyx b/pandas/_libs/tslibs/conversion.pyx index 84aceecb09a33..764f2dc853df3 100644 --- a/pandas/_libs/tslibs/conversion.pyx +++ b/pandas/_libs/tslibs/conversion.pyx @@ -1,8 +1,11 @@ +cimport cython + import numpy as np cimport numpy as cnp from libc.math cimport log10 from numpy cimport ( + float64_t, int32_t, int64_t, ) @@ -37,6 +40,7 @@ from pandas._libs.tslibs.np_datetime cimport ( NPY_DATETIMEUNIT, NPY_FR_ns, NPY_FR_us, + astype_overflowsafe, check_dts_bounds, convert_reso, dts_to_iso_string, @@ -74,6 +78,7 @@ from pandas._libs.tslibs.tzconversion cimport ( from pandas._libs.tslibs.util cimport ( is_float_object, is_integer_object, + is_nan, ) # ---------------------------------------------------------------------- @@ -86,6 +91,78 @@ TD64NS_DTYPE = np.dtype("m8[ns]") # ---------------------------------------------------------------------- # Unit Conversion Helpers +@cython.boundscheck(False) +@cython.wraparound(False) +@cython.overflowcheck(True) +def cast_from_unit_vectorized( + ndarray values, + str unit, +): + """ + Vectorized analogue to cast_from_unit. + """ + cdef: + int64_t m + int p + NPY_DATETIMEUNIT in_reso, out_reso + Py_ssize_t i + + assert values.dtype.kind == "f" + + if unit in "YM": + if not (((values % 1) == 0) | np.isnan(values)).all(): + # GH#47267 it is clear that 2 "M" corresponds to 1970-02-01, + # but not clear what 2.5 "M" corresponds to, so we will + # disallow that case. + raise ValueError( + f"Conversion of non-round float with unit={unit} " + "is ambiguous" + ) + + # GH#47266 go through np.datetime64 to avoid weird results e.g. with "Y" + # and 150 we'd get 2120-01-01 09:00:00 + values = values.astype(f"M8[{unit}]") + dtype = np.dtype("M8[ns]") + return astype_overflowsafe(values, dtype=dtype, copy=False).view("i8") + + in_reso = abbrev_to_npy_unit(unit) + out_reso = abbrev_to_npy_unit("ns") + m, p = precision_from_unit(in_reso, out_reso) + + cdef: + ndarray[int64_t] base, out + ndarray[float64_t] frac + tuple shape = (values).shape + + out = np.empty(shape, dtype="i8") + base = np.empty(shape, dtype="i8") + frac = np.empty(shape, dtype="f8") + + for i in range(len(values)): + if is_nan(values[i]): + base[i] = NPY_NAT + else: + base[i] = values[i] + frac[i] = values[i] - base[i] + + if p: + frac = np.round(frac, p) + + try: + for i in range(len(values)): + if base[i] == NPY_NAT: + out[i] = NPY_NAT + else: + out[i] = (base[i] * m) + (frac[i] * m) + except (OverflowError, FloatingPointError) as err: + # FloatingPointError can be issued if we have float dtype and have + # set np.errstate(over="raise") + raise OutOfBoundsDatetime( + f"cannot convert input {values[i]} with the unit '{unit}'" + ) from err + return out + + cdef int64_t cast_from_unit( object ts, str unit, @@ -155,7 +232,7 @@ cdef int64_t cast_from_unit( ) from err -cpdef (int64_t, int) precision_from_unit( +cdef (int64_t, int) precision_from_unit( NPY_DATETIMEUNIT in_reso, NPY_DATETIMEUNIT out_reso=NPY_DATETIMEUNIT.NPY_FR_ns, ): diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index 226e2568fdbf8..f55d3de8878ad 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -6,7 +6,6 @@ TYPE_CHECKING, cast, ) -import warnings import numpy as np @@ -27,8 +26,7 @@ npy_unit_to_abbrev, periods_per_second, ) -from pandas._libs.tslibs.conversion import precision_from_unit -from pandas._libs.tslibs.dtypes import abbrev_to_npy_unit +from pandas._libs.tslibs.conversion import cast_from_unit_vectorized from pandas._libs.tslibs.fields import ( get_timedelta_days, get_timedelta_field, @@ -1059,23 +1057,10 @@ def sequence_to_td64ns( data = data._data else: mask = np.isnan(data) - # The next few lines are effectively a vectorized 'cast_from_unit' - m, p = precision_from_unit(abbrev_to_npy_unit(unit or "ns")) - with warnings.catch_warnings(): - # Suppress RuntimeWarning about All-NaN slice - warnings.filterwarnings( - "ignore", "invalid value encountered in cast", RuntimeWarning - ) - base = data.astype(np.int64) - frac = data - base - if p: - frac = np.round(frac, p) - with warnings.catch_warnings(): - warnings.filterwarnings( - "ignore", "invalid value encountered in cast", RuntimeWarning - ) - data = (base * m + (frac * m).astype(np.int64)).view("timedelta64[ns]") + + data = cast_from_unit_vectorized(data, unit or "ns") data[mask] = iNaT + data = data.view("m8[ns]") copy = False elif lib.is_np_dtype(data.dtype, "m"): diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index ea5e6e46f58ec..863fb414a75f2 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -26,12 +26,10 @@ Timestamp, astype_overflowsafe, get_unit_from_dtype, - iNaT, is_supported_unit, timezones as libtimezones, ) -from pandas._libs.tslibs.conversion import precision_from_unit -from pandas._libs.tslibs.dtypes import abbrev_to_npy_unit +from pandas._libs.tslibs.conversion import cast_from_unit_vectorized from pandas._libs.tslibs.parsing import ( DateParseError, guess_datetime_format, @@ -551,23 +549,19 @@ def _to_datetime_with_unit(arg, unit, name, utc: bool, errors: str) -> Index: tz_parsed = None elif arg.dtype.kind == "f": - mult, _ = precision_from_unit(abbrev_to_npy_unit(unit)) - - mask = np.isnan(arg) | (arg == iNaT) - fvalues = (arg * mult).astype("f8", copy=False) - fvalues[mask] = 0 - - if (fvalues < Timestamp.min._value).any() or ( - fvalues > Timestamp.max._value - ).any(): - if errors != "raise": - arg = arg.astype(object) - return _to_datetime_with_unit(arg, unit, name, utc, errors) - raise OutOfBoundsDatetime(f"cannot convert input with unit '{unit}'") - - arr = fvalues.astype("M8[ns]", copy=False) - arr[mask] = np.datetime64("NaT", "ns") - + with np.errstate(over="raise"): + try: + arr = cast_from_unit_vectorized(arg, unit=unit) + except OutOfBoundsDatetime: + if errors != "raise": + return _to_datetime_with_unit( + arg.astype(object), unit, name, utc, errors + ) + raise OutOfBoundsDatetime( + f"cannot convert input with unit '{unit}'" + ) + + arr = arr.view("M8[ns]") tz_parsed = None else: arg = arg.astype(object, copy=False) diff --git a/pandas/tests/io/sas/test_sas7bdat.py b/pandas/tests/io/sas/test_sas7bdat.py index d56139d32b1da..58a6fa43dded5 100644 --- a/pandas/tests/io/sas/test_sas7bdat.py +++ b/pandas/tests/io/sas/test_sas7bdat.py @@ -187,7 +187,18 @@ def test_date_time(datapath): fname, parse_dates=["Date1", "Date2", "DateTime", "DateTimeHi", "Taiw"] ) # GH 19732: Timestamps imported from sas will incur floating point errors - df[df.columns[3]] = df.iloc[:, 3].dt.round("us") + # 2023-11-16 we don't know the correct "expected" result bc we do not have + # access to SAS to read the sas7bdat file. We are really just testing + # that we are "close". This only seems to be an issue near the + # implementation bounds. + res = df.iloc[:, 3].dt.round("us").copy() + + # the first and last elements are near the implementation bounds, where we + # would expect floating point error to occur. + res.iloc[0] -= pd.Timedelta(microseconds=1) + res.iloc[-1] += pd.Timedelta(microseconds=1) + + df.iloc[:, 3] = res tm.assert_frame_equal(df, df0) diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index ef76d99260764..b7db5545a9e26 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -1864,16 +1864,14 @@ def test_to_datetime_month_or_year_unit_int(self, cache, unit, item, request): result = to_datetime(np.array([item], dtype=object), unit=unit, cache=cache) tm.assert_index_equal(result, expected) - # TODO: this should also work - if isinstance(item, float): - request.applymarker( - pytest.mark.xfail( - reason=f"{type(item).__name__} in np.array should work" - ) - ) result = to_datetime(np.array([item]), unit=unit, cache=cache) tm.assert_index_equal(result, expected) + # with a nan! + result = to_datetime(np.array([item, np.nan]), unit=unit, cache=cache) + assert result.isna()[1] + tm.assert_index_equal(result[:1], expected) + @pytest.mark.parametrize("unit", ["Y", "M"]) def test_to_datetime_month_or_year_unit_non_round_float(self, cache, unit): # GH#50301 @@ -1883,6 +1881,8 @@ def test_to_datetime_month_or_year_unit_non_round_float(self, cache, unit): msg = f"Conversion of non-round float with unit={unit} is ambiguous" with pytest.raises(ValueError, match=msg): to_datetime([1.5], unit=unit, errors="raise") + with pytest.raises(ValueError, match=msg): + to_datetime(np.array([1.5]), unit=unit, errors="raise") with pytest.raises(ValueError, match=msg): with tm.assert_produces_warning(FutureWarning, match=warn_msg): to_datetime(["1.5"], unit=unit, errors="raise") @@ -2030,10 +2030,14 @@ def test_unit_mixed(self, cache, arr): def test_unit_rounding(self, cache): # GH 14156 & GH 20445: argument will incur floating point errors # but no premature rounding - result = to_datetime(1434743731.8770001, unit="s", cache=cache) - expected = Timestamp("2015-06-19 19:55:31.877000192") + value = 1434743731.8770001 + result = to_datetime(value, unit="s", cache=cache) + expected = Timestamp("2015-06-19 19:55:31.877000093") assert result == expected + alt = Timestamp(value, unit="s") + assert alt == result + def test_unit_ignore_keeps_name(self, cache): # GH 21697 expected = Index([15e9] * 2, name="name") From 48db2a5c1ce54bb76388a17bdd88cc57bb860259 Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 17 Nov 2023 21:50:52 -0800 Subject: [PATCH 2/6] mypy fixup --- pandas/_libs/tslibs/conversion.pyi | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/_libs/tslibs/conversion.pyi b/pandas/_libs/tslibs/conversion.pyi index 93935789abc80..cfe39fe2964cb 100644 --- a/pandas/_libs/tslibs/conversion.pyi +++ b/pandas/_libs/tslibs/conversion.pyi @@ -9,3 +9,4 @@ DT64NS_DTYPE: np.dtype TD64NS_DTYPE: np.dtype def localize_pydatetime(dt: datetime, tz: tzinfo | None) -> datetime: ... +def cast_from_unit_vectorized(values: np.ndarray, unit: str) -> np.ndarray: ... From 777a1a3d12c410a3dd9b941882c7149b55147af1 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 20 Nov 2023 09:57:23 -0800 Subject: [PATCH 3/6] Update doc/source/whatsnew/v2.2.0.rst Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- doc/source/whatsnew/v2.2.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index b565fa28f2b70..98244798a804c 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -359,7 +359,7 @@ Datetimelike - Bug in creating a :class:`Index`, :class:`Series`, or :class:`DataFrame` with a non-nanosecond :class:`DatetimeTZDtype` and inputs that would be out of bounds with nanosecond resolution incorrectly raising ``OutOfBoundsDatetime`` (:issue:`54620`) - Bug in creating a :class:`Index`, :class:`Series`, or :class:`DataFrame` with a non-nanosecond ``datetime64`` (or :class:`DatetimeTZDtype`) from mixed-numeric inputs treating those as nanoseconds instead of as multiples of the dtype's unit (which would happen with non-mixed numeric inputs) (:issue:`56004`) - Bug in creating a :class:`Index`, :class:`Series`, or :class:`DataFrame` with a non-nanosecond ``datetime64`` dtype and inputs that would be out of bounds for a ``datetime64[ns]`` incorrectly raising ``OutOfBoundsDatetime`` (:issue:`55756`) -- Bug in the results of :func:`pd.to_datetime` with an floating-dtype argument with ``unit`` not matching the pointwise results of :class:`Timestamp` (:issue:`??`) +- Bug in the results of :func:`pd.to_datetime` with an floating-dtype argument with ``unit`` not matching the pointwise results of :class:`Timestamp` (:issue:`56037`) - Timedelta From 3192cfded90364b93aae10e0b5f656e8581a6959 Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 20 Nov 2023 13:59:22 -0800 Subject: [PATCH 4/6] update for CoW --- pandas/tests/io/sas/test_sas7bdat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/io/sas/test_sas7bdat.py b/pandas/tests/io/sas/test_sas7bdat.py index 58a6fa43dded5..0ce428cef9520 100644 --- a/pandas/tests/io/sas/test_sas7bdat.py +++ b/pandas/tests/io/sas/test_sas7bdat.py @@ -198,7 +198,7 @@ def test_date_time(datapath): res.iloc[0] -= pd.Timedelta(microseconds=1) res.iloc[-1] += pd.Timedelta(microseconds=1) - df.iloc[:, 3] = res + df["DateTimeHi"] = res tm.assert_frame_equal(df, df0) From f77107944e2d2d3e3b484835252518e598b4741d Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 21 Nov 2023 12:36:33 -0800 Subject: [PATCH 5/6] xfail float32 case --- pandas/tests/tools/test_to_timedelta.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/pandas/tests/tools/test_to_timedelta.py b/pandas/tests/tools/test_to_timedelta.py index c4c9b41c218a0..8b5b09092ec18 100644 --- a/pandas/tests/tools/test_to_timedelta.py +++ b/pandas/tests/tools/test_to_timedelta.py @@ -232,9 +232,18 @@ def test_to_timedelta_on_missing_values_list(self, val): actual = to_timedelta([val]) assert actual[0]._value == np.timedelta64("NaT").astype("int64") - def test_to_timedelta_float(self): + @pytest.mark.parametrize( + "dtype", + [ + pytest.param( + np.float32, marks=pytest.mark.xfail(reason="Floating point error") + ), + np.float64, + ], + ) + def test_to_timedelta_float(self, dtype): # https://github.com/pandas-dev/pandas/issues/25077 - arr = np.arange(0, 1, 1e-6)[-10:] + arr = np.arange(0, 1, 1e-6)[-10:].astype(dtype) result = to_timedelta(arr, unit="s") expected_asi8 = np.arange(999990000, 10**9, 1000, dtype="int64") tm.assert_numpy_array_equal(result.asi8, expected_asi8) From 47efe18209a859850c70288a64747a21031a9117 Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 21 Nov 2023 13:23:10 -0800 Subject: [PATCH 6/6] xfail on3 2bit --- pandas/tests/tools/test_to_timedelta.py | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/pandas/tests/tools/test_to_timedelta.py b/pandas/tests/tools/test_to_timedelta.py index 8b5b09092ec18..e588bc83b0de8 100644 --- a/pandas/tests/tools/test_to_timedelta.py +++ b/pandas/tests/tools/test_to_timedelta.py @@ -6,6 +6,7 @@ import numpy as np import pytest +from pandas.compat import IS64 from pandas.errors import OutOfBoundsTimedelta import pandas as pd @@ -232,18 +233,10 @@ def test_to_timedelta_on_missing_values_list(self, val): actual = to_timedelta([val]) assert actual[0]._value == np.timedelta64("NaT").astype("int64") - @pytest.mark.parametrize( - "dtype", - [ - pytest.param( - np.float32, marks=pytest.mark.xfail(reason="Floating point error") - ), - np.float64, - ], - ) - def test_to_timedelta_float(self, dtype): + @pytest.mark.xfail(not IS64, reason="Floating point error") + def test_to_timedelta_float(self): # https://github.com/pandas-dev/pandas/issues/25077 - arr = np.arange(0, 1, 1e-6)[-10:].astype(dtype) + arr = np.arange(0, 1, 1e-6)[-10:] result = to_timedelta(arr, unit="s") expected_asi8 = np.arange(999990000, 10**9, 1000, dtype="int64") tm.assert_numpy_array_equal(result.asi8, expected_asi8)