diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 9be54e95bf36f..2c7ef6ff30e77 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -370,6 +370,7 @@ Datetimelike - Bug in creating a :class:`Index`, :class:`Series`, or :class:`DataFrame` with a non-nanosecond ``datetime64`` dtype and inputs that would be out of bounds for a ``datetime64[ns]`` incorrectly raising ``OutOfBoundsDatetime`` (:issue:`55756`) - Bug in parsing datetime strings with nanosecond resolution with non-ISO8601 formats incorrectly truncating sub-microsecond components (:issue:`56051`) - Bug in parsing datetime strings with sub-second resolution and trailing zeros incorrectly inferring second or millisecond resolution (:issue:`55737`) +- Bug in the results of :func:`pd.to_datetime` with an floating-dtype argument with ``unit`` not matching the pointwise results of :class:`Timestamp` (:issue:`56037`) - Timedelta diff --git a/pandas/_libs/tslibs/conversion.pxd b/pandas/_libs/tslibs/conversion.pxd index 7ffd630d6d8e1..6b9f41b1bb06f 100644 --- a/pandas/_libs/tslibs/conversion.pxd +++ b/pandas/_libs/tslibs/conversion.pxd @@ -45,7 +45,7 @@ cdef int64_t get_datetime64_nanos(object val, NPY_DATETIMEUNIT reso) except? -1 cpdef datetime localize_pydatetime(datetime dt, tzinfo tz) cdef int64_t cast_from_unit(object ts, str unit, NPY_DATETIMEUNIT out_reso=*) except? -1 -cpdef (int64_t, int) precision_from_unit( +cdef (int64_t, int) precision_from_unit( NPY_DATETIMEUNIT in_reso, NPY_DATETIMEUNIT out_reso=* ) diff --git a/pandas/_libs/tslibs/conversion.pyi b/pandas/_libs/tslibs/conversion.pyi index a5f5d04efeb76..cfe39fe2964cb 100644 --- a/pandas/_libs/tslibs/conversion.pyi +++ b/pandas/_libs/tslibs/conversion.pyi @@ -8,8 +8,5 @@ import numpy as np DT64NS_DTYPE: np.dtype TD64NS_DTYPE: np.dtype -def precision_from_unit( - in_reso: int, - out_reso: int = ..., -) -> tuple[int, int]: ... # (int64_t, _) def localize_pydatetime(dt: datetime, tz: tzinfo | None) -> datetime: ... +def cast_from_unit_vectorized(values: np.ndarray, unit: str) -> np.ndarray: ... diff --git a/pandas/_libs/tslibs/conversion.pyx b/pandas/_libs/tslibs/conversion.pyx index 222ff2cde0ede..5ad9a648c52a2 100644 --- a/pandas/_libs/tslibs/conversion.pyx +++ b/pandas/_libs/tslibs/conversion.pyx @@ -1,8 +1,11 @@ +cimport cython + import numpy as np cimport numpy as cnp from libc.math cimport log10 from numpy cimport ( + float64_t, int32_t, int64_t, ) @@ -37,6 +40,7 @@ from pandas._libs.tslibs.np_datetime cimport ( NPY_DATETIMEUNIT, NPY_FR_ns, NPY_FR_us, + astype_overflowsafe, check_dts_bounds, convert_reso, dts_to_iso_string, @@ -74,6 +78,7 @@ from pandas._libs.tslibs.tzconversion cimport ( from pandas._libs.tslibs.util cimport ( is_float_object, is_integer_object, + is_nan, ) # ---------------------------------------------------------------------- @@ -86,6 +91,78 @@ TD64NS_DTYPE = np.dtype("m8[ns]") # ---------------------------------------------------------------------- # Unit Conversion Helpers +@cython.boundscheck(False) +@cython.wraparound(False) +@cython.overflowcheck(True) +def cast_from_unit_vectorized( + ndarray values, + str unit, +): + """ + Vectorized analogue to cast_from_unit. + """ + cdef: + int64_t m + int p + NPY_DATETIMEUNIT in_reso, out_reso + Py_ssize_t i + + assert values.dtype.kind == "f" + + if unit in "YM": + if not (((values % 1) == 0) | np.isnan(values)).all(): + # GH#47267 it is clear that 2 "M" corresponds to 1970-02-01, + # but not clear what 2.5 "M" corresponds to, so we will + # disallow that case. + raise ValueError( + f"Conversion of non-round float with unit={unit} " + "is ambiguous" + ) + + # GH#47266 go through np.datetime64 to avoid weird results e.g. with "Y" + # and 150 we'd get 2120-01-01 09:00:00 + values = values.astype(f"M8[{unit}]") + dtype = np.dtype("M8[ns]") + return astype_overflowsafe(values, dtype=dtype, copy=False).view("i8") + + in_reso = abbrev_to_npy_unit(unit) + out_reso = abbrev_to_npy_unit("ns") + m, p = precision_from_unit(in_reso, out_reso) + + cdef: + ndarray[int64_t] base, out + ndarray[float64_t] frac + tuple shape = (values).shape + + out = np.empty(shape, dtype="i8") + base = np.empty(shape, dtype="i8") + frac = np.empty(shape, dtype="f8") + + for i in range(len(values)): + if is_nan(values[i]): + base[i] = NPY_NAT + else: + base[i] = values[i] + frac[i] = values[i] - base[i] + + if p: + frac = np.round(frac, p) + + try: + for i in range(len(values)): + if base[i] == NPY_NAT: + out[i] = NPY_NAT + else: + out[i] = (base[i] * m) + (frac[i] * m) + except (OverflowError, FloatingPointError) as err: + # FloatingPointError can be issued if we have float dtype and have + # set np.errstate(over="raise") + raise OutOfBoundsDatetime( + f"cannot convert input {values[i]} with the unit '{unit}'" + ) from err + return out + + cdef int64_t cast_from_unit( object ts, str unit, @@ -155,7 +232,7 @@ cdef int64_t cast_from_unit( ) from err -cpdef (int64_t, int) precision_from_unit( +cdef (int64_t, int) precision_from_unit( NPY_DATETIMEUNIT in_reso, NPY_DATETIMEUNIT out_reso=NPY_DATETIMEUNIT.NPY_FR_ns, ): diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index 226e2568fdbf8..f55d3de8878ad 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -6,7 +6,6 @@ TYPE_CHECKING, cast, ) -import warnings import numpy as np @@ -27,8 +26,7 @@ npy_unit_to_abbrev, periods_per_second, ) -from pandas._libs.tslibs.conversion import precision_from_unit -from pandas._libs.tslibs.dtypes import abbrev_to_npy_unit +from pandas._libs.tslibs.conversion import cast_from_unit_vectorized from pandas._libs.tslibs.fields import ( get_timedelta_days, get_timedelta_field, @@ -1059,23 +1057,10 @@ def sequence_to_td64ns( data = data._data else: mask = np.isnan(data) - # The next few lines are effectively a vectorized 'cast_from_unit' - m, p = precision_from_unit(abbrev_to_npy_unit(unit or "ns")) - with warnings.catch_warnings(): - # Suppress RuntimeWarning about All-NaN slice - warnings.filterwarnings( - "ignore", "invalid value encountered in cast", RuntimeWarning - ) - base = data.astype(np.int64) - frac = data - base - if p: - frac = np.round(frac, p) - with warnings.catch_warnings(): - warnings.filterwarnings( - "ignore", "invalid value encountered in cast", RuntimeWarning - ) - data = (base * m + (frac * m).astype(np.int64)).view("timedelta64[ns]") + + data = cast_from_unit_vectorized(data, unit or "ns") data[mask] = iNaT + data = data.view("m8[ns]") copy = False elif lib.is_np_dtype(data.dtype, "m"): diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index ea5e6e46f58ec..863fb414a75f2 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -26,12 +26,10 @@ Timestamp, astype_overflowsafe, get_unit_from_dtype, - iNaT, is_supported_unit, timezones as libtimezones, ) -from pandas._libs.tslibs.conversion import precision_from_unit -from pandas._libs.tslibs.dtypes import abbrev_to_npy_unit +from pandas._libs.tslibs.conversion import cast_from_unit_vectorized from pandas._libs.tslibs.parsing import ( DateParseError, guess_datetime_format, @@ -551,23 +549,19 @@ def _to_datetime_with_unit(arg, unit, name, utc: bool, errors: str) -> Index: tz_parsed = None elif arg.dtype.kind == "f": - mult, _ = precision_from_unit(abbrev_to_npy_unit(unit)) - - mask = np.isnan(arg) | (arg == iNaT) - fvalues = (arg * mult).astype("f8", copy=False) - fvalues[mask] = 0 - - if (fvalues < Timestamp.min._value).any() or ( - fvalues > Timestamp.max._value - ).any(): - if errors != "raise": - arg = arg.astype(object) - return _to_datetime_with_unit(arg, unit, name, utc, errors) - raise OutOfBoundsDatetime(f"cannot convert input with unit '{unit}'") - - arr = fvalues.astype("M8[ns]", copy=False) - arr[mask] = np.datetime64("NaT", "ns") - + with np.errstate(over="raise"): + try: + arr = cast_from_unit_vectorized(arg, unit=unit) + except OutOfBoundsDatetime: + if errors != "raise": + return _to_datetime_with_unit( + arg.astype(object), unit, name, utc, errors + ) + raise OutOfBoundsDatetime( + f"cannot convert input with unit '{unit}'" + ) + + arr = arr.view("M8[ns]") tz_parsed = None else: arg = arg.astype(object, copy=False) diff --git a/pandas/tests/io/sas/test_sas7bdat.py b/pandas/tests/io/sas/test_sas7bdat.py index d56139d32b1da..0ce428cef9520 100644 --- a/pandas/tests/io/sas/test_sas7bdat.py +++ b/pandas/tests/io/sas/test_sas7bdat.py @@ -187,7 +187,18 @@ def test_date_time(datapath): fname, parse_dates=["Date1", "Date2", "DateTime", "DateTimeHi", "Taiw"] ) # GH 19732: Timestamps imported from sas will incur floating point errors - df[df.columns[3]] = df.iloc[:, 3].dt.round("us") + # 2023-11-16 we don't know the correct "expected" result bc we do not have + # access to SAS to read the sas7bdat file. We are really just testing + # that we are "close". This only seems to be an issue near the + # implementation bounds. + res = df.iloc[:, 3].dt.round("us").copy() + + # the first and last elements are near the implementation bounds, where we + # would expect floating point error to occur. + res.iloc[0] -= pd.Timedelta(microseconds=1) + res.iloc[-1] += pd.Timedelta(microseconds=1) + + df["DateTimeHi"] = res tm.assert_frame_equal(df, df0) diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index ef76d99260764..b7db5545a9e26 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -1864,16 +1864,14 @@ def test_to_datetime_month_or_year_unit_int(self, cache, unit, item, request): result = to_datetime(np.array([item], dtype=object), unit=unit, cache=cache) tm.assert_index_equal(result, expected) - # TODO: this should also work - if isinstance(item, float): - request.applymarker( - pytest.mark.xfail( - reason=f"{type(item).__name__} in np.array should work" - ) - ) result = to_datetime(np.array([item]), unit=unit, cache=cache) tm.assert_index_equal(result, expected) + # with a nan! + result = to_datetime(np.array([item, np.nan]), unit=unit, cache=cache) + assert result.isna()[1] + tm.assert_index_equal(result[:1], expected) + @pytest.mark.parametrize("unit", ["Y", "M"]) def test_to_datetime_month_or_year_unit_non_round_float(self, cache, unit): # GH#50301 @@ -1883,6 +1881,8 @@ def test_to_datetime_month_or_year_unit_non_round_float(self, cache, unit): msg = f"Conversion of non-round float with unit={unit} is ambiguous" with pytest.raises(ValueError, match=msg): to_datetime([1.5], unit=unit, errors="raise") + with pytest.raises(ValueError, match=msg): + to_datetime(np.array([1.5]), unit=unit, errors="raise") with pytest.raises(ValueError, match=msg): with tm.assert_produces_warning(FutureWarning, match=warn_msg): to_datetime(["1.5"], unit=unit, errors="raise") @@ -2030,10 +2030,14 @@ def test_unit_mixed(self, cache, arr): def test_unit_rounding(self, cache): # GH 14156 & GH 20445: argument will incur floating point errors # but no premature rounding - result = to_datetime(1434743731.8770001, unit="s", cache=cache) - expected = Timestamp("2015-06-19 19:55:31.877000192") + value = 1434743731.8770001 + result = to_datetime(value, unit="s", cache=cache) + expected = Timestamp("2015-06-19 19:55:31.877000093") assert result == expected + alt = Timestamp(value, unit="s") + assert alt == result + def test_unit_ignore_keeps_name(self, cache): # GH 21697 expected = Index([15e9] * 2, name="name") diff --git a/pandas/tests/tools/test_to_timedelta.py b/pandas/tests/tools/test_to_timedelta.py index c4c9b41c218a0..e588bc83b0de8 100644 --- a/pandas/tests/tools/test_to_timedelta.py +++ b/pandas/tests/tools/test_to_timedelta.py @@ -6,6 +6,7 @@ import numpy as np import pytest +from pandas.compat import IS64 from pandas.errors import OutOfBoundsTimedelta import pandas as pd @@ -232,6 +233,7 @@ def test_to_timedelta_on_missing_values_list(self, val): actual = to_timedelta([val]) assert actual[0]._value == np.timedelta64("NaT").astype("int64") + @pytest.mark.xfail(not IS64, reason="Floating point error") def test_to_timedelta_float(self): # https://github.com/pandas-dev/pandas/issues/25077 arr = np.arange(0, 1, 1e-6)[-10:]