
BUG: to_datetime with floats and unit not matching Timestamp #56037

Merged: 10 commits, Nov 22, 2023
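
For context, a minimal sketch of the mismatch this change addresses (the value is taken from the updated `test_unit_rounding` test; exact outputs depend on the pandas version):

```python
import numpy as np
import pandas as pd

val = 1434743731.8770001  # float seconds since the epoch

# Pointwise construction via Timestamp.
scalar = pd.Timestamp(val, unit="s")

# Vectorized construction via to_datetime on a float array.
vectorized = pd.to_datetime(np.array([val]), unit="s")[0]

# Before this change the two paths could disagree by a few nanoseconds;
# after it they are expected to match.
print(scalar, vectorized)
```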
1 change: 1 addition & 0 deletions doc/source/whatsnew/v2.2.0.rst
@@ -370,6 +370,7 @@ Datetimelike
- Bug in creating a :class:`Index`, :class:`Series`, or :class:`DataFrame` with a non-nanosecond ``datetime64`` dtype and inputs that would be out of bounds for a ``datetime64[ns]`` incorrectly raising ``OutOfBoundsDatetime`` (:issue:`55756`)
- Bug in parsing datetime strings with nanosecond resolution with non-ISO8601 formats incorrectly truncating sub-microsecond components (:issue:`56051`)
- Bug in parsing datetime strings with sub-second resolution and trailing zeros incorrectly inferring second or millisecond resolution (:issue:`55737`)
- Bug in the results of :func:`pd.to_datetime` with a floating-dtype argument with ``unit`` not matching the pointwise results of :class:`Timestamp` (:issue:`56037`)
-

Timedelta
2 changes: 1 addition & 1 deletion pandas/_libs/tslibs/conversion.pxd
@@ -45,7 +45,7 @@ cdef int64_t get_datetime64_nanos(object val, NPY_DATETIMEUNIT reso) except? -1

cpdef datetime localize_pydatetime(datetime dt, tzinfo tz)
cdef int64_t cast_from_unit(object ts, str unit, NPY_DATETIMEUNIT out_reso=*) except? -1
cpdef (int64_t, int) precision_from_unit(
cdef (int64_t, int) precision_from_unit(
NPY_DATETIMEUNIT in_reso, NPY_DATETIMEUNIT out_reso=*
)

5 changes: 1 addition & 4 deletions pandas/_libs/tslibs/conversion.pyi
@@ -8,8 +8,5 @@ import numpy as np
DT64NS_DTYPE: np.dtype
TD64NS_DTYPE: np.dtype

def precision_from_unit(
in_reso: int,
out_reso: int = ...,
) -> tuple[int, int]: ... # (int64_t, _)
def localize_pydatetime(dt: datetime, tz: tzinfo | None) -> datetime: ...
def cast_from_unit_vectorized(values: np.ndarray, unit: str) -> np.ndarray: ...
79 changes: 78 additions & 1 deletion pandas/_libs/tslibs/conversion.pyx
@@ -1,8 +1,11 @@
cimport cython

import numpy as np

cimport numpy as cnp
from libc.math cimport log10
from numpy cimport (
float64_t,
int32_t,
int64_t,
)
@@ -37,6 +40,7 @@ from pandas._libs.tslibs.np_datetime cimport (
NPY_DATETIMEUNIT,
NPY_FR_ns,
NPY_FR_us,
astype_overflowsafe,
check_dts_bounds,
convert_reso,
dts_to_iso_string,
@@ -74,6 +78,7 @@ from pandas._libs.tslibs.tzconversion cimport (
from pandas._libs.tslibs.util cimport (
is_float_object,
is_integer_object,
is_nan,
)

# ----------------------------------------------------------------------
@@ -86,6 +91,78 @@ TD64NS_DTYPE = np.dtype("m8[ns]")
# ----------------------------------------------------------------------
# Unit Conversion Helpers

@cython.boundscheck(False)
@cython.wraparound(False)
@cython.overflowcheck(True)
def cast_from_unit_vectorized(
ndarray values,
str unit,
):
"""
Vectorized analogue to cast_from_unit.
"""
cdef:
int64_t m
int p
NPY_DATETIMEUNIT in_reso, out_reso
Py_ssize_t i

assert values.dtype.kind == "f"

if unit in "YM":
if not (((values % 1) == 0) | np.isnan(values)).all():
# GH#47267 it is clear that 2 "M" corresponds to 1970-02-01,
# but not clear what 2.5 "M" corresponds to, so we will
# disallow that case.
raise ValueError(
f"Conversion of non-round float with unit={unit} "
"is ambiguous"
)

# GH#47266 go through np.datetime64 to avoid weird results e.g. with "Y"
# and 150 we'd get 2120-01-01 09:00:00
values = values.astype(f"M8[{unit}]")
dtype = np.dtype("M8[ns]")
return astype_overflowsafe(values, dtype=dtype, copy=False).view("i8")

in_reso = abbrev_to_npy_unit(unit)
out_reso = abbrev_to_npy_unit("ns")
m, p = precision_from_unit(in_reso, out_reso)

cdef:
ndarray[int64_t] base, out
ndarray[float64_t] frac
tuple shape = (<object>values).shape

out = np.empty(shape, dtype="i8")
base = np.empty(shape, dtype="i8")
frac = np.empty(shape, dtype="f8")

for i in range(len(values)):
if is_nan(values[i]):
base[i] = NPY_NAT
else:
base[i] = <int64_t>values[i]
frac[i] = values[i] - base[i]

if p:
frac = np.round(frac, p)

try:
for i in range(len(values)):
if base[i] == NPY_NAT:
out[i] = NPY_NAT
else:
out[i] = <int64_t>(base[i] * m) + <int64_t>(frac[i] * m)
except (OverflowError, FloatingPointError) as err:
# FloatingPointError can be issued if we have float dtype and have
# set np.errstate(over="raise")
raise OutOfBoundsDatetime(
f"cannot convert input {values[i]} with the unit '{unit}'"
) from err
return out
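
Outside the ``Y``/``M`` branch, the helper scales the integer and fractional parts of each float separately, mirroring the scalar ``cast_from_unit``. A rough pure-NumPy sketch of that arithmetic (``m`` is the number of output units per input unit and ``p`` the rounding precision, as returned by ``precision_from_unit``; NaN handling is omitted for brevity):

```python
import numpy as np

def cast_from_unit_sketch(values: np.ndarray, m: int, p: int) -> np.ndarray:
    # Split each float into whole units and a fractional remainder.
    base = values.astype("i8")
    frac = values - base
    if p:
        # Round the remainder so tiny representation errors do not leak
        # into the scaled result.
        frac = np.round(frac, p)
    # Scale both parts and recombine as int64 values in the output unit.
    return base * m + (frac * m).astype("i8")

# e.g. seconds -> nanoseconds: m = 1_000_000_000, p = 9 (assumed values)
print(cast_from_unit_sketch(np.array([1.5]), 1_000_000_000, 9))  # [1500000000]
```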


cdef int64_t cast_from_unit(
object ts,
str unit,
@@ -155,7 +232,7 @@ cdef int64_t cast_from_unit(
) from err


cpdef (int64_t, int) precision_from_unit(
cdef (int64_t, int) precision_from_unit(
NPY_DATETIMEUNIT in_reso,
NPY_DATETIMEUNIT out_reso=NPY_DATETIMEUNIT.NPY_FR_ns,
):
23 changes: 4 additions & 19 deletions pandas/core/arrays/timedeltas.py
@@ -6,7 +6,6 @@
TYPE_CHECKING,
cast,
)
import warnings

import numpy as np

Expand All @@ -27,8 +26,7 @@
npy_unit_to_abbrev,
periods_per_second,
)
from pandas._libs.tslibs.conversion import precision_from_unit
from pandas._libs.tslibs.dtypes import abbrev_to_npy_unit
from pandas._libs.tslibs.conversion import cast_from_unit_vectorized
from pandas._libs.tslibs.fields import (
get_timedelta_days,
get_timedelta_field,
@@ -1059,23 +1057,10 @@ def sequence_to_td64ns(
data = data._data
else:
mask = np.isnan(data)
# The next few lines are effectively a vectorized 'cast_from_unit'
m, p = precision_from_unit(abbrev_to_npy_unit(unit or "ns"))
with warnings.catch_warnings():
# Suppress RuntimeWarning about All-NaN slice
warnings.filterwarnings(
"ignore", "invalid value encountered in cast", RuntimeWarning
)
base = data.astype(np.int64)
frac = data - base
if p:
frac = np.round(frac, p)
with warnings.catch_warnings():
warnings.filterwarnings(
"ignore", "invalid value encountered in cast", RuntimeWarning
)
data = (base * m + (frac * m).astype(np.int64)).view("timedelta64[ns]")

data = cast_from_unit_vectorized(data, unit or "ns")
data[mask] = iNaT
data = data.view("m8[ns]")
copy = False

elif lib.is_np_dtype(data.dtype, "m"):
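
With the float branch above now delegating to ``cast_from_unit_vectorized``, the observable behavior of ``to_timedelta`` on float input should be unchanged apart from the improved precision. A small illustrative check (values chosen arbitrarily):

```python
import numpy as np
import pandas as pd

arr = np.array([1.5, np.nan])  # seconds
tdi = pd.to_timedelta(arr, unit="s")

# NaN still maps to NaT, and 1.5 s becomes 1500 ms at nanosecond resolution.
assert tdi[1] is pd.NaT
assert tdi[0] == pd.Timedelta(milliseconds=1500)
```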
34 changes: 14 additions & 20 deletions pandas/core/tools/datetimes.py
@@ -26,12 +26,10 @@
Timestamp,
astype_overflowsafe,
get_unit_from_dtype,
iNaT,
is_supported_unit,
timezones as libtimezones,
)
from pandas._libs.tslibs.conversion import precision_from_unit
from pandas._libs.tslibs.dtypes import abbrev_to_npy_unit
from pandas._libs.tslibs.conversion import cast_from_unit_vectorized
from pandas._libs.tslibs.parsing import (
DateParseError,
guess_datetime_format,
@@ -551,23 +549,19 @@ def _to_datetime_with_unit(arg, unit, name, utc: bool, errors: str) -> Index:
tz_parsed = None

elif arg.dtype.kind == "f":
mult, _ = precision_from_unit(abbrev_to_npy_unit(unit))

mask = np.isnan(arg) | (arg == iNaT)
fvalues = (arg * mult).astype("f8", copy=False)
fvalues[mask] = 0

if (fvalues < Timestamp.min._value).any() or (
fvalues > Timestamp.max._value
).any():
if errors != "raise":
arg = arg.astype(object)
return _to_datetime_with_unit(arg, unit, name, utc, errors)
raise OutOfBoundsDatetime(f"cannot convert input with unit '{unit}'")

arr = fvalues.astype("M8[ns]", copy=False)
arr[mask] = np.datetime64("NaT", "ns")

with np.errstate(over="raise"):
try:
arr = cast_from_unit_vectorized(arg, unit=unit)
except OutOfBoundsDatetime:
if errors != "raise":
return _to_datetime_with_unit(
arg.astype(object), unit, name, utc, errors
)
raise OutOfBoundsDatetime(
f"cannot convert input with unit '{unit}'"
)

arr = arr.view("M8[ns]")
tz_parsed = None
else:
arg = arg.astype(object, copy=False)
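
A short sketch of the behavior the rewritten float branch targets (values are illustrative): the vectorized conversion goes through ``cast_from_unit_vectorized`` so it matches pointwise ``Timestamp`` construction, and out-of-bounds values either raise or fall back through the object path depending on ``errors``.

```python
import numpy as np
import pandas as pd

vals = np.array([1.5, 1434743731.8770001])  # seconds

# Each element of the vectorized result should match the pointwise Timestamp.
result = pd.to_datetime(vals, unit="s")
assert all(result[i] == pd.Timestamp(v, unit="s") for i, v in enumerate(vals))

# Values far outside the datetime64[ns] range raise by default, but with
# errors="coerce" the fallback through the object path yields NaT instead.
out_of_range = np.array([1e20])
print(pd.to_datetime(out_of_range, unit="s", errors="coerce"))  # expect NaT
```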
13 changes: 12 additions & 1 deletion pandas/tests/io/sas/test_sas7bdat.py
@@ -187,7 +187,18 @@ def test_date_time(datapath):
fname, parse_dates=["Date1", "Date2", "DateTime", "DateTimeHi", "Taiw"]
)
# GH 19732: Timestamps imported from sas will incur floating point errors
df[df.columns[3]] = df.iloc[:, 3].dt.round("us")
# 2023-11-16 we don't know the correct "expected" result bc we do not have
# access to SAS to read the sas7bdat file. We are really just testing
# that we are "close". This only seems to be an issue near the
# implementation bounds.
res = df.iloc[:, 3].dt.round("us").copy()

# the first and last elements are near the implementation bounds, where we
# would expect floating point error to occur.
res.iloc[0] -= pd.Timedelta(microseconds=1)
res.iloc[-1] += pd.Timedelta(microseconds=1)

df["DateTimeHi"] = res
tm.assert_frame_equal(df, df0)


22 changes: 13 additions & 9 deletions pandas/tests/tools/test_to_datetime.py
@@ -1864,16 +1864,14 @@ def test_to_datetime_month_or_year_unit_int(self, cache, unit, item, request):
result = to_datetime(np.array([item], dtype=object), unit=unit, cache=cache)
tm.assert_index_equal(result, expected)

# TODO: this should also work
if isinstance(item, float):
request.applymarker(
pytest.mark.xfail(
reason=f"{type(item).__name__} in np.array should work"
)
)
result = to_datetime(np.array([item]), unit=unit, cache=cache)
tm.assert_index_equal(result, expected)

# with a nan!
result = to_datetime(np.array([item, np.nan]), unit=unit, cache=cache)
assert result.isna()[1]
tm.assert_index_equal(result[:1], expected)

@pytest.mark.parametrize("unit", ["Y", "M"])
def test_to_datetime_month_or_year_unit_non_round_float(self, cache, unit):
# GH#50301
@@ -1883,6 +1881,8 @@ def test_to_datetime_month_or_year_unit_non_round_float(self, cache, unit):
msg = f"Conversion of non-round float with unit={unit} is ambiguous"
with pytest.raises(ValueError, match=msg):
to_datetime([1.5], unit=unit, errors="raise")
with pytest.raises(ValueError, match=msg):
to_datetime(np.array([1.5]), unit=unit, errors="raise")
with pytest.raises(ValueError, match=msg):
with tm.assert_produces_warning(FutureWarning, match=warn_msg):
to_datetime(["1.5"], unit=unit, errors="raise")
@@ -2030,10 +2030,14 @@ def test_unit_mixed(self, cache, arr):
def test_unit_rounding(self, cache):
# GH 14156 & GH 20445: argument will incur floating point errors
# but no premature rounding
result = to_datetime(1434743731.8770001, unit="s", cache=cache)
expected = Timestamp("2015-06-19 19:55:31.877000192")
value = 1434743731.8770001
result = to_datetime(value, unit="s", cache=cache)
expected = Timestamp("2015-06-19 19:55:31.877000093")
assert result == expected

alt = Timestamp(value, unit="s")
assert alt == result

def test_unit_ignore_keeps_name(self, cache):
# GH 21697
expected = Index([15e9] * 2, name="name")
2 changes: 2 additions & 0 deletions pandas/tests/tools/test_to_timedelta.py
@@ -6,6 +6,7 @@
import numpy as np
import pytest

from pandas.compat import IS64
from pandas.errors import OutOfBoundsTimedelta

import pandas as pd
@@ -232,6 +233,7 @@ def test_to_timedelta_on_missing_values_list(self, val):
actual = to_timedelta([val])
assert actual[0]._value == np.timedelta64("NaT").astype("int64")

@pytest.mark.xfail(not IS64, reason="Floating point error")
Contributor:

I think this is incorrect: I'm getting an XPASS here.

My guess is that you've hit this on i386, probably due to i387 weirdness. However, this wouldn't affect other 32-bit architectures or i386 builds with -mfpmath=sse. Gentoo is switching towards the latter since i387 is simply causing too many floating-point issues.

Member Author:

A PR to make the xfail condition more precise would be welcome

Contributor:

I know, but I honestly have no clue how to detect the right case. At this point, I can think of only two options: either removing the xfail and ignoring i387-based x86 to support other 32-bit setups, or using an approximate comparison instead.
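
For what it's worth, the second option could look roughly like the following (purely hypothetical, not part of this PR): compare the underlying int64 values within a small tolerance so i387 excess precision cannot trip an exact equality check.

```python
import numpy as np

def assert_td64_allclose(result, expected, atol_ns=1):
    # Hypothetical helper: treat timedelta64[ns] results within a
    # nanosecond of each other as equal.
    np.testing.assert_allclose(
        np.asarray(result).view("i8"),
        np.asarray(expected).view("i8"),
        rtol=0,
        atol=atol_ns,
    )
```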

def test_to_timedelta_float(self):
# https://github.com/pandas-dev/pandas/issues/25077
arr = np.arange(0, 1, 1e-6)[-10:]