Skip to content

BUG: incorrect OutOfBoundsDatetime with non-nano dtype #55756

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Oct 30, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v2.2.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -329,6 +329,7 @@ Datetimelike
- Bug in addition or subtraction of :class:`BusinessDay` offset with ``offset`` attribute to non-nanosecond :class:`Index`, :class:`Series`, or :class:`DataFrame` column giving incorrect results (:issue:`55608`)
- Bug in addition or subtraction of :class:`DateOffset` objects with microsecond components to ``datetime64`` :class:`Index`, :class:`Series`, or :class:`DataFrame` columns with non-nanosecond resolution (:issue:`55595`)
- Bug in addition or subtraction of very large :class:`Tick` objects with :class:`Timestamp` or :class:`Timedelta` objects raising ``OverflowError`` instead of ``OutOfBoundsTimedelta`` (:issue:`55503`)
- Bug in creating an :class:`Index`, :class:`Series`, or :class:`DataFrame` with a non-nanosecond ``datetime64`` dtype and inputs that would be out of bounds for a ``datetime64[ns]`` incorrectly raising ``OutOfBoundsDatetime`` (:issue:`55756`)
-

Timedelta
Expand Down
1 change: 1 addition & 0 deletions pandas/_libs/tslib.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ def array_to_datetime(
dayfirst: bool = ...,
yearfirst: bool = ...,
utc: bool = ...,
creso: int = ...,
) -> tuple[np.ndarray, tzinfo | None]: ...

# returned ndarray may be object dtype or datetime64[ns]
Expand Down
21 changes: 13 additions & 8 deletions pandas/_libs/tslib.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ from pandas._libs.tslibs.conversion cimport (
get_datetime64_nanos,
parse_pydatetime,
)
from pandas._libs.tslibs.dtypes cimport npy_unit_to_abbrev
from pandas._libs.tslibs.nattype cimport (
NPY_NAT,
c_NaT as NaT,
Expand Down Expand Up @@ -277,6 +278,7 @@ def array_with_unit_to_datetime(
result, tz = array_to_datetime(
values.astype(object, copy=False),
errors=errors,
creso=NPY_FR_ns,
)
return result, tz

Expand Down Expand Up @@ -408,6 +410,7 @@ cpdef array_to_datetime(
bint dayfirst=False,
bint yearfirst=False,
bint utc=False,
NPY_DATETIMEUNIT creso=NPY_FR_ns,
):
"""
Converts a 1D array of date-like values to a numpy array of either:
Expand All @@ -434,6 +437,7 @@ cpdef array_to_datetime(
yearfirst parsing behavior when encountering datetime strings
utc : bool, default False
indicator whether the dates should be UTC
creso : NPY_DATETIMEUNIT, default NPY_FR_ns
Resolution of the datetime64 result array; each parsed value is converted to this resolution.

Returns
-------
Expand All @@ -457,13 +461,14 @@ cpdef array_to_datetime(
set out_tzoffset_vals = set()
tzinfo tz_out = None
cnp.flatiter it = cnp.PyArray_IterNew(values)
NPY_DATETIMEUNIT creso = NPY_FR_ns
DatetimeParseState state = DatetimeParseState()
str reso_str

# specify error conditions
assert is_raise or is_ignore or is_coerce

result = np.empty((<object>values).shape, dtype="M8[ns]")
reso_str = npy_unit_to_abbrev(creso)
result = np.empty((<object>values).shape, dtype=f"M8[{reso_str}]")
iresult = result.view("i8").ravel()

for i in range(n):
Expand All @@ -480,11 +485,11 @@ cpdef array_to_datetime(
iresult[i] = parse_pydatetime(val, &dts, creso=creso)

elif PyDate_Check(val):
iresult[i] = pydate_to_dt64(val, &dts)
check_dts_bounds(&dts)
iresult[i] = pydate_to_dt64(val, &dts, reso=creso)
check_dts_bounds(&dts, creso)

elif is_datetime64_object(val):
iresult[i] = get_datetime64_nanos(val, NPY_FR_ns)
iresult[i] = get_datetime64_nanos(val, creso)

elif is_integer_object(val) or is_float_object(val):
# these must be ns unit by-definition
Expand All @@ -493,23 +498,23 @@ cpdef array_to_datetime(
iresult[i] = NPY_NAT
else:
# we now need to parse this as if unit='ns'
iresult[i] = cast_from_unit(val, "ns")
iresult[i] = cast_from_unit(val, "ns", out_reso=creso)

elif isinstance(val, str):
# string
if type(val) is not str:
# GH#32264 np.str_ object
val = str(val)

if parse_today_now(val, &iresult[i], utc):
if parse_today_now(val, &iresult[i], utc, creso):
# We can't _quite_ dispatch this to convert_str_to_tsobject
# bc there isn't a nice way to pass "utc"
continue

_ts = convert_str_to_tsobject(
val, None, unit="ns", dayfirst=dayfirst, yearfirst=yearfirst
)
_ts.ensure_reso(NPY_FR_ns, val)
_ts.ensure_reso(creso, val)

iresult[i] = _ts.value

Expand Down
4 changes: 3 additions & 1 deletion pandas/_libs/tslibs/conversion.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,9 @@ cdef int64_t get_datetime64_nanos(object val, NPY_DATETIMEUNIT reso) except? -1

cpdef datetime localize_pydatetime(datetime dt, tzinfo tz)
cdef int64_t cast_from_unit(object ts, str unit, NPY_DATETIMEUNIT out_reso=*) except? -1
cpdef (int64_t, int) precision_from_unit(str unit, NPY_DATETIMEUNIT out_reso=*)
cpdef (int64_t, int) precision_from_unit(
NPY_DATETIMEUNIT in_reso, NPY_DATETIMEUNIT out_reso=*
)

cdef maybe_localize_tso(_TSObject obj, tzinfo tz, NPY_DATETIMEUNIT reso)

Expand Down
2 changes: 1 addition & 1 deletion pandas/_libs/tslibs/conversion.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,6 @@ DT64NS_DTYPE: np.dtype
TD64NS_DTYPE: np.dtype

def precision_from_unit(
unit: str,
in_reso: int, # NPY_DATETIMEUNIT
) -> tuple[int, int]: ... # (int64_t, _)
def localize_pydatetime(dt: datetime, tz: tzinfo | None) -> datetime: ...
25 changes: 16 additions & 9 deletions pandas/_libs/tslibs/conversion.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,7 @@ cdef int64_t cast_from_unit(
cdef:
int64_t m
int p
NPY_DATETIMEUNIT in_reso

if unit in ["Y", "M"]:
if is_float_object(ts) and not ts.is_integer():
Expand All @@ -123,7 +124,14 @@ cdef int64_t cast_from_unit(
dt64obj = np.datetime64(ts, unit)
return get_datetime64_nanos(dt64obj, out_reso)

m, p = precision_from_unit(unit, out_reso)
in_reso = abbrev_to_npy_unit(unit)
if out_reso < in_reso and in_reso != NPY_DATETIMEUNIT.NPY_FR_GENERIC:
# We will end up rounding (always *down*), so don't need the fractional
# part of `ts`.
m, _ = precision_from_unit(out_reso, in_reso)
return (<int64_t>ts) // m

m, p = precision_from_unit(in_reso, out_reso)

# cast the unit, multiply base/frac separately
# to avoid precision issues from float -> int
Expand All @@ -146,8 +154,8 @@ cdef int64_t cast_from_unit(
) from err


cpdef inline (int64_t, int) precision_from_unit(
str unit,
cpdef (int64_t, int) precision_from_unit(
NPY_DATETIMEUNIT in_reso,
NPY_DATETIMEUNIT out_reso=NPY_DATETIMEUNIT.NPY_FR_ns,
):
"""
Expand All @@ -163,25 +171,24 @@ cpdef inline (int64_t, int) precision_from_unit(
int64_t m
int64_t multiplier
int p
NPY_DATETIMEUNIT reso = abbrev_to_npy_unit(unit)

if reso == NPY_DATETIMEUNIT.NPY_FR_GENERIC:
reso = NPY_DATETIMEUNIT.NPY_FR_ns
if reso == NPY_DATETIMEUNIT.NPY_FR_Y:
if in_reso == NPY_DATETIMEUNIT.NPY_FR_GENERIC:
in_reso = NPY_DATETIMEUNIT.NPY_FR_ns
if in_reso == NPY_DATETIMEUNIT.NPY_FR_Y:
# each 400 years we have 97 leap years, for an average of 97/400=.2425
# extra days each year. We get 31556952 by writing
# 3600*24*365.2425=31556952
multiplier = periods_per_second(out_reso)
m = multiplier * 31556952
elif reso == NPY_DATETIMEUNIT.NPY_FR_M:
elif in_reso == NPY_DATETIMEUNIT.NPY_FR_M:
# 2629746 comes from dividing the "Y" case by 12.
multiplier = periods_per_second(out_reso)
m = multiplier * 2629746
else:
# Careful: if get_conversion_factor raises, the exception does
# not propagate, instead we get a warning about an ignored exception.
# https://github.com/pandas-dev/pandas/pull/51483#discussion_r1115198951
m = get_conversion_factor(reso, out_reso)
m = get_conversion_factor(in_reso, out_reso)

p = <int>log10(m) # number of digits in 'm' minus 1
return m, p
Expand Down
4 changes: 3 additions & 1 deletion pandas/_libs/tslibs/strptime.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,10 @@ from cpython.datetime cimport (
)
from numpy cimport int64_t

from pandas._libs.tslibs.np_datetime cimport NPY_DATETIMEUNIT

cdef bint parse_today_now(str val, int64_t* iresult, bint utc)

cdef bint parse_today_now(str val, int64_t* iresult, bint utc, NPY_DATETIMEUNIT creso)


cdef class DatetimeParseState:
Expand Down
19 changes: 12 additions & 7 deletions pandas/_libs/tslibs/strptime.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -111,22 +111,27 @@ def _test_format_is_iso(f: str) -> bool:
return format_is_iso(f)


cdef bint parse_today_now(str val, int64_t* iresult, bint utc):
cdef bint parse_today_now(
str val, int64_t* iresult, bint utc, NPY_DATETIMEUNIT creso
):
# We delay this check for as long as possible
# because it catches relatively rare cases
cdef:
_Timestamp ts

# Multiply by 1000 to convert to nanos, since these methods naturally have
# microsecond resolution
if val == "now":
if utc:
iresult[0] = Timestamp.utcnow()._value * 1000
ts = <_Timestamp>Timestamp.utcnow()
iresult[0] = ts._as_creso(creso)._value
else:
# GH#18705 make sure to_datetime("now") matches Timestamp("now")
# Note using Timestamp.now() is faster than Timestamp("now")
iresult[0] = Timestamp.now()._value * 1000
ts = <_Timestamp>Timestamp.now()
iresult[0] = ts._as_creso(creso)._value
return True
elif val == "today":
iresult[0] = Timestamp.today()._value * 1000
ts = <_Timestamp>Timestamp.today()
iresult[0] = ts._as_creso(creso)._value
return True
return False

Expand Down Expand Up @@ -363,7 +368,7 @@ def array_strptime(
check_dts_bounds(&dts)
continue

if parse_today_now(val, &iresult[i], utc):
if parse_today_now(val, &iresult[i], utc, NPY_FR_ns):
continue

# Some ISO formats can't be parsed by string_to_dts
Expand Down
4 changes: 1 addition & 3 deletions pandas/_libs/tslibs/timedeltas.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -303,18 +303,16 @@ cdef object ensure_td64ns(object ts):
cdef:
NPY_DATETIMEUNIT td64_unit
int64_t td64_value, mult
str unitstr

td64_unit = get_datetime64_unit(ts)
if (
td64_unit != NPY_DATETIMEUNIT.NPY_FR_ns
and td64_unit != NPY_DATETIMEUNIT.NPY_FR_GENERIC
):
unitstr = npy_unit_to_abbrev(td64_unit)

td64_value = cnp.get_timedelta64_value(ts)

mult = precision_from_unit(unitstr)[0]
mult = precision_from_unit(td64_unit)[0]
try:
# NB: cython#1381 this cannot be *=
td64_value = td64_value * mult
Expand Down
8 changes: 6 additions & 2 deletions pandas/core/arrays/datetimes.py
Original file line number Diff line number Diff line change
Expand Up @@ -2251,18 +2251,19 @@ def _sequence_to_dt64ns(
dayfirst=dayfirst,
yearfirst=yearfirst,
allow_object=False,
out_unit=out_unit or "ns",
)
copy = False
if tz and inferred_tz:
# two timezones: convert to intended from base UTC repr
assert converted.dtype == "i8"
# GH#42505
# by convention, these are _already_ UTC, e.g
result = converted.view(DT64NS_DTYPE)
result = converted.view(out_dtype)

elif inferred_tz:
tz = inferred_tz
result = converted.view(DT64NS_DTYPE)
result = converted.view(out_dtype)

else:
result, _ = _construct_from_dt64_naive(
Expand Down Expand Up @@ -2360,6 +2361,7 @@ def objects_to_datetime64ns(
utc: bool = False,
errors: DateTimeErrorChoices = "raise",
allow_object: bool = False,
out_unit: str = "ns",
):
"""
Convert data to array of timestamps.
Expand All @@ -2375,6 +2377,7 @@ def objects_to_datetime64ns(
allow_object : bool
Whether to return an object-dtype ndarray instead of raising if the
data contains more than one timezone.
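out_unit : str, default "ns"
Unit abbreviation for the output resolution; converted with ``abbrev_to_npy_unit`` and passed to the parser as ``creso``.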

Returns
-------
Expand All @@ -2399,6 +2402,7 @@ def objects_to_datetime64ns(
utc=utc,
dayfirst=dayfirst,
yearfirst=yearfirst,
creso=abbrev_to_npy_unit(out_unit),
)

if tz_parsed is not None:
Expand Down
3 changes: 2 additions & 1 deletion pandas/core/arrays/timedeltas.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
to_offset,
)
from pandas._libs.tslibs.conversion import precision_from_unit
from pandas._libs.tslibs.dtypes import abbrev_to_npy_unit
from pandas._libs.tslibs.fields import (
get_timedelta_days,
get_timedelta_field,
Expand Down Expand Up @@ -1078,7 +1079,7 @@ def sequence_to_td64ns(
else:
mask = np.isnan(data)
# The next few lines are effectively a vectorized 'cast_from_unit'
m, p = precision_from_unit(unit or "ns")
m, p = precision_from_unit(abbrev_to_npy_unit(unit or "ns"))
with warnings.catch_warnings():
# Suppress RuntimeWarning about All-NaN slice
warnings.filterwarnings(
Expand Down
7 changes: 3 additions & 4 deletions pandas/core/dtypes/astype.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,11 +105,10 @@ def _astype_nansafe(
# then coerce to datetime64[ns] and use DatetimeArray.astype

if lib.is_np_dtype(dtype, "M"):
from pandas import to_datetime
from pandas.core.arrays import DatetimeArray

dti = to_datetime(arr.ravel())
dta = dti._data.reshape(arr.shape)
return dta.astype(dtype, copy=False)._ndarray
dta = DatetimeArray._from_sequence(arr, dtype=dtype)
return dta._ndarray

elif lib.is_np_dtype(dtype, "m"):
from pandas.core.construction import ensure_wrapped_if_datetimelike
Expand Down
3 changes: 2 additions & 1 deletion pandas/core/tools/datetimes.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
timezones as libtimezones,
)
from pandas._libs.tslibs.conversion import precision_from_unit
from pandas._libs.tslibs.dtypes import abbrev_to_npy_unit
from pandas._libs.tslibs.parsing import (
DateParseError,
guess_datetime_format,
Expand Down Expand Up @@ -550,7 +551,7 @@ def _to_datetime_with_unit(arg, unit, name, utc: bool, errors: str) -> Index:
tz_parsed = None

elif arg.dtype.kind == "f":
mult, _ = precision_from_unit(unit)
mult, _ = precision_from_unit(abbrev_to_npy_unit(unit))

mask = np.isnan(arg) | (arg == iNaT)
fvalues = (arg * mult).astype("f8", copy=False)
Expand Down
7 changes: 6 additions & 1 deletion pandas/tests/frame/methods/test_astype.py
Original file line number Diff line number Diff line change
Expand Up @@ -382,7 +382,12 @@ def test_astype_from_object_to_datetime_unit(self, unit):
["2017-01-01", "2017-01-02", "2017-02-03"],
]
df = DataFrame(vals, dtype=object)
with pytest.raises(TypeError, match="Cannot cast"):
msg = (
rf"Unexpected value for 'dtype': 'datetime64\[{unit}\]'. "
r"Must be 'datetime64\[s\]', 'datetime64\[ms\]', 'datetime64\[us\]', "
r"'datetime64\[ns\]' or DatetimeTZDtype"
)
with pytest.raises(ValueError, match=msg):
df.astype(f"M8[{unit}]")

@pytest.mark.parametrize("unit", ["Y", "M", "W", "D", "h", "m"])
Expand Down
Loading