Skip to content

Simplify Timedelta init, standardize overflow errors #47004

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 16 commits into from
Closed
Show file tree
Hide file tree
Changes from 13 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
257 changes: 127 additions & 130 deletions pandas/_libs/tslibs/timedeltas.pyx
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import collections
import operator
import warnings

cimport cython
Expand Down Expand Up @@ -55,6 +56,7 @@ from pandas._libs.tslibs.np_datetime cimport (
pandas_timedelta_to_timedeltastruct,
pandas_timedeltastruct,
)
from pandas._libs.util cimport INT64_MAX

from pandas._libs.tslibs.np_datetime import OutOfBoundsTimedelta

Expand Down Expand Up @@ -217,12 +219,11 @@ cpdef int64_t delta_to_nanoseconds(delta) except? -1:
+ delta.microseconds
) * 1000
except OverflowError as err:
raise OutOfBoundsTimedelta(*err.args) from err

msg = f"{delta} outside allowed range [{NPY_NAT + 1}ns, {INT64_MAX}ns]"
raise OutOfBoundsTimedelta(msg) from err
raise TypeError(type(delta))


@cython.overflowcheck(True)
cdef object ensure_td64ns(object ts):
"""
Overflow-safe implementation of td64.astype("m8[ns]")
Expand All @@ -241,24 +242,20 @@ cdef object ensure_td64ns(object ts):
str unitstr

td64_unit = get_datetime64_unit(ts)
if (
td64_unit != NPY_DATETIMEUNIT.NPY_FR_ns
and td64_unit != NPY_DATETIMEUNIT.NPY_FR_GENERIC
):
unitstr = npy_unit_to_abbrev(td64_unit)
if td64_unit == NPY_DATETIMEUNIT.NPY_FR_ns or td64_unit == NPY_DATETIMEUNIT.NPY_FR_GENERIC:
return ts

td64_value = get_timedelta64_value(ts)
unitstr = npy_unit_to_abbrev(td64_unit)
mult = precision_from_unit(unitstr)[0]

mult = precision_from_unit(unitstr)[0]
with cython.overflowcheck(True):
try:
# NB: cython#1381 this cannot be *=
td64_value = td64_value * mult
td64_value = get_timedelta64_value(ts) * mult
except OverflowError as err:
raise OutOfBoundsTimedelta(ts) from err
msg = f"{str(ts)} outside allowed range [{NPY_NAT + 1}ns, {INT64_MAX}ns]"
raise OutOfBoundsTimedelta(msg) from err

return np.timedelta64(td64_value, "ns")

return ts
return np.timedelta64(td64_value, "ns")


cdef convert_to_timedelta64(object ts, str unit):
Expand Down Expand Up @@ -674,8 +671,7 @@ cdef bint _validate_ops_compat(other):

def _op_unary_method(func, name):
def f(self):
new_value = func(self.value)
return _timedelta_from_value_and_reso(new_value, self._reso)
return create_timedelta(func(self.value), "ignore", self._reso)
f.__name__ = name
return f

Expand Down Expand Up @@ -724,13 +720,7 @@ def _binary_op_method_timedeltalike(op, name):
if self._reso != other._reso:
raise NotImplementedError

res = op(self.value, other.value)
if res == NPY_NAT:
# e.g. test_implementation_limits
# TODO: more generally could do an overflowcheck in op?
return NaT

return _timedelta_from_value_and_reso(res, reso=self._reso)
return create_timedelta(op(self.value, other.value), "ignore", self._reso)

f.__name__ = name
return f
Expand Down Expand Up @@ -861,7 +851,7 @@ cdef _to_py_int_float(v):


def _timedelta_unpickle(value, reso):
return _timedelta_from_value_and_reso(value, reso)
return create_timedelta(value, "ignore", reso)


cdef _timedelta_from_value_and_reso(int64_t value, NPY_DATETIMEUNIT reso):
Expand Down Expand Up @@ -892,6 +882,68 @@ cdef _timedelta_from_value_and_reso(int64_t value, NPY_DATETIMEUNIT reso):
return td_base


@cython.overflowcheck(True)
cdef object create_timedelta(object value, str in_unit, NPY_DATETIMEUNIT out_reso):
"""
Timedelta factory.

Overflow-safe, and allows for the creation of Timedeltas with non-nano resos while
the public API for that gets hashed out (ref: GH#46587). For now, Timedelta.__new__
just does arg validation and kwarg processing.

_timedelta_from_value_and_reso faster if value already an int that can be safely
cast to an int64.

Parameters
----------
value : Timedelta, timedelta, np.timedelta64, str, int, float
The same value types accepted by Timedelta.__new__
in_unit : str
Denote the (np) unit of the input, if it's numeric
out_reso: NPY_DATETIMEUNIT
Desired resolution of new Timedelta

Notes
-----
Pass in_unit="ignore" (or "ns") with a numeric value to just do overflow checking
(and bypass the prior behavior of converting value -> td64[ns] -> int)
"""
cdef:
int64_t out_value

if isinstance(value, _Timedelta):
return value

try:
# if unit == "ns", no need to create an m8[ns] just to read the (same) value back
# if unit == "ignore", assume caller wants to invoke an overflow-safe version of
# _timedelta_from_value_and_reso, and that any float rounding is acceptable
if (is_integer_object(value) or is_float_object(value)) and (
in_unit == "ns" or in_unit == "ignore"
):
if util.is_nan(value):
return NaT
out_value = <int64_t>value
# is_timedelta_64_object may not give correct results w/ some versions? see e.g.
# github.com/pandas-dev/pandas/runs/6397652653?check_suite_focus=true#step:11:435
elif isinstance(value, np.timedelta64):
out_value = ensure_td64ns(value).view(np.int64)
elif isinstance(value, str):
if value.startswith(("P", "-P")):
out_value = parse_iso_format_string(value)
else:
out_value = parse_timedelta_string(value)
else:
out_value = convert_to_timedelta64(value, in_unit).view(np.int64)
except OverflowError as err:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

any way for less of this logic to be within the try/except?

Copy link
Author

@patrickmckenna patrickmckenna May 24, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Agree, it's not ideal.

Initially, I tried to leave as much conditional processing as possible in convert_to_timedelta64, i.e. no if/else here, but that led to considerable slowdowns in many benchmarks IIRC. So that would be a simpler, albeit slower, option.

A similar option would be to let all the dispatching on arg type happen automatically, and add overflow checking as needed within each variant, e.g.

@functools.singledispatch
def value_to_int(value):
    ...

@value_to_int.register(str)
def _(value: str):
    ...

def create_timedelta(value):
    ...
    try:
        out_value = value_to_int(value)
    catch OverflowError:
        raise OutOfBoundsTimedelta(msg) from err
    ...

I'm not sure if cython plays well with singledispatch? This approach would also expand the scope of this PR a lot.

Definitely open to other ideas though!

msg = f"{value} outside allowed range [{NPY_NAT + 1}ns, {INT64_MAX}ns]"
raise OutOfBoundsTimedelta(msg) from err

if out_value == NPY_NAT:
return NaT
return _timedelta_from_value_and_reso(out_value, out_reso)


# Similar to Timestamp/datetime, this is a construction requirement for
# timedeltas that we need to do object instantiation in python. This will
# serve as a C extension type that shadows the Python class, where we do any
Expand Down Expand Up @@ -1375,7 +1427,7 @@ cdef class _Timedelta(timedelta):
@classmethod
def _from_value_and_reso(cls, int64_t value, NPY_DATETIMEUNIT reso):
# exposing as classmethod for testing
return _timedelta_from_value_and_reso(value, reso)
return create_timedelta(value, "ignore", reso)


# Python front end to C extension type _Timedelta
Expand Down Expand Up @@ -1438,99 +1490,49 @@ class Timedelta(_Timedelta):
We see that either way we get the same result
"""

_req_any_kwargs_new = {"weeks", "days", "hours", "minutes", "seconds",
"milliseconds", "microseconds", "nanoseconds"}
_allowed_kwargs = (
"weeks", "days", "hours", "minutes", "seconds", "milliseconds", "microseconds",
"nanoseconds"
)

def __new__(cls, object value=_no_input, unit=None, **kwargs):
cdef _Timedelta td_base
cdef:
NPY_DATETIMEUNIT out_reso = NPY_FR_ns

# process kwargs iff no value passed
if value is _no_input:
if not len(kwargs):
raise ValueError("cannot construct a Timedelta without a "
"value/unit or descriptive keywords "
"(days,seconds....)")

kwargs = {key: _to_py_int_float(kwargs[key]) for key in kwargs}

unsupported_kwargs = set(kwargs)
unsupported_kwargs.difference_update(cls._req_any_kwargs_new)
if unsupported_kwargs or not cls._req_any_kwargs_new.intersection(kwargs):
if not kwargs:
raise ValueError(
"cannot construct a Timedelta without a value/unit "
"or descriptive keywords (days,seconds....)"
)
if not kwargs.keys() <= set(cls._allowed_kwargs):
raise ValueError(
"cannot construct a Timedelta from the passed arguments, "
"allowed keywords are "
"[weeks, days, hours, minutes, seconds, "
"milliseconds, microseconds, nanoseconds]"
f"allowed keywords are {cls._allowed_kwargs}"
)

# GH43764, convert any input to nanoseconds first and then
# create the timestamp. This ensures that any potential
# nanosecond contributions from kwargs parsed as floats
# are taken into consideration.
seconds = int((
(
(kwargs.get('days', 0) + kwargs.get('weeks', 0) * 7) * 24
+ kwargs.get('hours', 0)
) * 3600
+ kwargs.get('minutes', 0) * 60
+ kwargs.get('seconds', 0)
) * 1_000_000_000
)

value = np.timedelta64(
int(kwargs.get('nanoseconds', 0))
+ int(kwargs.get('microseconds', 0) * 1_000)
+ int(kwargs.get('milliseconds', 0) * 1_000_000)
+ seconds
)

if unit in {'Y', 'y', 'M'}:
# GH43764, convert any input to nanoseconds first, to ensure any potential
# nanosecond contributions from kwargs parsed as floats are included
ns = sum((
_to_py_int_float(kwargs.get("weeks", 0)) * 7 * 24 * 3600 * 1_000_000_000,
_to_py_int_float(kwargs.get("days", 0)) * 24 * 3600 * 1_000_000_000,
_to_py_int_float(kwargs.get("hours", 0)) * 3600 * 1_000_000_000,
_to_py_int_float(kwargs.get("minutes", 0)) * 60 * 1_000_000_000,
_to_py_int_float(kwargs.get("seconds", 0)) * 1_000_000_000,
_to_py_int_float(kwargs.get("milliseconds", 0)) * 1_000_000,
_to_py_int_float(kwargs.get("microseconds", 0)) * 1_000,
_to_py_int_float(kwargs.get("nanoseconds", 0)),
))
return create_timedelta(ns, "ns", out_reso)

if isinstance(value, str) and unit is not None:
raise ValueError("unit must not be specified if the value is a str")
elif unit in {"Y", "y", "M"}:
raise ValueError(
"Units 'M', 'Y', and 'y' are no longer supported, as they do not "
"represent unambiguous timedelta values durations."
)

# GH 30543 if pd.Timedelta already passed, return it
# check that only value is passed
if isinstance(value, _Timedelta) and unit is None and len(kwargs) == 0:
return value
elif isinstance(value, _Timedelta):
value = value.value
elif isinstance(value, str):
if unit is not None:
raise ValueError("unit must not be specified if the value is a str")
if (len(value) > 0 and value[0] == 'P') or (
len(value) > 1 and value[:2] == '-P'
):
value = parse_iso_format_string(value)
else:
value = parse_timedelta_string(value)
value = np.timedelta64(value)
elif PyDelta_Check(value):
value = convert_to_timedelta64(value, 'ns')
elif is_timedelta64_object(value):
value = ensure_td64ns(value)
elif is_tick_object(value):
value = np.timedelta64(value.nanos, 'ns')
elif is_integer_object(value) or is_float_object(value):
# unit=None is de-facto 'ns'
unit = parse_timedelta_unit(unit)
value = convert_to_timedelta64(value, unit)
elif checknull_with_nat(value):
return NaT
else:
raise ValueError(
"Value must be Timedelta, string, integer, "
f"float, timedelta or convertible, not {type(value).__name__}"
)

if is_timedelta64_object(value):
value = value.view('i8')

# nat
if value == NPY_NAT:
return NaT

return _timedelta_from_value_and_reso(value, NPY_FR_ns)
return create_timedelta(value, parse_timedelta_unit(unit), out_reso)

def __setstate__(self, state):
if len(state) == 1:
Expand Down Expand Up @@ -1607,30 +1609,25 @@ class Timedelta(_Timedelta):
# Arithmetic Methods
# TODO: Can some of these be defined in the cython class?

__neg__ = _op_unary_method(lambda x: -x, '__neg__')
__pos__ = _op_unary_method(lambda x: x, '__pos__')
__abs__ = _op_unary_method(lambda x: abs(x), '__abs__')
__neg__ = _op_unary_method(operator.neg, "__neg__")
__pos__ = _op_unary_method(operator.pos, "__pos__")
__abs__ = _op_unary_method(operator.abs, "__abs__")

__add__ = _binary_op_method_timedeltalike(lambda x, y: x + y, '__add__')
__radd__ = _binary_op_method_timedeltalike(lambda x, y: x + y, '__radd__')
__sub__ = _binary_op_method_timedeltalike(lambda x, y: x - y, '__sub__')
__rsub__ = _binary_op_method_timedeltalike(lambda x, y: y - x, '__rsub__')
__add__ = _binary_op_method_timedeltalike(operator.add, "__add__")
__radd__ = _binary_op_method_timedeltalike(operator.add, "__radd__")
__sub__ = _binary_op_method_timedeltalike(operator.sub, "__sub__")
__rsub__ = _binary_op_method_timedeltalike(lambda x, y: y - x, "__rsub__")

def __mul__(self, other):
if is_integer_object(other) or is_float_object(other):
if util.is_nan(other):
# np.nan * timedelta -> np.timedelta64("NaT"), in this case NaT
return NaT

return _timedelta_from_value_and_reso(
<int64_t>(other * self.value),
reso=self._reso,
)

elif is_array(other):
if util.is_nan(other):
# np.nan * timedelta -> np.timedelta64("NaT"), in this case NaT
return NaT
if is_array(other):
# ndarray-like
return other * self.to_timedelta64()

if is_integer_object(other) or is_float_object(other):
# can't call Timedelta b/c it doesn't (yet) expose reso
return create_timedelta(self.value * other, "ignore", self._reso)
return NotImplemented

__rmul__ = __mul__
Expand Down Expand Up @@ -1825,6 +1822,6 @@ cdef _broadcast_floordiv_td64(


# resolution in ns
Timedelta.min = Timedelta(np.iinfo(np.int64).min + 1)
Timedelta.max = Timedelta(np.iinfo(np.int64).max)
Timedelta.min = Timedelta(NPY_NAT + 1)
Timedelta.max = Timedelta(INT64_MAX)
Timedelta.resolution = Timedelta(nanoseconds=1)
Loading