diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index 6390fbeed8548..a4dc369702f97 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -385,6 +385,8 @@ Timedelta - Bug in :class:`Timedelta` with Numpy timedelta64 objects not properly raising ``ValueError`` (:issue:`52806`) - Bug in :meth:`Timedelta.round` with values close to the implementation bounds returning incorrect results instead of raising ``OutOfBoundsTimedelta`` (:issue:`51494`) - Bug in :meth:`arrays.TimedeltaArray.map` and :meth:`TimedeltaIndex.map`, where the supplied callable operated array-wise instead of element-wise (:issue:`51977`) +- Bug in constructing a :class:`Timedelta` with a valid ISO 8601 duration string raising an error (:issue:`51882`) +- Bug in constructing a :class:`Timedelta` with an invalid ISO 8601 duration string not raising an error and returning unexpected values (:issue:`51882`) - Timezones diff --git a/pandas/_libs/tslibs/timedeltas.pyx b/pandas/_libs/tslibs/timedeltas.pyx index 0981c966c4cd4..f61ef27420f40 100644 --- a/pandas/_libs/tslibs/timedeltas.pyx +++ b/pandas/_libs/tslibs/timedeltas.pyx @@ -1,4 +1,5 @@ import collections +import re import warnings cimport cython @@ -691,19 +692,25 @@ cdef timedelta_from_spec(object number, object frac, object unit): unit : a list of unit characters """ cdef: - str n + object ts + int64_t result unit = "".join(unit) if unit in ["M", "Y", "y"]: raise ValueError( - "Units 'M', 'Y' and 'y' do not represent unambiguous timedelta " - "values and are not supported." + "Units 'M', 'Y' and 'y' do not represent unambiguous timedelta values and " + "are not supported." ) - unit = parse_timedelta_unit(unit) - n = "".join(number) + "." + "".join(frac) - return cast_from_unit(float(n), unit) + ts = float("".join(number) + "." + "".join(frac)) + result = cast_from_unit(ts, unit) + + # Check that ts can be represented by an integer number of nanoseconds + if result == 0 and ts > 0: + raise ValueError("Resolution too high, maximum resolution is 1ns") + + return result cpdef inline str parse_timedelta_unit(str unit): @@ -841,93 +848,71 @@ cdef int64_t parse_iso_format_string(str ts) except? -1: """ cdef: - unicode c int64_t result = 0, r - int p = 0, sign = 1 - object dec_unit = "ms", err_msg - bint have_dot = 0, have_value = 0, neg = 0 - list number = [], unit = [] - - err_msg = f"Invalid ISO 8601 Duration format - {ts}" + int sign = 1 + object sign_part, integer_part, fractional_part + list groups = [], units = [] if ts[0] == "-": sign = -1 - ts = ts[1:] - - for c in ts: - # number (ascii codes) - if 48 <= ord(c) <= 57: - - have_value = 1 - if have_dot: - if p == 3 and dec_unit != "ns": - unit.append(dec_unit) - if dec_unit == "ms": - dec_unit = "us" - elif dec_unit == "us": - dec_unit = "ns" - p = 0 - p += 1 - - if not len(unit): - number.append(c) - else: - r = timedelta_from_spec(number, "0", unit) - result += timedelta_as_neg(r, neg) - - neg = 0 - unit, number = [], [c] - else: - if c == "P" or c == "T": - pass # ignore marking characters P and T - elif c == "-": - if neg or have_value: - raise ValueError(err_msg) - else: - neg = 1 - elif c == "+": - pass - elif c in ["W", "D", "H", "M"]: - if c in ["H", "M"] and len(number) > 2: - raise ValueError(err_msg) - if c == "M": - c = "min" - unit.append(c) - r = timedelta_from_spec(number, "0", unit) - result += timedelta_as_neg(r, neg) - neg = 0 - unit, number = [], [] - elif c == ".": - # append any seconds - if len(number): - r = timedelta_from_spec(number, "0", "S") - result += timedelta_as_neg(r, neg) - unit, number = [], [] - have_dot = 1 - elif c == "S": - if have_dot: # ms, us, or ns - if not len(number) or p > 3: - raise ValueError(err_msg) - # pad to 3 digits as required - pad = 3 - p - while pad > 0: - number.append("0") - pad -= 1 - - r = timedelta_from_spec(number, "0", dec_unit) - result += timedelta_as_neg(r, neg) - else: # seconds - r = timedelta_from_spec(number, "0", "S") - result += timedelta_as_neg(r, neg) - else: - raise ValueError(err_msg) - - if not have_value: - # Received string only - never parsed any values - raise ValueError(err_msg) - - return sign*result + year_component = r""" + (?: + (-?) # Capture sign + (\d+) # Capture integer part + (?: + [.,] # Decimal separator + (\d+) # Capture fractional part + (?=Y$) # But only if it is the final component + )? + Y + ) + """ + month_component = r"(?:(-?)(\d+)(?:[.,](\d+)(?=M$))?M)" + week_component = r"(?:(-?)(\d+)(?:[.,](\d+)(?=W$))?W)" + day_component = r"(?:(-?)(\d+)(?:[.,](\d+)(?=D$))?D)" + hour_component = r"(?:(-?)(\d+)(?:[.,](\d+)(?=H$))?H)" + minute_component = r"(?:(-?)(\d+)(?:[.,](\d+)(?=M$))?M)" + second_component = r"(?:(-?)(\d+)(?:[.,](\d+)(?=S$))?S)" + + iso_regex = re.compile(fr""" + ^ # Start of string + -? # Optional negative sign prefix + P # P symbol + (?!$) # Ensure at least 1 component follows P by disallowing ending on P + {year_component}? + {month_component}? + {week_component}? + {day_component}? + (?: + T # T symbol + (?!$) # Ensure at least 1 component follows T by disallowing ending on T + {hour_component}? + {minute_component}? + {second_component}? + )? + $ # End of string + """, re.VERBOSE) + + match = iso_regex.fullmatch(ts) + if match is None: + raise ValueError(f"Invalid ISO 8601 Duration format - {ts}") + groups = list(match.groups()) + + # Numpy time units corresponding to the capture groups of the regex. + units = ["Y", "M", "W", "D", "h", "m", "s"] + + # Convert each component to nanoseconds and sum. + for i, unit in enumerate(units): + j = i * 3 + sign_part, integer_part, fractional_part = groups[j:j+3] + if integer_part is not None: + if fractional_part is None: + fractional_part = "0" + r = timedelta_from_spec(integer_part, fractional_part, unit) + result += timedelta_as_neg(r, sign_part == "-") + + return sign * result cdef _to_py_int_float(v): diff --git a/pandas/tests/scalar/timedelta/test_constructors.py b/pandas/tests/scalar/timedelta/test_constructors.py index 7bd9e5fc5e293..ec8d6bd93871b 100644 --- a/pandas/tests/scalar/timedelta/test_constructors.py +++ b/pandas/tests/scalar/timedelta/test_constructors.py @@ -372,6 +372,7 @@ def test_construction_out_of_bounds_td64s(val, unit): nanoseconds=12, ), ), + ("PT3,5H", Timedelta(hours=3.5)), ("P4DT12H30M5S", Timedelta(days=4, hours=12, minutes=30, seconds=5)), ("P0DT0H0M0.000000123S", Timedelta(nanoseconds=123)), ("P0DT0H0M0.00001S", Timedelta(microseconds=10)), @@ -388,7 +389,7 @@ def test_construction_out_of_bounds_td64s(val, unit): ("P1DT0H0M00000000000S", Timedelta(days=1)), ("PT-6H3M", Timedelta(hours=-6, minutes=3)), ("-PT6H3M", Timedelta(hours=-6, minutes=-3)), - ("-PT-6H+3M", Timedelta(hours=6, minutes=-3)), + ("-PT-6H3M", Timedelta(hours=6, minutes=-3)), ], ) def test_iso_constructor(fmt, exp): @@ -398,16 +399,65 @@ def test_iso_constructor(fmt, exp): @pytest.mark.parametrize( "fmt", [ + "P1Y", + "P1M", + ], +) +def test_iso_constructor_raises_ambiguous_units(fmt): + msg = ( + "Units 'M', 'Y' and 'y' do not represent unambiguous timedelta values and are " + "not supported." + ) + with pytest.raises(ValueError, match=msg): + Timedelta(fmt) + + +@pytest.mark.parametrize( + "fmt", + [ + "P1DT0H0M0.0000000001S", + "PT0.00000000001M", + ], +) +def test_iso_constructor_raises_resolution(fmt): + msg = "Resolution too high, maximum resolution is 1ns" + with pytest.raises(ValueError, match=msg): + Timedelta(fmt) + + +@pytest.mark.parametrize( + "fmt", + [ + # Multiple prefix characters "PPPPPPPPPPPP", - "PDTHMS", - "P0DT999H999M999S", - "P1DT0H0M0.0000000000000S", - "P1DT0H0M0.S", + # Missing any component "P", "-P", + "PT", + "-PT", + # Empty component + "P1DT", + "P1DTHMS", + # Hours or second components before the time separator + "P1H", + "P1S", + # Duplicate time separator + "P1DTT1H", + # Only lowest order component may have be frational + "P1DT2.5H3M", + # Number start with in dot + "P1DT.0H", + # Number ends in dot + "P1DT0H0M0.S", + # Time designator in date half + "P1S", + # Wrong designator order + "PT1S2M", + # Unknown designator + "P1R", ], ) -def test_iso_constructor_raises(fmt): +def test_iso_constructor_raises_invalid_format(fmt): msg = f"Invalid ISO 8601 Duration format - {fmt}" with pytest.raises(ValueError, match=msg): Timedelta(fmt)