From 67001cf12b7c6336e29bd6010f9461e083d4a79f Mon Sep 17 00:00:00 2001 From: Ross Titmarsh Date: Mon, 13 Mar 2023 12:19:37 +0000 Subject: [PATCH 1/5] Fixed ISO 8601 duration parser. --- pandas/_libs/tslibs/timedeltas.pyx | 115 ++++++------------ .../scalar/timedelta/test_constructors.py | 51 +++++++- 2 files changed, 79 insertions(+), 87 deletions(-) diff --git a/pandas/_libs/tslibs/timedeltas.pyx b/pandas/_libs/tslibs/timedeltas.pyx index 62cc07fabb747..07bac11f31215 100644 --- a/pandas/_libs/tslibs/timedeltas.pyx +++ b/pandas/_libs/tslibs/timedeltas.pyx @@ -1,4 +1,5 @@ import collections +import re import warnings cimport cython @@ -836,93 +837,45 @@ cdef int64_t parse_iso_format_string(str ts) except? -1: """ cdef: - unicode c int64_t result = 0, r - int p = 0, sign = 1 - object dec_unit = "ms", err_msg - bint have_dot = 0, have_value = 0, neg = 0 - list number = [], unit = [] - - err_msg = f"Invalid ISO 8601 Duration format - {ts}" - - if ts[0] == "-": - sign = -1 - ts = ts[1:] - - for c in ts: - # number (ascii codes) - if 48 <= ord(c) <= 57: + int sign = 1 + object sign_part, integer_part, fractional_part + list groups = [], units = [] + + iso_regex = re.compile( + r"^(-?)P(?!$)(?:" + r"(?:(-?)(\d+)(?:[.,](\d{1,9})(?=Y$))?Y)?" + r"(?:(-?)(\d+)(?:[.,](\d{1,9})(?=M$))?M)?" + r"(?:(-?)(\d+)(?:[.,](\d{1,9})(?=W$))?W)?" + r"(?:(-?)(\d+)(?:[.,](\d{1,9})(?=D$))?D)?" + r"(?:T(?!$)" + r"(?:(-?)(\d+)(?:[.,](\d{1,9})(?=H$))?H)?" + r"(?:(-?)(\d+)(?:[.,](\d{1,9})(?=M$))?M)?" + r"(?:(-?)(\d+)(?:[.,](\d{1,9})(?=S$))?S)?" + r")?)$" + ) - have_value = 1 - if have_dot: - if p == 3 and dec_unit != "ns": - unit.append(dec_unit) - if dec_unit == "ms": - dec_unit = "us" - elif dec_unit == "us": - dec_unit = "ns" - p = 0 - p += 1 - - if not len(unit): - number.append(c) - else: - r = timedelta_from_spec(number, "0", unit) - result += timedelta_as_neg(r, neg) + # Pandas units corresponding to the order of the groups in the regex + units = ["Y", "M", "W", "D", "h", "m", "s"] - neg = 0 - unit, number = [], [c] - else: - if c == "P" or c == "T": - pass # ignore marking characters P and T - elif c == "-": - if neg or have_value: - raise ValueError(err_msg) - else: - neg = 1 - elif c == "+": - pass - elif c in ["W", "D", "H", "M"]: - if c in ["H", "M"] and len(number) > 2: - raise ValueError(err_msg) - if c == "M": - c = "min" - unit.append(c) - r = timedelta_from_spec(number, "0", unit) - result += timedelta_as_neg(r, neg) + match = iso_regex.fullmatch(ts) + if match is None: + raise ValueError(f"Invalid ISO 8601 Duration format - {ts}") + sign_part, *groups = list(match.groups()) - neg = 0 - unit, number = [], [] - elif c == ".": - # append any seconds - if len(number): - r = timedelta_from_spec(number, "0", "S") - result += timedelta_as_neg(r, neg) - unit, number = [], [] - have_dot = 1 - elif c == "S": - if have_dot: # ms, us, or ns - if not len(number) or p > 3: - raise ValueError(err_msg) - # pad to 3 digits as required - pad = 3 - p - while pad > 0: - number.append("0") - pad -= 1 - - r = timedelta_from_spec(number, "0", dec_unit) - result += timedelta_as_neg(r, neg) - else: # seconds - r = timedelta_from_spec(number, "0", "S") - result += timedelta_as_neg(r, neg) - else: - raise ValueError(err_msg) + if sign_part == "-": + sign = -1 - if not have_value: - # Received string only - never parsed any values - raise ValueError(err_msg) + for i in range(len(units)): + j = i * 3 + sign_part, integer_part, fractional_part = groups[j:j+3] + if integer_part is not None: + if fractional_part is None: + fractional_part = "0" + r = timedelta_from_spec(integer_part, fractional_part, units[i]) + result += timedelta_as_neg(r, sign_part == "-") - return sign*result + return sign * result cdef _to_py_int_float(v): diff --git a/pandas/tests/scalar/timedelta/test_constructors.py b/pandas/tests/scalar/timedelta/test_constructors.py index ad9dd408fbeaf..5233f0a3b3d82 100644 --- a/pandas/tests/scalar/timedelta/test_constructors.py +++ b/pandas/tests/scalar/timedelta/test_constructors.py @@ -374,6 +374,7 @@ def test_construction_out_of_bounds_td64s(val, unit): nanoseconds=12, ), ), + ("PT3,5H", Timedelta(hours=3.5)), ("P4DT12H30M5S", Timedelta(days=4, hours=12, minutes=30, seconds=5)), ("P0DT0H0M0.000000123S", Timedelta(nanoseconds=123)), ("P0DT0H0M0.00001S", Timedelta(microseconds=10)), @@ -390,7 +391,7 @@ def test_construction_out_of_bounds_td64s(val, unit): ("P1DT0H0M00000000000S", Timedelta(days=1)), ("PT-6H3M", Timedelta(hours=-6, minutes=3)), ("-PT6H3M", Timedelta(hours=-6, minutes=-3)), - ("-PT-6H+3M", Timedelta(hours=6, minutes=-3)), + ("-PT-6H3M", Timedelta(hours=6, minutes=-3)), ], ) def test_iso_constructor(fmt, exp): @@ -400,16 +401,54 @@ def test_iso_constructor(fmt, exp): @pytest.mark.parametrize( "fmt", [ + "P1Y", + "P1M", + ], +) +def test_iso_constructor_raises_ambiguous_units(fmt): + msg = ( + "Units 'M', 'Y' and 'y' do not represent unambiguous timedelta values " + "and are not supported." + ) + with pytest.raises(ValueError, match=msg): + Timedelta(fmt) + + +@pytest.mark.parametrize( + "fmt", + [ + # Multiple prefix characters "PPPPPPPPPPPP", - "PDTHMS", - "P0DT999H999M999S", - "P1DT0H0M0.0000000000000S", - "P1DT0H0M0.S", + # Missing any component "P", "-P", + "PT", + "-PT", + # Empty component + "P1DT", + "P1DTHMS", + # Hours or second components before the time separator + "P1H", + "P1S", + # Duplicate time separator + "P1DTT1H", + # Precision too high + "P1DT0H0M0.0000000001S", + # Only lowest order component may have be frational + "P1DT2.5H3M", + # Number start with in dot + "P1DT.0H", + # Number ends in dot + "P1DT0H0M0.S", + # Time designator in date half + "P1S", + # Wrong designator order + "PT1S2M", + # Unknown designator + "P1R", ], ) -def test_iso_constructor_raises(fmt): +def test_iso_constructor_raises_invalid_format(fmt): msg = f"Invalid ISO 8601 Duration format - {fmt}" with pytest.raises(ValueError, match=msg): Timedelta(fmt) From 7b1b0226adedf4a3e2acea05c33c27a8f6bd926d Mon Sep 17 00:00:00 2001 From: Ross Titmarsh Date: Mon, 13 Mar 2023 13:04:55 +0000 Subject: [PATCH 2/5] Added whatsnew entry --- doc/source/whatsnew/v2.1.0.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index 410a324be829e..f8933488952c8 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -140,6 +140,8 @@ Timedelta ^^^^^^^^^ - Bug in :meth:`Timedelta.round` with values close to the implementation bounds returning incorrect results instead of raising ``OutOfBoundsTimedelta`` (:issue:`51494`) - Bug in :class:`TimedeltaIndex` division or multiplication leading to ``.freq`` of "0 Days" instead of ``None`` (:issue:`51575`) +- Bug in constructing a :class:`Timedelta` with a valid ISO 8601 duration string raising an error (:issue:`51882`) +- Bug in constructing a :class:`Timedelta` with an invalid ISO 8601 duration string not raising an error and returning unexpected values (:issue:`51882`) - Timezones From 140016e638307c480b106c154d1d012ca3bf811b Mon Sep 17 00:00:00 2001 From: Ross Titmarsh Date: Mon, 13 Mar 2023 19:22:34 +0000 Subject: [PATCH 3/5] Improved readability and comments. --- pandas/_libs/tslibs/timedeltas.pyx | 37 +++++++++++++++++------------- 1 file changed, 21 insertions(+), 16 deletions(-) diff --git a/pandas/_libs/tslibs/timedeltas.pyx b/pandas/_libs/tslibs/timedeltas.pyx index 07bac11f31215..07d8d740c6aa8 100644 --- a/pandas/_libs/tslibs/timedeltas.pyx +++ b/pandas/_libs/tslibs/timedeltas.pyx @@ -842,37 +842,42 @@ cdef int64_t parse_iso_format_string(str ts) except? -1: object sign_part, integer_part, fractional_part list groups = [], units = [] + if ts[0] == "-": + sign = -1 + iso_regex = re.compile( - r"^(-?)P(?!$)(?:" - r"(?:(-?)(\d+)(?:[.,](\d{1,9})(?=Y$))?Y)?" - r"(?:(-?)(\d+)(?:[.,](\d{1,9})(?=M$))?M)?" - r"(?:(-?)(\d+)(?:[.,](\d{1,9})(?=W$))?W)?" - r"(?:(-?)(\d+)(?:[.,](\d{1,9})(?=D$))?D)?" + # Allow starting with 'P' or '-P', disallow ending on 'P'. + r"^-?P(?!$)(?:" + # Capture the sign, integer part, and fractional part (but only if it + # is the final component). + r"(?:(-?)(\d+)(?:[.,](\d{1,9})(?=Y$))?Y)?" # Years + r"(?:(-?)(\d+)(?:[.,](\d{1,9})(?=M$))?M)?" # Months + r"(?:(-?)(\d+)(?:[.,](\d{1,9})(?=W$))?W)?" # Weeks + r"(?:(-?)(\d+)(?:[.,](\d{1,9})(?=D$))?D)?" # Days + # Disallow ending on 'T'. r"(?:T(?!$)" - r"(?:(-?)(\d+)(?:[.,](\d{1,9})(?=H$))?H)?" - r"(?:(-?)(\d+)(?:[.,](\d{1,9})(?=M$))?M)?" - r"(?:(-?)(\d+)(?:[.,](\d{1,9})(?=S$))?S)?" + r"(?:(-?)(\d+)(?:[.,](\d{1,9})(?=H$))?H)?" # Hours + r"(?:(-?)(\d+)(?:[.,](\d{1,9})(?=M$))?M)?" # Minutes + r"(?:(-?)(\d+)(?:[.,](\d{1,9})(?=S$))?S)?" # Seconds r")?)$" ) - # Pandas units corresponding to the order of the groups in the regex - units = ["Y", "M", "W", "D", "h", "m", "s"] - match = iso_regex.fullmatch(ts) if match is None: raise ValueError(f"Invalid ISO 8601 Duration format - {ts}") - sign_part, *groups = list(match.groups()) + groups = list(match.groups()) - if sign_part == "-": - sign = -1 + # Numpy time units corresponding to the capture groups of the regex. + units = ["Y", "M", "W", "D", "h", "m", "s"] - for i in range(len(units)): + # Convert each component to nanoseconds and sum. + for i, unit in enumerate(units): j = i * 3 sign_part, integer_part, fractional_part = groups[j:j+3] if integer_part is not None: if fractional_part is None: fractional_part = "0" - r = timedelta_from_spec(integer_part, fractional_part, units[i]) + r = timedelta_from_spec(integer_part, fractional_part, unit) result += timedelta_as_neg(r, sign_part == "-") return sign * result From a57a0e4cdd765f3d5e37a0cca69e2b145ae70137 Mon Sep 17 00:00:00 2001 From: Ross Titmarsh Date: Tue, 14 Mar 2023 14:43:21 +0000 Subject: [PATCH 4/5] Annotated regex. --- pandas/_libs/tslibs/timedeltas.pyx | 53 +++++++++++++++++++++--------- 1 file changed, 37 insertions(+), 16 deletions(-) diff --git a/pandas/_libs/tslibs/timedeltas.pyx b/pandas/_libs/tslibs/timedeltas.pyx index 07d8d740c6aa8..28bdc13dcdfcd 100644 --- a/pandas/_libs/tslibs/timedeltas.pyx +++ b/pandas/_libs/tslibs/timedeltas.pyx @@ -845,22 +845,43 @@ cdef int64_t parse_iso_format_string(str ts) except? -1: if ts[0] == "-": sign = -1 - iso_regex = re.compile( - # Allow starting with 'P' or '-P', disallow ending on 'P'. - r"^-?P(?!$)(?:" - # Capture the sign, integer part, and fractional part (but only if it - # is the final component). - r"(?:(-?)(\d+)(?:[.,](\d{1,9})(?=Y$))?Y)?" # Years - r"(?:(-?)(\d+)(?:[.,](\d{1,9})(?=M$))?M)?" # Months - r"(?:(-?)(\d+)(?:[.,](\d{1,9})(?=W$))?W)?" # Weeks - r"(?:(-?)(\d+)(?:[.,](\d{1,9})(?=D$))?D)?" # Days - # Disallow ending on 'T'. - r"(?:T(?!$)" - r"(?:(-?)(\d+)(?:[.,](\d{1,9})(?=H$))?H)?" # Hours - r"(?:(-?)(\d+)(?:[.,](\d{1,9})(?=M$))?M)?" # Minutes - r"(?:(-?)(\d+)(?:[.,](\d{1,9})(?=S$))?S)?" # Seconds - r")?)$" - ) + year_component = r""" + (?: + (-?) # Capture sign + (\d+) # Capture integer part + (?: + [.,] # Decimal separator + (\d+) # Capture fractional part + (?=Y$) # But only if it is the final component + )? + Y + ) + """ + month_component = r"(?:(-?)(\d+)(?:[.,](\d+)(?=M$))?M)" + week_component = r"(?:(-?)(\d+)(?:[.,](\d+)(?=W$))?W)" + day_component = r"(?:(-?)(\d+)(?:[.,](\d+)(?=D$))?D)" + hour_component = r"(?:(-?)(\d+)(?:[.,](\d+)(?=H$))?H)" + minute_component = r"(?:(-?)(\d+)(?:[.,](\d+)(?=M$))?M)" + second_component = r"(?:(-?)(\d+)(?:[.,](\d+)(?=S$))?S)" + + iso_regex = re.compile(fr""" + ^ # Start of string + -? # Optional negative sign prefix + P # P symbol + (?!$) # Ensure at least 1 component follows P by disallowing ending on P + {year_component}? + {month_component}? + {week_component}? + {day_component}? + (?: + T # T symbol + (?!$) # Ensure at least 1 component follows T by disallowing ending on T + {hour_component}? + {minute_component}? + {second_component}? + )? + $ # End of string + """, re.VERBOSE) match = iso_regex.fullmatch(ts) if match is None: From 03533dfbf761d6ee6cea0463bc39cff60651c6c0 Mon Sep 17 00:00:00 2001 From: Ross Titmarsh Date: Tue, 14 Mar 2023 14:47:36 +0000 Subject: [PATCH 5/5] Added check that timedelta can be represented by an integer number of nanoseconds --- pandas/_libs/tslibs/timedeltas.pyx | 18 ++++++++++++------ .../scalar/timedelta/test_constructors.py | 19 +++++++++++++++---- 2 files changed, 27 insertions(+), 10 deletions(-) diff --git a/pandas/_libs/tslibs/timedeltas.pyx b/pandas/_libs/tslibs/timedeltas.pyx index 28bdc13dcdfcd..b46cd7abbd046 100644 --- a/pandas/_libs/tslibs/timedeltas.pyx +++ b/pandas/_libs/tslibs/timedeltas.pyx @@ -687,19 +687,25 @@ cdef timedelta_from_spec(object number, object frac, object unit): unit : a list of unit characters """ cdef: - str n + object ts + int64_t result unit = "".join(unit) if unit in ["M", "Y", "y"]: raise ValueError( - "Units 'M', 'Y' and 'y' do not represent unambiguous timedelta " - "values and are not supported." + "Units 'M', 'Y' and 'y' do not represent unambiguous timedelta values and " + "are not supported." ) - unit = parse_timedelta_unit(unit) - n = "".join(number) + "." + "".join(frac) - return cast_from_unit(float(n), unit) + ts = float("".join(number) + "." + "".join(frac)) + result = cast_from_unit(ts, unit) + + # Check that ts can be represented by an integer number of nanoseconds + if result == 0 and ts > 0: + raise ValueError("Resolution too high, maximum resolution is 1ns") + + return result cpdef inline str parse_timedelta_unit(str unit): diff --git a/pandas/tests/scalar/timedelta/test_constructors.py b/pandas/tests/scalar/timedelta/test_constructors.py index 5233f0a3b3d82..a0c900fc2d28e 100644 --- a/pandas/tests/scalar/timedelta/test_constructors.py +++ b/pandas/tests/scalar/timedelta/test_constructors.py @@ -407,13 +407,26 @@ def test_iso_constructor(fmt, exp): ) def test_iso_constructor_raises_ambiguous_units(fmt): msg = ( - "Units 'M', 'Y' and 'y' do not represent unambiguous timedelta values " - "and are not supported." + "Units 'M', 'Y' and 'y' do not represent unambiguous timedelta values and are " + "not supported." ) with pytest.raises(ValueError, match=msg): Timedelta(fmt) +@pytest.mark.parametrize( + "fmt", + [ + "P1DT0H0M0.0000000001S", + "PT0.00000000001M", + ], +) +def test_iso_constructor_raises_resolution(fmt): + msg = "Resolution too high, maximum resolution is 1ns" + with pytest.raises(ValueError, match=msg): + Timedelta(fmt) + + @pytest.mark.parametrize( "fmt", [ @@ -432,8 +445,6 @@ def test_iso_constructor_raises_ambiguous_units(fmt): "P1S", # Duplicate time separator "P1DTT1H", - # Precision too high - "P1DT0H0M0.0000000001S", # Only lowest order component may have be frational "P1DT2.5H3M", # Number start with in dot