Skip to content

BUG: Incorrect parsing of ISO 8601 durations strings in Timedelta constructor #51928

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

2 changes: 2 additions & 0 deletions doc/source/whatsnew/v2.1.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -385,6 +385,8 @@ Timedelta
- Bug in :class:`Timedelta` with Numpy timedelta64 objects not properly raising ``ValueError`` (:issue:`52806`)
- Bug in :meth:`Timedelta.round` with values close to the implementation bounds returning incorrect results instead of raising ``OutOfBoundsTimedelta`` (:issue:`51494`)
- Bug in :meth:`arrays.TimedeltaArray.map` and :meth:`TimedeltaIndex.map`, where the supplied callable operated array-wise instead of element-wise (:issue:`51977`)
- Bug in constructing a :class:`Timedelta` with a valid ISO 8601 duration string raising an error (:issue:`51882`)
- Bug in constructing a :class:`Timedelta` with an invalid ISO 8601 duration string not raising an error and returning unexpected values (:issue:`51882`)
-

Timezones
Expand Down
161 changes: 73 additions & 88 deletions pandas/_libs/tslibs/timedeltas.pyx
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import collections
import re
import warnings

cimport cython
Expand Down Expand Up @@ -691,19 +692,25 @@ cdef timedelta_from_spec(object number, object frac, object unit):
unit : a list of unit characters
"""
cdef:
str n
object ts
int64_t result

unit = "".join(unit)
if unit in ["M", "Y", "y"]:
raise ValueError(
"Units 'M', 'Y' and 'y' do not represent unambiguous timedelta "
"values and are not supported."
"Units 'M', 'Y' and 'y' do not represent unambiguous timedelta values and "
"are not supported."
)

unit = parse_timedelta_unit(unit)

n = "".join(number) + "." + "".join(frac)
return cast_from_unit(float(n), unit)
ts = float("".join(number) + "." + "".join(frac))
result = cast_from_unit(ts, unit)

# Check that ts can be represented by an integer number of nanoseconds
if result == 0 and ts > 0:
raise ValueError("Resolution too high, maximum resolution is 1ns")

return result


cpdef inline str parse_timedelta_unit(str unit):
Expand Down Expand Up @@ -841,93 +848,71 @@ cdef int64_t parse_iso_format_string(str ts) except? -1:
"""

cdef:
unicode c
int64_t result = 0, r
int p = 0, sign = 1
object dec_unit = "ms", err_msg
bint have_dot = 0, have_value = 0, neg = 0
list number = [], unit = []

err_msg = f"Invalid ISO 8601 Duration format - {ts}"
int sign = 1
object sign_part, integer_part, fractional_part
list groups = [], units = []

if ts[0] == "-":
sign = -1
ts = ts[1:]

for c in ts:
# number (ascii codes)
if 48 <= ord(c) <= 57:

have_value = 1
if have_dot:
if p == 3 and dec_unit != "ns":
unit.append(dec_unit)
if dec_unit == "ms":
dec_unit = "us"
elif dec_unit == "us":
dec_unit = "ns"
p = 0
p += 1

if not len(unit):
number.append(c)
else:
r = timedelta_from_spec(number, "0", unit)
result += timedelta_as_neg(r, neg)

neg = 0
unit, number = [], [c]
else:
if c == "P" or c == "T":
pass # ignore marking characters P and T
elif c == "-":
if neg or have_value:
raise ValueError(err_msg)
else:
neg = 1
elif c == "+":
pass
elif c in ["W", "D", "H", "M"]:
if c in ["H", "M"] and len(number) > 2:
raise ValueError(err_msg)
if c == "M":
c = "min"
unit.append(c)
r = timedelta_from_spec(number, "0", unit)
result += timedelta_as_neg(r, neg)

neg = 0
unit, number = [], []
elif c == ".":
# append any seconds
if len(number):
r = timedelta_from_spec(number, "0", "S")
result += timedelta_as_neg(r, neg)
unit, number = [], []
have_dot = 1
elif c == "S":
if have_dot: # ms, us, or ns
if not len(number) or p > 3:
raise ValueError(err_msg)
# pad to 3 digits as required
pad = 3 - p
while pad > 0:
number.append("0")
pad -= 1

r = timedelta_from_spec(number, "0", dec_unit)
result += timedelta_as_neg(r, neg)
else: # seconds
r = timedelta_from_spec(number, "0", "S")
result += timedelta_as_neg(r, neg)
else:
raise ValueError(err_msg)

if not have_value:
# Received string only - never parsed any values
raise ValueError(err_msg)

return sign*result
year_component = r"""
(?:
(-?) # Capture sign
(\d+) # Capture integer part
(?:
[.,] # Decimal separator
(\d+) # Capture fractional part
(?=Y$) # But only if it is the final component
)?
Y
)
"""
month_component = r"(?:(-?)(\d+)(?:[.,](\d+)(?=M$))?M)"
week_component = r"(?:(-?)(\d+)(?:[.,](\d+)(?=W$))?W)"
day_component = r"(?:(-?)(\d+)(?:[.,](\d+)(?=D$))?D)"
hour_component = r"(?:(-?)(\d+)(?:[.,](\d+)(?=H$))?H)"
minute_component = r"(?:(-?)(\d+)(?:[.,](\d+)(?=M$))?M)"
second_component = r"(?:(-?)(\d+)(?:[.,](\d+)(?=S$))?S)"

iso_regex = re.compile(fr"""
^ # Start of string
-? # Optional negative sign prefix
P # P symbol
(?!$) # Ensure at least 1 component follows P by disallowing ending on P
{year_component}?
{month_component}?
{week_component}?
{day_component}?
(?:
T # T symbol
(?!$) # Ensure at least 1 component follows T by disallowing ending on T
{hour_component}?
{minute_component}?
{second_component}?
)?
$ # End of string
""", re.VERBOSE)

match = iso_regex.fullmatch(ts)
if match is None:
raise ValueError(f"Invalid ISO 8601 Duration format - {ts}")
groups = list(match.groups())

# Numpy time units corresponding to the capture groups of the regex.
units = ["Y", "M", "W", "D", "h", "m", "s"]

# Convert each component to nanoseconds and sum.
for i, unit in enumerate(units):
j = i * 3
sign_part, integer_part, fractional_part = groups[j:j+3]
if integer_part is not None:
if fractional_part is None:
fractional_part = "0"
r = timedelta_from_spec(integer_part, fractional_part, unit)
result += timedelta_as_neg(r, sign_part == "-")

return sign * result


cdef _to_py_int_float(v):
Expand Down
62 changes: 56 additions & 6 deletions pandas/tests/scalar/timedelta/test_constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -372,6 +372,7 @@ def test_construction_out_of_bounds_td64s(val, unit):
nanoseconds=12,
),
),
("PT3,5H", Timedelta(hours=3.5)),
("P4DT12H30M5S", Timedelta(days=4, hours=12, minutes=30, seconds=5)),
("P0DT0H0M0.000000123S", Timedelta(nanoseconds=123)),
("P0DT0H0M0.00001S", Timedelta(microseconds=10)),
Expand All @@ -388,7 +389,7 @@ def test_construction_out_of_bounds_td64s(val, unit):
("P1DT0H0M00000000000S", Timedelta(days=1)),
("PT-6H3M", Timedelta(hours=-6, minutes=3)),
("-PT6H3M", Timedelta(hours=-6, minutes=-3)),
("-PT-6H+3M", Timedelta(hours=6, minutes=-3)),
("-PT-6H3M", Timedelta(hours=6, minutes=-3)),
],
)
def test_iso_constructor(fmt, exp):
Expand All @@ -398,16 +399,65 @@ def test_iso_constructor(fmt, exp):
@pytest.mark.parametrize(
"fmt",
[
"P1Y",
"P1M",
],
)
def test_iso_constructor_raises_ambiguous_units(fmt):
msg = (
"Units 'M', 'Y' and 'y' do not represent unambiguous timedelta values and are "
"not supported."
)
with pytest.raises(ValueError, match=msg):
Timedelta(fmt)


@pytest.mark.parametrize(
"fmt",
[
"P1DT0H0M0.0000000001S",
"PT0.00000000001M",
],
)
def test_iso_constructor_raises_resolution(fmt):
msg = "Resolution too high, maximum resolution is 1ns"
with pytest.raises(ValueError, match=msg):
Timedelta(fmt)


@pytest.mark.parametrize(
"fmt",
[
# Multiple prefix characters
"PPPPPPPPPPPP",
"PDTHMS",
"P0DT999H999M999S",
"P1DT0H0M0.0000000000000S",
"P1DT0H0M0.S",
# Missing any component
"P",
"-P",
"PT",
"-PT",
# Empty component
"P1DT",
"P1DTHMS",
# Hours or second components before the time separator
"P1H",
"P1S",
# Duplicate time separator
"P1DTT1H",
# Only lowest order component may have be frational
"P1DT2.5H3M",
# Number start with in dot
"P1DT.0H",
# Number ends in dot
"P1DT0H0M0.S",
# Time designator in date half
"P1S",
# Wrong designator order
"PT1S2M",
# Unknown designator
"P1R",
],
)
def test_iso_constructor_raises(fmt):
def test_iso_constructor_raises_invalid_format(fmt):
msg = f"Invalid ISO 8601 Duration format - {fmt}"
with pytest.raises(ValueError, match=msg):
Timedelta(fmt)
Expand Down