Skip to content

Commit ae4fe98

Browse files
WillAyd authored and jreback committed
Hand written ISO parser for Timedelta construction (pandas-dev#19191)
1 parent bcce140 commit ae4fe98

File tree

1 file changed

+92
-31
lines changed

1 file changed

+92
-31
lines changed

pandas/_libs/tslibs/timedeltas.pyx

+92-31
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
# -*- coding: utf-8 -*-
22
# cython: profile=False
33
import collections
4-
import re
54

65
import sys
76
cdef bint PY3 = (sys.version_info[0] >= 3)
@@ -236,6 +235,14 @@ cpdef inline int64_t cast_from_unit(object ts, object unit) except? -1:
236235
return <int64_t> (base *m) + <int64_t> (frac *m)
237236

238237

238+
cdef inline _decode_if_necessary(object ts):
    """
    Decode ``ts`` as UTF-8 when running on Python 2 with non-unicode input.

    Parameters
    ----------
    ts:
        Value to normalise before it is parsed

    Returns
    -------
    ``ts`` unchanged when running on Python 3 or when ``PyUnicode_Check(ts)``
    is true; otherwise ``str(ts)`` decoded as UTF-8
    """
    # Guard clause: nothing to decode on Python 3 or for unicode input
    if PY3 or PyUnicode_Check(ts):
        return ts

    return str(ts).decode('utf-8')
244+
245+
239246
cdef inline parse_timedelta_string(object ts):
240247
"""
241248
Parse a regular format timedelta string. Return an int64_t (in ns)
@@ -258,9 +265,7 @@ cdef inline parse_timedelta_string(object ts):
258265
if len(ts) == 0 or ts in nat_strings:
259266
return NPY_NAT
260267

261-
# decode ts if necessary
262-
if not PyUnicode_Check(ts) and not PY3:
263-
ts = str(ts).decode('utf-8')
268+
ts = _decode_if_necessary(ts)
264269

265270
for c in ts:
266271

@@ -507,26 +512,14 @@ def _binary_op_method_timedeltalike(op, name):
507512
# ----------------------------------------------------------------------
508513
# Timedelta Construction
509514

510-
iso_pater = re.compile(r"""P
511-
(?P<days>-?[0-9]*)DT
512-
(?P<hours>[0-9]{1,2})H
513-
(?P<minutes>[0-9]{1,2})M
514-
(?P<seconds>[0-9]{0,2})
515-
(\.
516-
(?P<milliseconds>[0-9]{1,3})
517-
(?P<microseconds>[0-9]{0,3})
518-
(?P<nanoseconds>[0-9]{0,3})
519-
)?S""", re.VERBOSE)
520-
521-
522-
cdef int64_t parse_iso_format_string(object iso_fmt) except? -1:
515+
cdef inline int64_t parse_iso_format_string(object ts) except? -1:
523516
"""
524517
Parse an ISO 8601 duration string one character at a time, adding the
525518
value of each component (days, hours, minutes, seconds) to the result
526519
527520
Parameters
528521
----------
529-
iso_fmt:
522+
ts:
530523
ISO 8601 Duration formatted string
531524
532525
Returns
@@ -537,25 +530,93 @@ cdef int64_t parse_iso_format_string(object iso_fmt) except? -1:
537530
Raises
538531
------
539532
ValueError
540-
If ``iso_fmt`` cannot be parsed
533+
If ``ts`` cannot be parsed
541534
"""
542535

543-
cdef int64_t ns = 0
536+
cdef:
537+
unicode c
538+
int64_t result = 0, r
539+
int p=0
540+
object dec_unit = 'ms', err_msg
541+
bint have_dot=0, have_value=0, neg=0
542+
list number=[], unit=[]
544543

545-
match = re.match(iso_pater, iso_fmt)
546-
if match:
547-
match_dict = match.groupdict(default='0')
548-
for comp in ['milliseconds', 'microseconds', 'nanoseconds']:
549-
match_dict[comp] = '{:0<3}'.format(match_dict[comp])
544+
ts = _decode_if_necessary(ts)
550545

551-
for k, v in match_dict.items():
552-
ns += timedelta_from_spec(v, '0', k)
546+
err_msg = "Invalid ISO 8601 Duration format - {}".format(ts)
553547

554-
else:
555-
raise ValueError("Invalid ISO 8601 Duration format - "
556-
"{}".format(iso_fmt))
548+
for c in ts:
549+
# number (ascii codes): ord('0') == 48 and ord('9') == 57
550+
if ord(c) >= 48 and ord(c) <= 57:
551+
552+
have_value = 1
553+
if have_dot:
554+
# p counts the digits seen in the current 3-digit fractional group;
# once a group is full, queue its unit so the branch below converts
# the group, then move on to the next finer unit (ms -> us -> ns)
if p == 3 and dec_unit != 'ns':
555+
unit.append(dec_unit)
556+
if dec_unit == 'ms':
557+
dec_unit = 'us'
558+
elif dec_unit == 'us':
559+
dec_unit = 'ns'
560+
p = 0
561+
p += 1
562+
563+
if not len(unit):
564+
number.append(c)
565+
else:
566+
# if in days, pop trailing T
567+
if unit[-1] == 'T':
568+
unit.pop()
569+
elif 'H' in unit or 'M' in unit:
570+
# hours and minutes accept at most two digits
if len(number) > 2:
571+
raise ValueError(err_msg)
572+
# a unit was seen since the last digits: convert the pending number,
# then start collecting a new number beginning with this digit
r = timedelta_from_spec(number, '0', unit)
573+
result += timedelta_as_neg(r, neg)
557574

558-
return ns
575+
neg = 0
576+
unit, number = [], [c]
577+
else:
578+
if c == 'P':
579+
pass # ignore leading character
580+
elif c == '-':
581+
# a sign is accepted only once and only before any digit
if neg or have_value:
582+
raise ValueError(err_msg)
583+
else:
584+
neg = 1
585+
elif c in ['D', 'T', 'H', 'M']:
586+
unit.append(c)
587+
elif c == '.':
588+
# append any seconds collected before the decimal point
589+
if len(number):
590+
r = timedelta_from_spec(number, '0', 'S')
591+
result += timedelta_as_neg(r, neg)
592+
unit, number = [], []
593+
have_dot = 1
594+
elif c == 'S':
595+
if have_dot: # ms, us, or ns
596+
# p only exceeds 3 when more than 9 fractional digits were given
if not len(number) or p > 3:
597+
raise ValueError(err_msg)
598+
# pad to 3 digits as required
599+
pad = 3 - p
600+
while pad > 0:
601+
number.append('0')
602+
pad -= 1
603+
604+
r = timedelta_from_spec(number, '0', dec_unit)
605+
result += timedelta_as_neg(r, neg)
606+
else: # seconds
607+
if len(number) <= 2:
608+
r = timedelta_from_spec(number, '0', 'S')
609+
result += timedelta_as_neg(r, neg)
610+
else:
611+
raise ValueError(err_msg)
612+
else:
613+
raise ValueError(err_msg)
614+
615+
if not have_value:
616+
# Received string only - never parsed any values
617+
raise ValueError(err_msg)
618+
619+
return result
559620

560621

561622
cdef _to_py_int_float(v):

0 commit comments

Comments
 (0)