Skip to content

Commit ca9bb77

Browse files
committed
Hand written ISO parser for Timedelta construction
1 parent 8acdf80 commit ca9bb77

File tree

1 file changed

+91
-31
lines changed

1 file changed

+91
-31
lines changed

pandas/_libs/tslibs/timedeltas.pyx

+91 -31
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,6 @@
11
# -*- coding: utf-8 -*-
22
# cython: profile=False
33
import collections
4-
import re
54

65
import sys
76
cdef bint PY3 = (sys.version_info[0] >= 3)
@@ -235,6 +234,13 @@ cpdef inline int64_t cast_from_unit(object ts, object unit) except? -1:
235234
frac = round(frac, p)
236235
return <int64_t> (base *m) + <int64_t> (frac *m)
237236

237+
cdef inline _decode_if_necessary(object ts):
238+
# decode ts if necessary
239+
if not PyUnicode_Check(ts) and not PY3:
240+
ts = str(ts).decode('utf-8')
241+
242+
return ts
243+
238244

239245
cdef inline parse_timedelta_string(object ts):
240246
"""
@@ -258,9 +264,7 @@ cdef inline parse_timedelta_string(object ts):
258264
if len(ts) == 0 or ts in nat_strings:
259265
return NPY_NAT
260266

261-
# decode ts if necessary
262-
if not PyUnicode_Check(ts) and not PY3:
263-
ts = str(ts).decode('utf-8')
267+
ts = _decode_if_necessary(ts)
264268

265269
for c in ts:
266270

@@ -507,26 +511,14 @@ def _binary_op_method_timedeltalike(op, name):
507511
# ----------------------------------------------------------------------
508512
# Timedelta Construction
509513

510-
iso_pater = re.compile(r"""P
511-
(?P<days>-?[0-9]*)DT
512-
(?P<hours>[0-9]{1,2})H
513-
(?P<minutes>[0-9]{1,2})M
514-
(?P<seconds>[0-9]{0,2})
515-
(\.
516-
(?P<milliseconds>[0-9]{1,3})
517-
(?P<microseconds>[0-9]{0,3})
518-
(?P<nanoseconds>[0-9]{0,3})
519-
)?S""", re.VERBOSE)
520-
521-
522-
cdef int64_t parse_iso_format_string(object iso_fmt) except? -1:
514+
cdef inline int64_t parse_iso_format_string(object ts) except? -1:
523515
"""
524516
Parses an ISO 8601 duration string one character at a time and sums the
525517
values of its day, hour, minute, second and sub-second components
526518
527519
Parameters
528520
----------
529-
iso_fmt:
521+
ts:
530522
ISO 8601 Duration formatted string
531523
532524
Returns
@@ -537,25 +529,93 @@ cdef int64_t parse_iso_format_string(object iso_fmt) except? -1:
537529
Raises
538530
------
539531
ValueError
540-
If ``iso_fmt`` cannot be parsed
532+
If ``ts`` cannot be parsed
541533
"""
542534

543-
cdef int64_t ns = 0
535+
cdef:
536+
unicode c
537+
int64_t result = 0, r
538+
int p=0
539+
object dec_unit = 'ms', err_msg
540+
bint have_dot=0, have_value=0, neg=0
541+
list number=[], unit=[]
544542

545-
match = re.match(iso_pater, iso_fmt)
546-
if match:
547-
match_dict = match.groupdict(default='0')
548-
for comp in ['milliseconds', 'microseconds', 'nanoseconds']:
549-
match_dict[comp] = '{:0<3}'.format(match_dict[comp])
543+
ts = _decode_if_necessary(ts)
550544

551-
for k, v in match_dict.items():
552-
ns += timedelta_from_spec(v, '0', k)
545+
err_msg = "Invalid ISO 8601 Duration format - {}".format(ts)
553546

554-
else:
555-
raise ValueError("Invalid ISO 8601 Duration format - "
556-
"{}".format(iso_fmt))
547+
for c in ts:
548+
# number (ascii codes)
549+
if ord(c) >= 48 and ord(c) <= 57:
557550

558-
return ns
551+
have_value = 1
552+
if have_dot:
553+
if p == 3 and dec_unit != 'ns':
554+
unit.append(dec_unit)
555+
if dec_unit == 'ms':
556+
dec_unit = 'us'
557+
elif dec_unit == 'us':
558+
dec_unit = 'ns'
559+
p = 0
560+
p += 1
561+
562+
if not len(unit):
563+
number.append(c)
564+
else:
565+
# if in days, pop trailing T
566+
if unit[-1] == 'T':
567+
unit.pop()
568+
elif 'H' in unit or 'M' in unit:
569+
if len(number) > 2:
570+
raise ValueError(err_msg)
571+
r = timedelta_from_spec(number, '0', unit)
572+
result += timedelta_as_neg(r, neg)
573+
574+
neg = 0
575+
unit, number = [], [c]
576+
else:
577+
if c == 'P':
578+
pass # ignore leading character
579+
elif c == '-':
580+
if neg or have_value:
581+
raise ValueError(err_msg)
582+
else:
583+
neg = 1
584+
elif c in ['D', 'T', 'H', 'M']:
585+
unit.append(c)
586+
elif c == '.':
587+
# append any seconds
588+
if len(number):
589+
r = timedelta_from_spec(number, '0', 'S')
590+
result += timedelta_as_neg(r, neg)
591+
unit, number = [], []
592+
have_dot = 1
593+
elif c == 'S':
594+
if have_dot: # ms, us, or ns
595+
if not len(number) or p > 3:
596+
raise ValueError(err_msg)
597+
# pad to 3 digits as required
598+
pad = 3 - p
599+
while pad > 0:
600+
number.append('0')
601+
pad -= 1
602+
603+
r = timedelta_from_spec(number, '0', dec_unit)
604+
result += timedelta_as_neg(r, neg)
605+
else: # seconds
606+
if len(number) <= 2:
607+
r = timedelta_from_spec(number, '0', 'S')
608+
result += timedelta_as_neg(r, neg)
609+
else:
610+
raise ValueError(err_msg)
611+
else:
612+
raise ValueError(err_msg)
613+
614+
if not have_value:
615+
# Received string only - never parsed any values
616+
raise ValueError(err_msg)
617+
618+
return result
559619

560620

561621
cdef _to_py_int_float(v):

0 commit comments

Comments
 (0)