Skip to content

Commit af8eb59

Browse files
committed
Merge pull request #10396 from jreback/td
PERF: parse and timedelta ops improvements, #6755
2 parents 2fea54a + bb5ec57 commit af8eb59

File tree

5 files changed

+289
-174
lines changed

5 files changed

+289
-174
lines changed

doc/source/whatsnew/v0.17.0.txt

+3
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,9 @@ Performance Improvements
5757
~~~~~~~~~~~~~~~~~~~~~~~~
5858
- Added vbench benchmarks for alternative ExcelWriter engines and reading Excel files (:issue:`7171`)
5959

60+
- 4x improvement in ``timedelta`` string parsing (:issue:`6755`)
61+
- 8x improvement in ``timedelta64`` and ``datetime64`` ops (:issue:`6755`)
62+
6063
.. _whatsnew_0170.bug_fixes:
6164

6265
Bug Fixes

pandas/tseries/tests/test_timedeltas.py

+4-1
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,9 @@ def test_construction(self):
109109
# currently invalid as it has a - on the hhmmss part (only allowed on the days)
110110
self.assertRaises(ValueError, lambda : Timedelta('-10 days -1 h 1.5m 1s 3us'))
111111

112+
# only leading neg signs are allowed
113+
self.assertRaises(ValueError, lambda : Timedelta('10 days -1 h 1.5m 1s 3us'))
114+
112115
# roundtripping both for string and value
113116
for v in ['1s',
114117
'-1s',
@@ -151,7 +154,7 @@ def test_construction(self):
151154
"cannot construct a TimeDelta",
152155
lambda : Timedelta())
153156
tm.assertRaisesRegexp(ValueError,
154-
"cannot create timedelta string convert",
157+
"unit abbreviation w/o a number",
155158
lambda : Timedelta('foo'))
156159
tm.assertRaisesRegexp(ValueError,
157160
"cannot construct a TimeDelta from the passed arguments, allowed keywords are ",

pandas/tseries/timedeltas.py

+3-160
Original file line numberDiff line numberDiff line change
@@ -34,22 +34,13 @@ def _convert_listlike(arg, box, unit):
3434
if isinstance(arg, (list,tuple)) or ((hasattr(arg,'__iter__') and not hasattr(arg,'dtype'))):
3535
arg = np.array(list(arg), dtype='O')
3636

37+
# these are shortcutable
3738
if is_timedelta64_dtype(arg):
3839
value = arg.astype('timedelta64[ns]')
3940
elif is_integer_dtype(arg):
40-
41-
# these are shortcutable
42-
value = arg.astype('timedelta64[{0}]'.format(unit)).astype('timedelta64[ns]')
41+
value = arg.astype('timedelta64[{0}]'.format(unit)).astype('timedelta64[ns]', copy=False)
4342
else:
44-
try:
45-
value = tslib.array_to_timedelta64(_ensure_object(arg), unit=unit, coerce=coerce)
46-
except:
47-
48-
# try to process strings fast; may need to fallback
49-
try:
50-
value = np.array([ _get_string_converter(r, unit=unit)() for r in arg ],dtype='m8[ns]')
51-
except:
52-
value = np.array([ _coerce_scalar_to_timedelta_type(r, unit=unit, coerce=coerce) for r in arg ])
43+
value = tslib.array_to_timedelta64(_ensure_object(arg), unit=unit, coerce=coerce)
5344
value = value.astype('timedelta64[ns]', copy=False)
5445

5546
if box:
@@ -95,15 +86,6 @@ def _convert_listlike(arg, box, unit):
9586
'NS' : 'ns',
9687
'ns' : 'ns',
9788
}
98-
_unit_scale = {
99-
'd' : 86400*1e9,
100-
'h' : 3600*1e9,
101-
'm' : 60*1e9,
102-
's' : 1e9,
103-
'ms' : 1e6,
104-
'us' : 1e3,
105-
'ns' : 1,
106-
}
10789

10890
def _validate_timedelta_unit(arg):
10991
""" provide validation / translation for timedelta short units """
@@ -114,150 +96,11 @@ def _validate_timedelta_unit(arg):
11496
return 'ns'
11597
raise ValueError("invalid timedelta unit {0} provided".format(arg))
11698

117-
_short_search = re.compile(
118-
"^\s*(?P<neg>-?)\s*(?P<value>\d*\.?\d*)\s*(?P<unit>d|s|ms|us|ns)?\s*$",re.IGNORECASE)
119-
_full_search = re.compile(
120-
"^\s*(?P<neg>-?)\s*(?P<days>\d*?\.?\d*?)?\s*(days|d|day)?,?\s*\+?(?P<time>\d{1,2}:\d{2}:\d{2})?(?P<frac>\.\d+)?\s*$",re.IGNORECASE)
121-
_nat_search = re.compile(
122-
"^\s*(nat|nan)\s*$",re.IGNORECASE)
123-
_whitespace = re.compile('^\s*$')
124-
_number_split = re.compile("^(\d+\.?\d*)")
125-
126-
# construct the full2_search
127-
abbrevs = [('d' ,'days|d|day'),
128-
('h' ,'hours|h|hour'),
129-
('m' ,'minutes|min|minute|m'),
130-
('s' ,'seconds|sec|second|s'),
131-
('ms','milliseconds|milli|millis|millisecond|ms'),
132-
('us','microseconds|micro|micros|microsecond|us'),
133-
('ns','nanoseconds|nano|nanos|nanosecond|ns')]
134-
135-
_full_search2 = re.compile(''.join(
136-
["^\s*(?P<neg>-?)\s*"] + [ "(?P<" + p + ">\\d+\.?\d*\s*(" + ss + "))?\\s*" for p, ss in abbrevs ] + ['$']))
137-
13899
def _coerce_scalar_to_timedelta_type(r, unit='ns', box=True, coerce=False):
139100
""" convert strings to timedelta; coerce to Timedelta (if box), else np.timedelta64"""
140101

141-
if isinstance(r, compat.string_types):
142-
143-
# we are already converting to nanoseconds
144-
converter = _get_string_converter(r, unit=unit)
145-
r = converter()
146-
unit='ns'
147-
148102
result = tslib.convert_to_timedelta(r,unit,coerce)
149103
if box:
150104
result = tslib.Timedelta(result)
151105

152106
return result
153-
154-
def _get_string_converter(r, unit='ns'):
155-
""" return a string converter for r to process the timedelta format """
156-
157-
# treat as a nan
158-
if isnull(r):
159-
def convert(r=None, unit=None):
160-
return tslib.iNaT
161-
return convert
162-
163-
if _whitespace.search(r):
164-
def convert(r=None, unit=None):
165-
return tslib.iNaT
166-
return convert
167-
168-
m = _short_search.search(r)
169-
if m:
170-
def convert(r=None, unit=unit, m=m):
171-
if r is not None:
172-
m = _short_search.search(r)
173-
174-
gd = m.groupdict()
175-
176-
r = float(gd['value'])
177-
u = gd.get('unit')
178-
if u is not None:
179-
unit = u.lower()
180-
result = tslib.cast_from_unit(r, unit)
181-
if gd['neg']:
182-
result *= -1
183-
return result
184-
return convert
185-
186-
m = _full_search.search(r)
187-
if m:
188-
def convert(r=None, unit=None, m=m):
189-
if r is not None:
190-
m = _full_search.search(r)
191-
192-
gd = m.groupdict()
193-
194-
# handle time
195-
value = 0
196-
time = gd['time']
197-
if time:
198-
(hh,mm,ss) = time.split(':')
199-
value += int((float(hh)*3600 + float(mm)*60 + float(ss))*1e9)
200-
201-
# handle frac
202-
frac = gd['frac']
203-
if frac:
204-
value += round(float(frac)*1e9)
205-
206-
# handle days (possibly negative)
207-
is_neg = gd['neg']
208-
if gd['days']:
209-
days = int((float(gd['days'] or 0) * 86400)*1e9)
210-
if is_neg:
211-
days *= -1
212-
value += days
213-
else:
214-
if is_neg:
215-
value *= -1
216-
return tslib.cast_from_unit(value, 'ns')
217-
return convert
218-
219-
# look for combo strings
220-
m = _full_search2.search(r)
221-
if m:
222-
def convert(r=None, unit=None, m=m):
223-
if r is not None:
224-
m = _full_search2.search(r)
225-
226-
gd = m.groupdict()
227-
228-
# the parser
229-
def parse(k, v):
230-
if v is None:
231-
return 0
232-
v = float(_number_split.search(v).group())
233-
return int(v*_unit_scale[k])
234-
235-
# handle non-days
236-
days = gd.pop('days',None)
237-
neg = gd.pop('neg',None)
238-
value = 0
239-
for k, v in gd.items():
240-
value += parse(k,v)
241-
242-
# parse days / neg
243-
if days:
244-
days = parse('days',days)
245-
if neg:
246-
days *= -1
247-
value += days
248-
else:
249-
if neg:
250-
value *= -1
251-
252-
return tslib.cast_from_unit(value, 'ns')
253-
return convert
254-
255-
m = _nat_search.search(r)
256-
if m:
257-
def convert(r=None, unit=None, m=m):
258-
return tslib.iNaT
259-
return convert
260-
261-
# no converter
262-
raise ValueError("cannot create timedelta string converter for [{0}]".format(r))
263-

pandas/tseries/tools.py

+7
Original file line numberDiff line numberDiff line change
@@ -263,6 +263,7 @@ def _convert_listlike(arg, box, format):
263263
if isinstance(arg, (list,tuple)):
264264
arg = np.array(arg, dtype='O')
265265

266+
# these are shortcutable
266267
if com.is_datetime64_ns_dtype(arg):
267268
if box and not isinstance(arg, DatetimeIndex):
268269
try:
@@ -271,6 +272,12 @@ def _convert_listlike(arg, box, format):
271272
pass
272273

273274
return arg
275+
elif format is None and com.is_integer_dtype(arg) and unit=='ns':
276+
result = arg.astype('datetime64[ns]')
277+
if box:
278+
return DatetimeIndex(result, tz='utc' if utc else None)
279+
280+
return result
274281

275282
arg = com._ensure_object(arg)
276283

0 commit comments

Comments
 (0)