Skip to content

Commit 2267b97

Browse files
mroeschkeNo-Stream
authored andcommitted
PERF: Add cache keyword to to_datetime (pandas-dev#11665) (pandas-dev#17077)
1 parent 7920032 commit 2267b97

File tree

4 files changed

+445
-199
lines changed

4 files changed

+445
-199
lines changed

asv_bench/benchmarks/timeseries.py

+38-3
Original file line numberDiff line numberDiff line change
@@ -346,17 +346,22 @@ class ToDatetime(object):
346346

347347
def setup(self):
348348
self.rng = date_range(start='1/1/2000', periods=10000, freq='D')
349-
self.stringsD = Series((((self.rng.year * 10000) + (self.rng.month * 100)) + self.rng.day), dtype=np.int64).apply(str)
349+
self.stringsD = Series(self.rng.strftime('%Y%m%d'))
350350

351351
self.rng = date_range(start='1/1/2000', periods=20000, freq='H')
352-
self.strings = [x.strftime('%Y-%m-%d %H:%M:%S') for x in self.rng]
353-
self.strings_nosep = [x.strftime('%Y%m%d %H:%M:%S') for x in self.rng]
352+
self.strings = self.rng.strftime('%Y-%m-%d %H:%M:%S').tolist()
353+
self.strings_nosep = self.rng.strftime('%Y%m%d %H:%M:%S').tolist()
354354
self.strings_tz_space = [x.strftime('%Y-%m-%d %H:%M:%S') + ' -0800'
355355
for x in self.rng]
356356

357357
self.s = Series((['19MAY11', '19MAY11:00:00:00'] * 100000))
358358
self.s2 = self.s.str.replace(':\\S+$', '')
359359

360+
self.unique_numeric_seconds = range(10000)
361+
self.dup_numeric_seconds = [1000] * 10000
362+
self.dup_string_dates = ['2000-02-11'] * 10000
363+
self.dup_string_with_tz = ['2000-02-11 15:00:00-0800'] * 10000
364+
360365
def time_format_YYYYMMDD(self):
361366
to_datetime(self.stringsD, format='%Y%m%d')
362367

@@ -381,6 +386,36 @@ def time_format_exact(self):
381386
def time_format_no_exact(self):
382387
to_datetime(self.s, format='%d%b%y', exact=False)
383388

389+
def time_cache_true_with_unique_seconds_and_unit(self):
390+
to_datetime(self.unique_numeric_seconds, unit='s', cache=True)
391+
392+
def time_cache_false_with_unique_seconds_and_unit(self):
393+
to_datetime(self.unique_numeric_seconds, unit='s', cache=False)
394+
395+
def time_cache_true_with_dup_seconds_and_unit(self):
396+
to_datetime(self.dup_numeric_seconds, unit='s', cache=True)
397+
398+
def time_cache_false_with_dup_seconds_and_unit(self):
399+
to_datetime(self.dup_numeric_seconds, unit='s', cache=False)
400+
401+
def time_cache_true_with_dup_string_dates(self):
402+
to_datetime(self.dup_string_dates, cache=True)
403+
404+
def time_cache_false_with_dup_string_dates(self):
405+
to_datetime(self.dup_string_dates, cache=False)
406+
407+
def time_cache_true_with_dup_string_dates_and_format(self):
408+
to_datetime(self.dup_string_dates, format='%Y-%m-%d', cache=True)
409+
410+
def time_cache_false_with_dup_string_dates_and_format(self):
411+
to_datetime(self.dup_string_dates, format='%Y-%m-%d', cache=False)
412+
413+
def time_cache_true_with_dup_string_tzoffset_dates(self):
414+
to_datetime(self.dup_string_with_tz, cache=True)
415+
416+
def time_cache_false_with_dup_string_tzoffset_dates(self):
417+
to_datetime(self.dup_string_with_tz, cache=False)
418+
384419

385420
class Offsets(object):
386421
goal_time = 0.2

doc/source/whatsnew/v0.22.0.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,7 @@ Performance Improvements
7070
~~~~~~~~~~~~~~~~~~~~~~~~
7171

7272
- Indexers on ``Series`` or ``DataFrame`` no longer create a reference cycle (:issue:`17956`)
73-
-
73+
- Added a keyword argument, ``cache``, to :func:`to_datetime` that improved the performance of converting duplicate datetime arguments (:issue:`11665`)
7474
-
7575

7676
.. _whatsnew_0220.docs:

pandas/core/tools/datetimes.py

+92-6
Original file line numberDiff line numberDiff line change
@@ -36,9 +36,77 @@ def _guess_datetime_format_for_array(arr, **kwargs):
3636
return _guess_datetime_format(arr[non_nan_elements[0]], **kwargs)
3737

3838

39+
def _maybe_cache(arg, format, cache, tz, convert_listlike):
40+
"""
41+
Create a cache of unique dates from an array of dates
42+
43+
Parameters
44+
----------
45+
arg : integer, float, string, datetime, list, tuple, 1-d array, Series
46+
format : string
47+
Strftime format to parse time
48+
cache : boolean
49+
True attempts to create a cache of converted values
50+
tz : string
51+
Timezone of the dates
52+
convert_listlike : function
53+
Conversion function to apply on dates
54+
55+
Returns
56+
-------
57+
cache_array : Series
58+
Cache of converted, unique dates. Can be empty
59+
"""
60+
from pandas import Series
61+
cache_array = Series()
62+
if cache:
63+
# Perform a quicker unique check
64+
from pandas import Index
65+
if not Index(arg).is_unique:
66+
unique_dates = algorithms.unique(arg)
67+
cache_dates = convert_listlike(unique_dates, True, format, tz=tz)
68+
cache_array = Series(cache_dates, index=unique_dates)
69+
return cache_array
70+
71+
72+
def _convert_and_box_cache(arg, cache_array, box, errors, name=None):
73+
"""
74+
Convert array of dates with a cache and box the result
75+
76+
Parameters
77+
----------
78+
arg : integer, float, string, datetime, list, tuple, 1-d array, Series
79+
cache_array : Series
80+
Cache of converted, unique dates
81+
box : boolean
82+
True boxes result as an Index-like, False returns an ndarray
83+
errors : string
84+
'ignore' plus box=True will convert result to Index
85+
name : string, default None
86+
Name for a DatetimeIndex
87+
88+
Returns
89+
-------
90+
result : datetime of converted dates
91+
Returns:
92+
93+
- Index-like if box=True
94+
- ndarray if box=False
95+
"""
96+
from pandas import Series, DatetimeIndex, Index
97+
result = Series(arg).map(cache_array)
98+
if box:
99+
if errors == 'ignore':
100+
return Index(result)
101+
else:
102+
return DatetimeIndex(result, name=name)
103+
return result.values
104+
105+
39106
def to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False,
40107
utc=None, box=True, format=None, exact=True,
41-
unit=None, infer_datetime_format=False, origin='unix'):
108+
unit=None, infer_datetime_format=False, origin='unix',
109+
cache=False):
42110
"""
43111
Convert argument to datetime.
44112
@@ -111,7 +179,12 @@ def to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False,
111179
origin.
112180
113181
.. versionadded:: 0.20.0
182+
cache : boolean, default False
183+
If True, use a cache of unique, converted dates to apply the datetime
184+
conversion. May produce significant speed-up when parsing duplicate date
185+
strings, especially ones with timezone offsets.
114186
187+
.. versionadded:: 0.22.0
115188
Returns
116189
-------
117190
ret : datetime if parsing succeeded.
@@ -369,15 +442,28 @@ def _convert_listlike(arg, box, format, name=None, tz=tz):
369442
if isinstance(arg, tslib.Timestamp):
370443
result = arg
371444
elif isinstance(arg, ABCSeries):
372-
from pandas import Series
373-
values = _convert_listlike(arg._values, True, format)
374-
result = Series(values, index=arg.index, name=arg.name)
445+
cache_array = _maybe_cache(arg, format, cache, tz, _convert_listlike)
446+
if not cache_array.empty:
447+
result = arg.map(cache_array)
448+
else:
449+
from pandas import Series
450+
values = _convert_listlike(arg._values, True, format)
451+
result = Series(values, index=arg.index, name=arg.name)
375452
elif isinstance(arg, (ABCDataFrame, MutableMapping)):
376453
result = _assemble_from_unit_mappings(arg, errors=errors)
377454
elif isinstance(arg, ABCIndexClass):
378-
result = _convert_listlike(arg, box, format, name=arg.name)
455+
cache_array = _maybe_cache(arg, format, cache, tz, _convert_listlike)
456+
if not cache_array.empty:
457+
result = _convert_and_box_cache(arg, cache_array, box, errors,
458+
name=arg.name)
459+
else:
460+
result = _convert_listlike(arg, box, format, name=arg.name)
379461
elif is_list_like(arg):
380-
result = _convert_listlike(arg, box, format)
462+
cache_array = _maybe_cache(arg, format, cache, tz, _convert_listlike)
463+
if not cache_array.empty:
464+
result = _convert_and_box_cache(arg, cache_array, box, errors)
465+
else:
466+
result = _convert_listlike(arg, box, format)
381467
else:
382468
result = _convert_listlike(np.array([arg]), box, format)[0]
383469

0 commit comments

Comments
 (0)