Skip to content

Commit 89ce37f

Browse files
committed
Add asvs, modify tests for caches
1 parent 2a1a064 commit 89ce37f

File tree

3 files changed

+308
-241
lines changed

3 files changed

+308
-241
lines changed

asv_bench/benchmarks/timeseries.py

+19-34
Original file line numberDiff line numberDiff line change
@@ -346,27 +346,21 @@ class ToDatetime(object):
346346

347347
def setup(self):
348348
self.rng = date_range(start='1/1/2000', periods=10000, freq='D')
349-
self.stringsD = Series((((self.rng.year * 10000) + (self.rng.month * 100)) + self.rng.day), dtype=np.int64).apply(str)
349+
self.stringsD = Series(self.rng.strftime('%Y%m%d'))
350350

351351
self.rng = date_range(start='1/1/2000', periods=20000, freq='H')
352-
self.strings = [x.strftime('%Y-%m-%d %H:%M:%S') for x in self.rng]
353-
self.strings_nosep = [x.strftime('%Y%m%d %H:%M:%S') for x in self.rng]
352+
self.strings = self.rng.strftime('%Y-%m-%d %H:%M:%S').tolist()
353+
self.strings_nosep = self.rng.strftime('%Y%m%d %H:%M:%S').tolist()
354354
self.strings_tz_space = [x.strftime('%Y-%m-%d %H:%M:%S') + ' -0800'
355355
for x in self.rng]
356356

357357
self.s = Series((['19MAY11', '19MAY11:00:00:00'] * 100000))
358358
self.s2 = self.s.str.replace(':\\S+$', '')
359-
self.dup_numeric_data_10_5 = Series([1000] * 100000)
360-
self.dup_string_data_10_5 = ['2013-01-01 01:00:00'] * 100000
361-
self.dup_datetime_data_10_5 = [dt.datetime(2010, 1, 1)] * 100000
362359

363-
self.dup_numeric_data_10_3 = Series([1000] * 100)
364-
self.dup_string_data_10_3 = ['2013-01-01 01:00:00'] * 100
365-
self.dup_datetime_data_10_3 = [dt.datetime(2010, 1, 1)] * 100
366-
367-
self.dup_numeric_data_10_7 = Series([1000] * 10**7)
368-
self.dup_string_data_10_7 = ['2013-01-01 01:00:00'] * 10**7
369-
self.dup_datetime_data_10_7 = [dt.datetime(2010, 1, 1)] * 10**7
360+
self.unique_numeric_seconds = range(10000)
361+
self.dup_numeric_seconds = [1000] * 10000
362+
self.dup_string_dates = ['2000-02-11'] * 10000
363+
self.dup_string_with_tz = ['2000-02-11 15:00:00-0800'] * 10000
370364

371365
def time_format_YYYYMMDD(self):
372366
to_datetime(self.stringsD, format='%Y%m%d')
@@ -392,32 +386,23 @@ def time_format_exact(self):
392386
def time_format_no_exact(self):
393387
to_datetime(self.s, format='%d%b%y', exact=False)
394388

395-
def time_cache_dup_numeric_data_10_3(self):
396-
to_datetime(self.dup_numeric_data_10_3, unit='s')
397-
398-
def time_cache_dup_datetime_data_10_3(self):
399-
to_datetime(self.dup_datetime_data_10_3)
400-
401-
def time_cache_dup_string_data_10_3(self):
402-
to_datetime(self.dup_string_data_10_3)
403-
404-
def time_cache_dup_numeric_data_10_5(self):
405-
to_datetime(self.dup_numeric_data_10_5, unit='s')
389+
def time_cache_with_unique_seconds_and_unit(self):
390+
to_datetime(self.unique_numeric_seconds, unit='s')
406391

407-
def time_cache_dup_datetime_data_10_5(self):
408-
to_datetime(self.dup_datetime_data_10_5)
392+
def time_cache_with_dup_seconds_and_unit(self):
393+
to_datetime(self.dup_numeric_seconds, unit='s')
409394

410-
def time_cache_dup_string_data_10_5(self):
411-
to_datetime(self.dup_string_data_10_5)
395+
def time_cache_with_dup_string_dates(self):
396+
to_datetime(self.dup_string_dates)
412397

413-
def time_cache_dup_numeric_data_10_7(self):
414-
to_datetime(self.dup_numeric_data_10_7, unit='s')
398+
def time_cache_with_dup_string_dates_and_format(self):
399+
to_datetime(self.dup_string_dates, format='%Y-%m-%d')
415400

416-
def time_cache_dup_datetime_data_10_7(self):
417-
to_datetime(self.dup_datetime_data_10_7)
401+
def time_cache_with_dup_string_tzoffset_dates(self):
402+
to_datetime(self.dup_string_with_tz)
418403

419-
def time_cache_dup_string_data_10_7(self):
420-
to_datetime(self.dup_string_data_10_7)
404+
def time_cache_with_dup_string_tzoffset_dates_and_format(self):
405+
to_datetime(self.dup_string_with_tz, format='%Y-%m-%d %H:%M:%S%z')
421406

422407

423408
class Offsets(object):

pandas/core/tools/datetimes.py

+7-14
Original file line numberDiff line numberDiff line change
@@ -261,7 +261,7 @@ def to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False,
261261
origin.
262262
263263
.. versionadded:: 0.20.0
264-
cache_datetime : boolean, default False
264+
cache : boolean, default False
265265
If True, use a cache of unique, converted dates to apply the datetime
266266
conversion. Produces significant speed-ups when parsing duplicate dates.
267267
@@ -355,7 +355,6 @@ def to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False,
355355

356356
def _convert_listlike(arg, box, format, name=None, tz=tz):
357357

358-
import pdb; pdb.set_trace()
359358
if isinstance(arg, (list, tuple)):
360359
arg = np.array(arg, dtype='O')
361360

@@ -523,18 +522,12 @@ def _convert_listlike(arg, box, format, name=None, tz=tz):
523522

524523
convert_cache = None
525524
if cache and is_list_like(arg):
526-
# Create a cache only if there are more than 10k values and the user
527-
# passes in datestrings
528-
#min_cache_threshold = 10**5
529-
#if len(arg) >= min_cache_threshold and is_string_dtype(arg):
530-
# unique currently cannot determine dates that are out of bounds
531-
# recurison errors with datetime
532-
unique_dates = algorithms.unique(arg)
533-
# Essentially they need to all be the same value
534-
if len(unique_dates) != len(arg):
535-
from pandas import Series
536-
cache_data = _convert_listlike(unique_dates, False, format)
537-
convert_cache = Series(cache_data, index=unique_dates)
525+
if len(arg) >= 1000:
526+
unique_dates = algorithms.unique(arg)
527+
if len(unique_dates) != len(arg):
528+
from pandas import Series
529+
cache_dates = _convert_listlike(unique_dates, False, format)
530+
convert_cache = Series(cache_dates, index=unique_dates)
538531

539532
if isinstance(arg, tslib.Timestamp):
540533
result = arg

0 commit comments

Comments
 (0)