Skip to content

Commit eb70d39

Browse files
committed
added new cache_dates parameter for read_csv func
1 parent 2769ebf commit eb70d39

File tree

2 files changed

+43
-5
lines changed

2 files changed

+43
-5
lines changed

asv_bench/benchmarks/io/csv.py

+20
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
import numpy as np
55
import pandas.util.testing as tm
66
from pandas import DataFrame, Categorical, date_range, read_csv
7+
from pandas.io.parsers import _parser_defaults
78
from io import StringIO
89

910
from ..pandas_vb_common import BaseIO
@@ -214,6 +215,25 @@ def time_baseline(self):
214215
names=list(string.digits[:9]))
215216

216217

218+
class ReadCSVCachedParseDates(StringIORewind):
219+
params = ([True, False],)
220+
param_names = ['do_cache']
221+
222+
def setup(self, do_cache):
223+
data = ('\n'.join('10/{}'.format(year)
224+
for year in range(2000, 2100)) + '\n') * 10
225+
self.StringIO_input = StringIO(data)
226+
227+
def time_read_csv_cached(self, do_cache):
228+
# kwds setting here is used to avoid breaking tests in
229+
# previous versions of pandas, because this is an API change
230+
kwds = {}
231+
if 'cache_dates' in _parser_defaults:
232+
kwds['cache_dates'] = do_cache
233+
read_csv(self.data(self.StringIO_input), header=None,
234+
parse_dates=[0], **kwds)
235+
236+
217237
class ReadCSVMemoryGrowth(BaseIO):
218238

219239
chunksize = 20

pandas/io/parsers.py

+23-5
Original file line numberDiff line numberDiff line change
@@ -327,6 +327,12 @@
327327
values. The options are `None` for the ordinary converter,
328328
`high` for the high-precision converter, and `round_trip` for the
329329
round-trip converter.
330+
cache_dates : boolean, default False
331+
If True, use a cache of unique, converted dates to apply the datetime
332+
conversion. May produce significant speed-up when parsing duplicate
333+
date strings, especially ones with timezone offsets.
334+
335+
.. versionadded:: 0.23.0
330336
331337
Returns
332338
-------
@@ -476,6 +482,7 @@ def _read(filepath_or_buffer: FilePathOrBuffer, kwds):
476482
'false_values': None,
477483
'converters': None,
478484
'dtype': None,
485+
'cache_dates': False,
479486

480487
'thousands': None,
481488
'comment': None,
@@ -577,6 +584,7 @@ def parser_f(filepath_or_buffer: FilePathOrBuffer,
577584
keep_date_col=False,
578585
date_parser=None,
579586
dayfirst=False,
587+
cache_dates=False,
580588

581589
# Iteration
582590
iterator=False,
@@ -683,6 +691,7 @@ def parser_f(filepath_or_buffer: FilePathOrBuffer,
683691
keep_date_col=keep_date_col,
684692
dayfirst=dayfirst,
685693
date_parser=date_parser,
694+
cache_dates=cache_dates,
686695

687696
nrows=nrows,
688697
iterator=iterator,
@@ -1385,11 +1394,13 @@ def __init__(self, kwds):
13851394
self.tupleize_cols = kwds.get('tupleize_cols', False)
13861395
self.mangle_dupe_cols = kwds.get('mangle_dupe_cols', True)
13871396
self.infer_datetime_format = kwds.pop('infer_datetime_format', False)
1397+
self.cache_dates = kwds.pop('cache_dates', False)
13881398

13891399
self._date_conv = _make_date_converter(
13901400
date_parser=self.date_parser,
13911401
dayfirst=self.dayfirst,
1392-
infer_datetime_format=self.infer_datetime_format
1402+
infer_datetime_format=self.infer_datetime_format,
1403+
cache_dates = self.cache_dates
13931404
)
13941405

13951406
# validate header options for mi
@@ -3179,7 +3190,7 @@ def _get_lines(self, rows=None):
31793190

31803191

31813192
def _make_date_converter(date_parser=None, dayfirst=False,
3182-
infer_datetime_format=False):
3193+
infer_datetime_format=False, cache_dates=False):
31833194
def converter(*date_cols):
31843195
if date_parser is None:
31853196
strs = _concat_date_cols(date_cols)
@@ -3190,16 +3201,22 @@ def converter(*date_cols):
31903201
utc=None,
31913202
dayfirst=dayfirst,
31923203
errors='ignore',
3193-
infer_datetime_format=infer_datetime_format
3204+
infer_datetime_format=infer_datetime_format,
3205+
cache=cache_dates
31943206
).to_numpy()
31953207

31963208
except ValueError:
31973209
return tools.to_datetime(
3198-
parsing.try_parse_dates(strs, dayfirst=dayfirst))
3210+
parsing.try_parse_dates(strs, dayfirst=dayfirst),
3211+
cache=cache_dates
3212+
)
31993213
else:
32003214
try:
32013215
result = tools.to_datetime(
3202-
date_parser(*date_cols), errors='ignore')
3216+
date_parser(*date_cols),
3217+
errors='ignore',
3218+
cache=cache_dates
3219+
)
32033220
if isinstance(result, datetime.datetime):
32043221
raise Exception('scalar parser')
32053222
return result
@@ -3209,6 +3226,7 @@ def converter(*date_cols):
32093226
parsing.try_parse_dates(_concat_date_cols(date_cols),
32103227
parser=date_parser,
32113228
dayfirst=dayfirst),
3229+
cache=cache_dates,
32123230
errors='ignore')
32133231
except Exception:
32143232
return generic_parser(date_parser, *date_cols)

0 commit comments

Comments
 (0)