Skip to content

Commit f65742c

Browse files
anmyachev and jreback
authored and committed
ENH: added new cache_dates parameter for read_csv func (#25990)
1 parent 2f182aa commit f65742c

File tree

4 files changed

+50
-5
lines changed

4 files changed

+50
-5
lines changed

asv_bench/benchmarks/io/csv.py

+20
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
import numpy as np
55
import pandas.util.testing as tm
66
from pandas import DataFrame, Categorical, date_range, read_csv
7+
from pandas.io.parsers import _parser_defaults
78
from io import StringIO
89

910
from ..pandas_vb_common import BaseIO
@@ -232,6 +233,25 @@ def time_baseline(self):
232233
names=list(string.digits[:9]))
233234

234235

236+
class ReadCSVCachedParseDates(StringIORewind):
237+
params = ([True, False],)
238+
param_names = ['do_cache']
239+
240+
def setup(self, do_cache):
241+
data = ('\n'.join('10/{}'.format(year)
242+
for year in range(2000, 2100)) + '\n') * 10
243+
self.StringIO_input = StringIO(data)
244+
245+
def time_read_csv_cached(self, do_cache):
246+
# Pass cache_dates only when this pandas version defines it, so the
247+
# benchmark keeps running on older versions that predate this API change.
248+
kwds = {}
249+
if 'cache_dates' in _parser_defaults:
250+
kwds['cache_dates'] = do_cache
251+
read_csv(self.data(self.StringIO_input), header=None,
252+
parse_dates=[0], **kwds)
253+
254+
235255
class ReadCSVMemoryGrowth(BaseIO):
236256

237257
chunksize = 20

doc/source/user_guide/io.rst

+6
Original file line numberDiff line numberDiff line change
@@ -271,6 +271,12 @@ date_parser : function, default ``None``
271271
(corresponding to the columns defined by parse_dates) as arguments.
272272
dayfirst : boolean, default ``False``
273273
DD/MM format dates, international and European format.
274+
cache_dates : boolean, default ``True``
275+
If True, use a cache of unique, converted dates to apply the datetime
276+
conversion. May produce significant speed-up when parsing duplicate
277+
date strings, especially ones with timezone offsets.
278+
279+
.. versionadded:: 0.25.0
274280

275281
Iteration
276282
+++++++++

doc/source/whatsnew/v0.25.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -375,6 +375,7 @@ I/O
375375
- Adds ``use_bqstorage_api`` parameter to :func:`read_gbq` to speed up downloads of large data frames. This feature requires version 0.10.0 of the ``pandas-gbq`` library as well as the ``google-cloud-bigquery-storage`` and ``fastavro`` libraries. (:issue:`26104`)
376376
- Fixed memory leak in :meth:`DataFrame.to_json` when dealing with numeric data (:issue:`24889`)
377377
- Bug in :func:`read_json` where date strings with ``Z`` were not converted to a UTC timezone (:issue:`26168`)
378+
- Added a ``cache_dates`` parameter (default ``True``) to :func:`read_csv`, which caches unique dates as they are parsed (:issue:`25990`)
378379

379380
Plotting
380381
^^^^^^^^

pandas/io/parsers.py

+23-5
Original file line numberDiff line numberDiff line change
@@ -235,6 +235,12 @@
235235
arguments.
236236
dayfirst : bool, default False
237237
DD/MM format dates, international and European format.
238+
cache_dates : bool, default True
239+
If True, use a cache of unique, converted dates to apply the datetime
240+
conversion. May produce significant speed-up when parsing duplicate
241+
date strings, especially ones with timezone offsets.
242+
243+
.. versionadded:: 0.25.0
238244
iterator : bool, default False
239245
Return TextFileReader object for iteration or getting chunks with
240246
``get_chunk()``.
@@ -476,6 +482,7 @@ def _read(filepath_or_buffer: FilePathOrBuffer, kwds):
476482
'false_values': None,
477483
'converters': None,
478484
'dtype': None,
485+
'cache_dates': True,
479486

480487
'thousands': None,
481488
'comment': None,
@@ -577,6 +584,7 @@ def parser_f(filepath_or_buffer: FilePathOrBuffer,
577584
keep_date_col=False,
578585
date_parser=None,
579586
dayfirst=False,
587+
cache_dates=True,
580588

581589
# Iteration
582590
iterator=False,
@@ -683,6 +691,7 @@ def parser_f(filepath_or_buffer: FilePathOrBuffer,
683691
keep_date_col=keep_date_col,
684692
dayfirst=dayfirst,
685693
date_parser=date_parser,
694+
cache_dates=cache_dates,
686695

687696
nrows=nrows,
688697
iterator=iterator,
@@ -1379,11 +1388,13 @@ def __init__(self, kwds):
13791388
self.tupleize_cols = kwds.get('tupleize_cols', False)
13801389
self.mangle_dupe_cols = kwds.get('mangle_dupe_cols', True)
13811390
self.infer_datetime_format = kwds.pop('infer_datetime_format', False)
1391+
self.cache_dates = kwds.pop('cache_dates', True)
13821392

13831393
self._date_conv = _make_date_converter(
13841394
date_parser=self.date_parser,
13851395
dayfirst=self.dayfirst,
1386-
infer_datetime_format=self.infer_datetime_format
1396+
infer_datetime_format=self.infer_datetime_format,
1397+
cache_dates=self.cache_dates
13871398
)
13881399

13891400
# validate header options for mi
@@ -3173,7 +3184,7 @@ def _get_lines(self, rows=None):
31733184

31743185

31753186
def _make_date_converter(date_parser=None, dayfirst=False,
3176-
infer_datetime_format=False):
3187+
infer_datetime_format=False, cache_dates=True):
31773188
def converter(*date_cols):
31783189
if date_parser is None:
31793190
strs = _concat_date_cols(date_cols)
@@ -3184,16 +3195,22 @@ def converter(*date_cols):
31843195
utc=None,
31853196
dayfirst=dayfirst,
31863197
errors='ignore',
3187-
infer_datetime_format=infer_datetime_format
3198+
infer_datetime_format=infer_datetime_format,
3199+
cache=cache_dates
31883200
).to_numpy()
31893201

31903202
except ValueError:
31913203
return tools.to_datetime(
3192-
parsing.try_parse_dates(strs, dayfirst=dayfirst))
3204+
parsing.try_parse_dates(strs, dayfirst=dayfirst),
3205+
cache=cache_dates
3206+
)
31933207
else:
31943208
try:
31953209
result = tools.to_datetime(
3196-
date_parser(*date_cols), errors='ignore')
3210+
date_parser(*date_cols),
3211+
errors='ignore',
3212+
cache=cache_dates
3213+
)
31973214
if isinstance(result, datetime.datetime):
31983215
raise Exception('scalar parser')
31993216
return result
@@ -3203,6 +3220,7 @@ def converter(*date_cols):
32033220
parsing.try_parse_dates(_concat_date_cols(date_cols),
32043221
parser=date_parser,
32053222
dayfirst=dayfirst),
3223+
cache=cache_dates,
32063224
errors='ignore')
32073225
except Exception:
32083226
return generic_parser(date_parser, *date_cols)

0 commit comments

Comments
 (0)