diff --git a/asv_bench/benchmarks/io/csv.py b/asv_bench/benchmarks/io/csv.py
index add0943c5ce9f..c51fb09ad8671 100644
--- a/asv_bench/benchmarks/io/csv.py
+++ b/asv_bench/benchmarks/io/csv.py
@@ -4,6 +4,7 @@
 import numpy as np
 import pandas.util.testing as tm
 from pandas import DataFrame, Categorical, date_range, read_csv
+from pandas.io.parsers import _parser_defaults
 from io import StringIO
 
 from ..pandas_vb_common import BaseIO
@@ -232,6 +233,25 @@ def time_baseline(self):
                      names=list(string.digits[:9]))
 
 
+class ReadCSVCachedParseDates(StringIORewind):
+    params = ([True, False],)
+    param_names = ['do_cache']
+
+    def setup(self, do_cache):
+        data = ('\n'.join('10/{}'.format(year)
+                for year in range(2000, 2100)) + '\n') * 10
+        self.StringIO_input = StringIO(data)
+
+    def time_read_csv_cached(self, do_cache):
+        # Build kwds conditionally so this benchmark does not break on
+        # older versions of pandas that predate this API change.
+        kwds = {}
+        if 'cache_dates' in _parser_defaults:
+            kwds['cache_dates'] = do_cache
+        read_csv(self.data(self.StringIO_input), header=None,
+                 parse_dates=[0], **kwds)
+
+
 class ReadCSVMemoryGrowth(BaseIO):
 
     chunksize = 20
diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst
index 9ec39c0ff2b23..0abd073c7dc07 100644
--- a/doc/source/user_guide/io.rst
+++ b/doc/source/user_guide/io.rst
@@ -271,6 +271,12 @@ date_parser : function, default ``None``
     (corresponding to the columns defined by parse_dates) as arguments.
 dayfirst : boolean, default ``False``
     DD/MM format dates, international and European format.
+cache_dates : boolean, default ``True``
+    If True, use a cache of unique, converted dates to apply the datetime
+    conversion. May produce significant speed-up when parsing duplicate
+    date strings, especially ones with timezone offsets.
+
+    .. versionadded:: 0.25.0
 
 Iteration
 +++++++++
diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst
index 1f4176b18c2e0..3e1ce702f0423 100644
--- a/doc/source/whatsnew/v0.25.0.rst
+++ b/doc/source/whatsnew/v0.25.0.rst
@@ -375,6 +375,7 @@ I/O
 - Adds ``use_bqstorage_api`` parameter to :func:`read_gbq` to speed up downloads of large data frames. This feature requires version 0.10.0 of the ``pandas-gbq`` library as well as the ``google-cloud-bigquery-storage`` and ``fastavro`` libraries. (:issue:`26104`)
 - Fixed memory leak in :meth:`DataFrame.to_json` when dealing with numeric data (:issue:`24889`)
 - Bug in :func:`read_json` where date strings with ``Z`` were not converted to a UTC timezone (:issue:`26168`)
+- Added ``cache_dates=True`` parameter to :meth:`read_csv`, which allows caching unique dates as they are parsed (:issue:`25990`)
 
 Plotting
 ^^^^^^^^
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
index 701d4bb58bb08..f85863c4384bd 100755
--- a/pandas/io/parsers.py
+++ b/pandas/io/parsers.py
@@ -235,6 +235,12 @@
     arguments.
 dayfirst : bool, default False
     DD/MM format dates, international and European format.
+cache_dates : bool, default True
+    If True, use a cache of unique, converted dates to apply the datetime
+    conversion. May produce significant speed-up when parsing duplicate
+    date strings, especially ones with timezone offsets.
+
+    .. versionadded:: 0.25.0
 iterator : bool, default False
     Return TextFileReader object for iteration or getting chunks with
     ``get_chunk()``.
@@ -476,6 +482,7 @@ def _read(filepath_or_buffer: FilePathOrBuffer, kwds):
     'false_values': None,
     'converters': None,
     'dtype': None,
+    'cache_dates': True,
 
     'thousands': None,
     'comment': None,
@@ -577,6 +584,7 @@ def parser_f(filepath_or_buffer: FilePathOrBuffer,
              keep_date_col=False,
              date_parser=None,
              dayfirst=False,
+             cache_dates=True,
 
              # Iteration
              iterator=False,
@@ -683,6 +691,7 @@ def parser_f(filepath_or_buffer: FilePathOrBuffer,
         keep_date_col=keep_date_col,
         dayfirst=dayfirst,
         date_parser=date_parser,
+        cache_dates=cache_dates,
 
         nrows=nrows,
         iterator=iterator,
@@ -1379,11 +1388,13 @@ def __init__(self, kwds):
         self.tupleize_cols = kwds.get('tupleize_cols', False)
         self.mangle_dupe_cols = kwds.get('mangle_dupe_cols', True)
         self.infer_datetime_format = kwds.pop('infer_datetime_format', False)
+        self.cache_dates = kwds.pop('cache_dates', True)
 
         self._date_conv = _make_date_converter(
             date_parser=self.date_parser,
             dayfirst=self.dayfirst,
-            infer_datetime_format=self.infer_datetime_format
+            infer_datetime_format=self.infer_datetime_format,
+            cache_dates=self.cache_dates
         )
 
         # validate header options for mi
@@ -3173,7 +3184,7 @@ def _get_lines(self, rows=None):
 
 
 def _make_date_converter(date_parser=None, dayfirst=False,
-                         infer_datetime_format=False):
+                         infer_datetime_format=False, cache_dates=True):
     def converter(*date_cols):
         if date_parser is None:
             strs = _concat_date_cols(date_cols)
@@ -3184,16 +3195,22 @@ def converter(*date_cols):
                     utc=None,
                     dayfirst=dayfirst,
                     errors='ignore',
-                    infer_datetime_format=infer_datetime_format
+                    infer_datetime_format=infer_datetime_format,
+                    cache=cache_dates
                 ).to_numpy()
             except ValueError:
                 return tools.to_datetime(
-                    parsing.try_parse_dates(strs, dayfirst=dayfirst))
+                    parsing.try_parse_dates(strs, dayfirst=dayfirst),
+                    cache=cache_dates
+                )
         else:
             try:
                 result = tools.to_datetime(
-                    date_parser(*date_cols), errors='ignore')
+                    date_parser(*date_cols),
+                    errors='ignore',
+                    cache=cache_dates
+                )
                 if isinstance(result, datetime.datetime):
                     raise Exception('scalar parser')
                 return result
@@ -3203,6 +3220,7 @@ def converter(*date_cols):
                 parsing.try_parse_dates(_concat_date_cols(date_cols),
                                         parser=date_parser,
                                         dayfirst=dayfirst),
+                cache=cache_dates,
                 errors='ignore')
         except Exception:
             return generic_parser(date_parser, *date_cols)
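
The patch threads the new cache_dates flag through to the existing cache argument of
tools.to_datetime, so the caching behavior can be observed directly with public pandas
APIs. A minimal sketch of the underlying mechanism (illustrative only, not code from
this patch; the sample date strings are made up):

    import numpy as np
    import pandas as pd

    # Many duplicate date strings -- the case the cache is designed for.
    strs = np.array(['10/2000', '10/2001', '10/2002'] * 1000, dtype=object)

    # This is essentially what the parser's date converter now does when no
    # custom date_parser is given: a single to_datetime call whose cache
    # argument is driven by cache_dates.
    with_cache = pd.to_datetime(strs, errors='ignore', cache=True)
    without_cache = pd.to_datetime(strs, errors='ignore', cache=False)

    # The cache affects only speed, never the parsed result.
    assert (with_cache == without_cache).all()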
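
An end-to-end check of the new keyword against this branch (again a sketch; the data
mirrors the ASV benchmark above, and the repetition factors are arbitrary):

    from io import StringIO

    import pandas as pd

    # 100 unique month/year strings, each repeated 10 times.
    data = ('\n'.join('10/{}'.format(year)
                      for year in range(2000, 2100)) + '\n') * 10

    # Default: unique dates are converted once and reused (cache_dates=True).
    cached = pd.read_csv(StringIO(data), header=None, parse_dates=[0])

    # Opting out restores the previous per-value conversion path.
    uncached = pd.read_csv(StringIO(data), header=None, parse_dates=[0],
                           cache_dates=False)

    # Both paths must produce identical frames.
    assert cached.equals(uncached)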