ENH: added new cache_dates parameter for read_csv func #25990

Merged
merged 9 commits on May 7, 2019
20 changes: 20 additions & 0 deletions asv_bench/benchmarks/io/csv.py
@@ -4,6 +4,7 @@
import numpy as np
import pandas.util.testing as tm
from pandas import DataFrame, Categorical, date_range, read_csv
from pandas.io.parsers import _parser_defaults
from io import StringIO

from ..pandas_vb_common import BaseIO
@@ -232,6 +233,25 @@ def time_baseline(self):
names=list(string.digits[:9]))


class ReadCSVCachedParseDates(StringIORewind):
params = ([True, False],)
param_names = ['do_cache']

def setup(self, do_cache):
data = ('\n'.join('10/{}'.format(year)
for year in range(2000, 2100)) + '\n') * 10
self.StringIO_input = StringIO(data)

def time_read_csv_cached(self, do_cache):
# Set the kwarg conditionally so the benchmark still runs on
# pandas versions that predate the cache_dates API.
kwds = {}
if 'cache_dates' in _parser_defaults:
kwds['cache_dates'] = do_cache
read_csv(self.data(self.StringIO_input), header=None,
parse_dates=[0], **kwds)


class ReadCSVMemoryGrowth(BaseIO):

chunksize = 20
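
A rough way to reproduce what this benchmark measures outside the ASV harness — a minimal sketch, assuming pandas >= 0.25 so that ``cache_dates`` is available (the timing loop and print format are illustrative, not part of this PR):

```python
import time
from io import StringIO

import pandas as pd

# Same workload as the benchmark: 100 distinct month/year strings,
# each repeated 10 times, so the cache gets plenty of hits.
data = ('\n'.join('10/{}'.format(year)
                  for year in range(2000, 2100)) + '\n') * 10

for do_cache in (True, False):
    start = time.perf_counter()
    pd.read_csv(StringIO(data), header=None, parse_dates=[0],
                cache_dates=do_cache)
    print('cache_dates={}: {:.4f}s'.format(
        do_cache, time.perf_counter() - start))
```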
6 changes: 6 additions & 0 deletions doc/source/user_guide/io.rst
@@ -271,6 +271,12 @@ date_parser : function, default ``None``
(corresponding to the columns defined by parse_dates) as arguments.
dayfirst : boolean, default ``False``
DD/MM format dates, international and European format.
cache_dates : boolean, default ``True``
If True, use a cache of unique, converted dates to apply the datetime
conversion. May produce significant speed-up when parsing duplicate
date strings, especially ones with timezone offsets.

.. versionadded:: 0.25.0
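
To illustrate the documented behavior, a minimal usage sketch (the CSV content is hypothetical): the cache affects only speed, so the parsed result is identical with it on or off:

```python
from io import StringIO

import pandas as pd

# Hypothetical input: heavily duplicated, timezone-offset date strings,
# the case the docs single out as benefiting most from the cache.
csv = 'ts\n' + '2019-05-07 10:00:00+02:00\n' * 5

with_cache = pd.read_csv(StringIO(csv), parse_dates=['ts'],
                         cache_dates=True)
no_cache = pd.read_csv(StringIO(csv), parse_dates=['ts'],
                       cache_dates=False)

# Caching changes performance, not results.
pd.testing.assert_frame_equal(with_cache, no_cache)
```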

Iteration
+++++++++
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.25.0.rst
@@ -375,6 +375,7 @@ I/O
- Adds ``use_bqstorage_api`` parameter to :func:`read_gbq` to speed up downloads of large data frames. This feature requires version 0.10.0 of the ``pandas-gbq`` library as well as the ``google-cloud-bigquery-storage`` and ``fastavro`` libraries. (:issue:`26104`)
- Fixed memory leak in :meth:`DataFrame.to_json` when dealing with numeric data (:issue:`24889`)
- Bug in :func:`read_json` where date strings with ``Z`` were not converted to a UTC timezone (:issue:`26168`)
- Added a ``cache_dates=True`` parameter to :func:`read_csv`, which allows caching of unique dates when they are parsed (:issue:`25990`)

Plotting
^^^^^^^^
28 changes: 23 additions & 5 deletions pandas/io/parsers.py
@@ -235,6 +235,12 @@
arguments.
dayfirst : bool, default False
DD/MM format dates, international and European format.
cache_dates : bool, default True
If True, use a cache of unique, converted dates to apply the datetime
conversion. May produce significant speed-up when parsing duplicate
date strings, especially ones with timezone offsets.

.. versionadded:: 0.25.0
iterator : bool, default False
Return TextFileReader object for iteration or getting chunks with
``get_chunk()``.
@@ -476,6 +482,7 @@ def _read(filepath_or_buffer: FilePathOrBuffer, kwds):
'false_values': None,
'converters': None,
'dtype': None,
'cache_dates': True,

'thousands': None,
'comment': None,
@@ -577,6 +584,7 @@ def parser_f(filepath_or_buffer: FilePathOrBuffer,
keep_date_col=False,
date_parser=None,
dayfirst=False,
cache_dates=True,

# Iteration
iterator=False,
@@ -683,6 +691,7 @@ def parser_f(filepath_or_buffer: FilePathOrBuffer,
keep_date_col=keep_date_col,
dayfirst=dayfirst,
date_parser=date_parser,
cache_dates=cache_dates,

nrows=nrows,
iterator=iterator,
@@ -1379,11 +1388,13 @@ def __init__(self, kwds):
self.tupleize_cols = kwds.get('tupleize_cols', False)
self.mangle_dupe_cols = kwds.get('mangle_dupe_cols', True)
self.infer_datetime_format = kwds.pop('infer_datetime_format', False)
self.cache_dates = kwds.pop('cache_dates', True)

self._date_conv = _make_date_converter(
date_parser=self.date_parser,
dayfirst=self.dayfirst,
infer_datetime_format=self.infer_datetime_format
infer_datetime_format=self.infer_datetime_format,
cache_dates=self.cache_dates
)

# validate header options for mi
@@ -3173,7 +3184,7 @@ def _get_lines(self, rows=None):


def _make_date_converter(date_parser=None, dayfirst=False,
infer_datetime_format=False):
infer_datetime_format=False, cache_dates=True):
def converter(*date_cols):
if date_parser is None:
strs = _concat_date_cols(date_cols)
@@ -3184,16 +3195,22 @@ def converter(*date_cols):
utc=None,
dayfirst=dayfirst,
errors='ignore',
infer_datetime_format=infer_datetime_format
infer_datetime_format=infer_datetime_format,
cache=cache_dates
).to_numpy()

except ValueError:
return tools.to_datetime(
parsing.try_parse_dates(strs, dayfirst=dayfirst))
parsing.try_parse_dates(strs, dayfirst=dayfirst),
cache=cache_dates
)
else:
try:
result = tools.to_datetime(
date_parser(*date_cols), errors='ignore')
date_parser(*date_cols),
errors='ignore',
cache=cache_dates
)
if isinstance(result, datetime.datetime):
raise Exception('scalar parser')
return result
@@ -3203,6 +3220,7 @@ def converter(*date_cols):
parsing.try_parse_dates(_concat_date_cols(date_cols),
parser=date_parser,
dayfirst=dayfirst),
cache=cache_dates,
errors='ignore')
except Exception:
return generic_parser(date_parser, *date_cols)
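
For intuition, the ``cache=cache_dates`` argument threaded through this converter makes ``to_datetime`` parse each distinct string once and reuse the result. A rough standalone sketch of that idea — not pandas' actual implementation, and ``parse_with_cache`` is a hypothetical helper:

```python
import pandas as pd

def parse_with_cache(values):
    # Parse each distinct string once, then map the results back onto
    # the full sequence -- roughly what cache=True buys inside
    # to_datetime when inputs repeat.
    cache = {val: pd.Timestamp(val) for val in set(values)}
    return [cache[val] for val in values]

dates = ['10/2000', '10/2001', '10/2000'] * 1000
parsed = parse_with_cache(dates)  # only 2 actual parses for 3000 values
```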