From 20fae8be04e9b7fd4965b5536255cfa1e161042a Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Thu, 4 Apr 2019 21:08:53 +0300 Subject: [PATCH 1/9] added new cache_dates parameter for read_csv func --- asv_bench/benchmarks/io/csv.py | 20 ++++++++++++++++++++ pandas/io/parsers.py | 28 +++++++++++++++++++++++----- 2 files changed, 43 insertions(+), 5 deletions(-) diff --git a/asv_bench/benchmarks/io/csv.py b/asv_bench/benchmarks/io/csv.py index add0943c5ce9f..0be5fcc8d94c9 100644 --- a/asv_bench/benchmarks/io/csv.py +++ b/asv_bench/benchmarks/io/csv.py @@ -4,6 +4,7 @@ import numpy as np import pandas.util.testing as tm from pandas import DataFrame, Categorical, date_range, read_csv +from pandas.io.parsers import _parser_defaults from io import StringIO from ..pandas_vb_common import BaseIO @@ -232,6 +233,25 @@ def time_baseline(self): names=list(string.digits[:9])) +class ReadCSVCachedParseDates(StringIORewind): + params = ([True, False],) + param_names = ['do_cache'] + + def setup(self, do_cache): + data = ('\n'.join('10/{}'.format(year) + for year in range(2000, 2100)) + '\n') * 10 + self.StringIO_input = StringIO(data) + + def time_read_csv_cached(self, do_cache): + # kwds setting here is used to avoid breaking tests in + # previuos version of pandas, because this is api changes + kwds = {} + if 'cache_dates' in _parser_defaults: + kwds['cache_dates'] = do_cache + read_csv(self.data(self.StringIO_input), header=None, + parse_dates=[0], **kwds) + + class ReadCSVMemoryGrowth(BaseIO): chunksize = 20 diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 701d4bb58bb08..fa017c69e8e45 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -327,6 +327,12 @@ values. The options are `None` for the ordinary converter, `high` for the high-precision converter, and `round_trip` for the round-trip converter. +cache_dates : boolean, default False + If True, use a cache of unique, converted dates to apply the datetime + conversion. May produce significant speed-up when parsing duplicate + date strings, especially ones with timezone offsets. + + .. versionadded:: 0.23.0 Returns ------- @@ -476,6 +482,7 @@ def _read(filepath_or_buffer: FilePathOrBuffer, kwds): 'false_values': None, 'converters': None, 'dtype': None, + 'cache_dates': False, 'thousands': None, 'comment': None, @@ -577,6 +584,7 @@ def parser_f(filepath_or_buffer: FilePathOrBuffer, keep_date_col=False, date_parser=None, dayfirst=False, + cache_dates=False, # Iteration iterator=False, @@ -683,6 +691,7 @@ def parser_f(filepath_or_buffer: FilePathOrBuffer, keep_date_col=keep_date_col, dayfirst=dayfirst, date_parser=date_parser, + cache_dates=cache_dates, nrows=nrows, iterator=iterator, @@ -1379,11 +1388,13 @@ def __init__(self, kwds): self.tupleize_cols = kwds.get('tupleize_cols', False) self.mangle_dupe_cols = kwds.get('mangle_dupe_cols', True) self.infer_datetime_format = kwds.pop('infer_datetime_format', False) + self.cache_dates = kwds.pop('cache_dates', False) self._date_conv = _make_date_converter( date_parser=self.date_parser, dayfirst=self.dayfirst, - infer_datetime_format=self.infer_datetime_format + infer_datetime_format=self.infer_datetime_format, + cache_dates = self.cache_dates ) # validate header options for mi @@ -3173,7 +3184,7 @@ def _get_lines(self, rows=None): def _make_date_converter(date_parser=None, dayfirst=False, - infer_datetime_format=False): + infer_datetime_format=False, cache_dates=False): def converter(*date_cols): if date_parser is None: strs = _concat_date_cols(date_cols) @@ -3184,16 +3195,22 @@ def converter(*date_cols): utc=None, dayfirst=dayfirst, errors='ignore', - infer_datetime_format=infer_datetime_format + infer_datetime_format=infer_datetime_format, + cache=cache_dates ).to_numpy() except ValueError: return tools.to_datetime( - parsing.try_parse_dates(strs, dayfirst=dayfirst)) + parsing.try_parse_dates(strs, dayfirst=dayfirst), + cache=cache_dates + ) else: try: result = tools.to_datetime( - date_parser(*date_cols), errors='ignore') + date_parser(*date_cols), + errors='ignore', + cache=cache_dates + ) if isinstance(result, datetime.datetime): raise Exception('scalar parser') return result @@ -3203,6 +3220,7 @@ def converter(*date_cols): parsing.try_parse_dates(_concat_date_cols(date_cols), parser=date_parser, dayfirst=dayfirst), + cache=cache_dates, errors='ignore') except Exception: return generic_parser(date_parser, *date_cols) From 028b55ddf3deb57637e16e166625f858a3e65ec1 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Thu, 4 Apr 2019 21:17:42 +0300 Subject: [PATCH 2/9] fix PEP 8 issues --- pandas/io/parsers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index fa017c69e8e45..f1f9fd330a95d 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -1394,7 +1394,7 @@ def __init__(self, kwds): date_parser=self.date_parser, dayfirst=self.dayfirst, infer_datetime_format=self.infer_datetime_format, - cache_dates = self.cache_dates + cache_dates=self.cache_dates ) # validate header options for mi From f83c65c403574384372ea4259b32e19fa4288e87 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Thu, 4 Apr 2019 22:11:33 +0300 Subject: [PATCH 3/9] changed default value of cache_dates var to true --- pandas/io/parsers.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index f1f9fd330a95d..f76c4f59d022d 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -327,7 +327,7 @@ values. The options are `None` for the ordinary converter, `high` for the high-precision converter, and `round_trip` for the round-trip converter. -cache_dates : boolean, default False +cache_dates : boolean, default True If True, use a cache of unique, converted dates to apply the datetime conversion. May produce significant speed-up when parsing duplicate date strings, especially ones with timezone offsets. @@ -482,7 +482,7 @@ def _read(filepath_or_buffer: FilePathOrBuffer, kwds): 'false_values': None, 'converters': None, 'dtype': None, - 'cache_dates': False, + 'cache_dates': True, 'thousands': None, 'comment': None, @@ -584,7 +584,7 @@ def parser_f(filepath_or_buffer: FilePathOrBuffer, keep_date_col=False, date_parser=None, dayfirst=False, - cache_dates=False, + cache_dates=True, # Iteration iterator=False, @@ -1388,7 +1388,7 @@ def __init__(self, kwds): self.tupleize_cols = kwds.get('tupleize_cols', False) self.mangle_dupe_cols = kwds.get('mangle_dupe_cols', True) self.infer_datetime_format = kwds.pop('infer_datetime_format', False) - self.cache_dates = kwds.pop('cache_dates', False) + self.cache_dates = kwds.pop('cache_dates', True) self._date_conv = _make_date_converter( date_parser=self.date_parser, @@ -3184,7 +3184,7 @@ def _get_lines(self, rows=None): def _make_date_converter(date_parser=None, dayfirst=False, - infer_datetime_format=False, cache_dates=False): + infer_datetime_format=False, cache_dates=True): def converter(*date_cols): if date_parser is None: strs = _concat_date_cols(date_cols) From 4da942d5561ecdcc56d1f01f79481b7a153c2a22 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Thu, 4 Apr 2019 22:20:11 +0300 Subject: [PATCH 4/9] added whatsnew entry --- doc/source/whatsnew/v0.25.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 1f4176b18c2e0..8e7fbd86aebc4 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -216,6 +216,7 @@ Other API Changes - Comparing :class:`Timestamp` with unsupported objects now returns :py:obj:`NotImplemented` instead of raising ``TypeError``. This implies that unsupported rich comparisons are delegated to the other object, and are now consistent with Python 3 behavior for ``datetime`` objects (:issue:`24011`) - Bug in :meth:`DatetimeIndex.snap` which didn't preserving the ``name`` of the input :class:`Index` (:issue:`25575`) - The ``arg`` argument in :meth:`pandas.core.groupby.DataFrameGroupBy.agg` has been renamed to ``func`` (:issue:`26089`) +- Added ``cache_dates`` parameter to :meth:`read_csv`, which allows to cache unique dates when they are parsed (:issue:`25990`) .. _whatsnew_0250.deprecations: From 995d8d550b9d4e890a19ef22f2e2192cc3681a96 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Thu, 4 Apr 2019 22:37:39 +0300 Subject: [PATCH 5/9] move cache_dates var in docstring on the rigth place --- pandas/io/parsers.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index f76c4f59d022d..4d19c95f97696 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -235,6 +235,12 @@ arguments. dayfirst : bool, default False DD/MM format dates, international and European format. +cache_dates : boolean, default True + If True, use a cache of unique, converted dates to apply the datetime + conversion. May produce significant speed-up when parsing duplicate + date strings, especially ones with timezone offsets. + + .. versionadded:: 0.23.0 iterator : bool, default False Return TextFileReader object for iteration or getting chunks with ``get_chunk()``. @@ -327,12 +333,6 @@ values. The options are `None` for the ordinary converter, `high` for the high-precision converter, and `round_trip` for the round-trip converter. -cache_dates : boolean, default True - If True, use a cache of unique, converted dates to apply the datetime - conversion. May produce significant speed-up when parsing duplicate - date strings, especially ones with timezone offsets. - - .. versionadded:: 0.23.0 Returns ------- From 07575a2aaeb13731f9766dce426ddbbb623f2794 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Thu, 4 Apr 2019 23:30:37 +0300 Subject: [PATCH 6/9] fix wrong pandas version in docstring --- pandas/io/parsers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 4d19c95f97696..f85863c4384bd 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -240,7 +240,7 @@ conversion. May produce significant speed-up when parsing duplicate date strings, especially ones with timezone offsets. - .. versionadded:: 0.23.0 + .. versionadded:: 0.25.0 iterator : bool, default False Return TextFileReader object for iteration or getting chunks with ``get_chunk()``. From f3186a257fa1f576c787f15f4425858e264b1607 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Fri, 5 Apr 2019 00:17:54 +0300 Subject: [PATCH 7/9] added cache_dates argument for read_csv func in doc/source/user_guide/io.rst --- doc/source/user_guide/io.rst | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 9ec39c0ff2b23..0abd073c7dc07 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -271,6 +271,12 @@ date_parser : function, default ``None`` (corresponding to the columns defined by parse_dates) as arguments. dayfirst : boolean, default ``False`` DD/MM format dates, international and European format. +cache_dates : boolean, default True + If True, use a cache of unique, converted dates to apply the datetime + conversion. May produce significant speed-up when parsing duplicate + date strings, especially ones with timezone offsets. + + .. versionadded:: 0.25.0 Iteration +++++++++ From fba9407366dbf01992b0781d7e8856c51eadd077 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Wed, 10 Apr 2019 08:54:23 +0300 Subject: [PATCH 8/9] fixed typo; writed 'cache_dates=True' in doc/source/whatsnew/v0.25.0.rst --- asv_bench/benchmarks/io/csv.py | 2 +- doc/source/whatsnew/v0.25.0.rst | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/asv_bench/benchmarks/io/csv.py b/asv_bench/benchmarks/io/csv.py index 0be5fcc8d94c9..c51fb09ad8671 100644 --- a/asv_bench/benchmarks/io/csv.py +++ b/asv_bench/benchmarks/io/csv.py @@ -244,7 +244,7 @@ def setup(self, do_cache): def time_read_csv_cached(self, do_cache): # kwds setting here is used to avoid breaking tests in - # previuos version of pandas, because this is api changes + # previous version of pandas, because this is api changes kwds = {} if 'cache_dates' in _parser_defaults: kwds['cache_dates'] = do_cache diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 8e7fbd86aebc4..7e88186a3302b 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -216,7 +216,7 @@ Other API Changes - Comparing :class:`Timestamp` with unsupported objects now returns :py:obj:`NotImplemented` instead of raising ``TypeError``. This implies that unsupported rich comparisons are delegated to the other object, and are now consistent with Python 3 behavior for ``datetime`` objects (:issue:`24011`) - Bug in :meth:`DatetimeIndex.snap` which didn't preserving the ``name`` of the input :class:`Index` (:issue:`25575`) - The ``arg`` argument in :meth:`pandas.core.groupby.DataFrameGroupBy.agg` has been renamed to ``func`` (:issue:`26089`) -- Added ``cache_dates`` parameter to :meth:`read_csv`, which allows to cache unique dates when they are parsed (:issue:`25990`) +- Added ``cache_dates=True`` parameter to :meth:`read_csv`, which allows to cache unique dates when they are parsed (:issue:`25990`) .. _whatsnew_0250.deprecations: From 687d0c4c738a876d962d2cad634c7d000e86c0dd Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Sun, 28 Apr 2019 21:17:31 +0300 Subject: [PATCH 9/9] replaced information about cache_dates=True in IO section --- doc/source/whatsnew/v0.25.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 7e88186a3302b..3e1ce702f0423 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -216,7 +216,6 @@ Other API Changes - Comparing :class:`Timestamp` with unsupported objects now returns :py:obj:`NotImplemented` instead of raising ``TypeError``. This implies that unsupported rich comparisons are delegated to the other object, and are now consistent with Python 3 behavior for ``datetime`` objects (:issue:`24011`) - Bug in :meth:`DatetimeIndex.snap` which didn't preserving the ``name`` of the input :class:`Index` (:issue:`25575`) - The ``arg`` argument in :meth:`pandas.core.groupby.DataFrameGroupBy.agg` has been renamed to ``func`` (:issue:`26089`) -- Added ``cache_dates=True`` parameter to :meth:`read_csv`, which allows to cache unique dates when they are parsed (:issue:`25990`) .. _whatsnew_0250.deprecations: @@ -376,6 +375,7 @@ I/O - Adds ``use_bqstorage_api`` parameter to :func:`read_gbq` to speed up downloads of large data frames. This feature requires version 0.10.0 of the ``pandas-gbq`` library as well as the ``google-cloud-bigquery-storage`` and ``fastavro`` libraries. (:issue:`26104`) - Fixed memory leak in :meth:`DataFrame.to_json` when dealing with numeric data (:issue:`24889`) - Bug in :func:`read_json` where date strings with ``Z`` were not converted to a UTC timezone (:issue:`26168`) +- Added ``cache_dates=True`` parameter to :meth:`read_csv`, which allows to cache unique dates when they are parsed (:issue:`25990`) Plotting ^^^^^^^^