Add new cache_dates parameter to the read_csv function

anmyachev · anmyachev · commit eb70d3918b38 · 2019-04-10T09:53:56.000+03:00
diff --git a/asv_bench/benchmarks/io/csv.py b/asv_bench/benchmarks/io/csv.py
@@ -4,6 +4,7 @@
 import numpy as np
 import pandas.util.testing as tm
 from pandas import DataFrame, Categorical, date_range, read_csv
+from pandas.io.parsers import _parser_defaults
 from io import StringIO
 
 from ..pandas_vb_common import BaseIO
@@ -214,6 +215,25 @@ def time_baseline(self):
                  names=list(string.digits[:9]))
 
 
+class ReadCSVCachedParseDates(StringIORewind):
+    params = ([True, False],)
+    param_names = ['do_cache']
+
+    def setup(self, do_cache):
+        data = ('\n'.join('10/{}'.format(year)
+                for year in range(2000, 2100)) + '\n') * 10
+        self.StringIO_input = StringIO(data)
+
+    def time_read_csv_cached(self, do_cache):
+        # kwds is built conditionally so the benchmark still runs on
+        # previous versions of pandas that lack the new cache_dates option
+        kwds = {}
+        if 'cache_dates' in _parser_defaults:
+            kwds['cache_dates'] = do_cache
+        read_csv(self.data(self.StringIO_input), header=None,
+                 parse_dates=[0], **kwds)
+
+
 class ReadCSVMemoryGrowth(BaseIO):
 
     chunksize = 20
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
@@ -327,6 +327,12 @@
     values. The options are `None` for the ordinary converter,
     `high` for the high-precision converter, and `round_trip` for the
     round-trip converter.
+cache_dates : boolean, default False
+    If True, use a cache of unique, converted dates to apply the datetime
+    conversion. May produce significant speed-up when parsing duplicate
+    date strings, especially ones with timezone offsets.
+
+    .. versionadded:: 0.25.0
 
 Returns
 -------
@@ -476,6 +482,7 @@ def _read(filepath_or_buffer: FilePathOrBuffer, kwds):
     'false_values': None,
     'converters': None,
     'dtype': None,
+    'cache_dates': False,
 
     'thousands': None,
     'comment': None,
@@ -577,6 +584,7 @@ def parser_f(filepath_or_buffer: FilePathOrBuffer,
                  keep_date_col=False,
                  date_parser=None,
                  dayfirst=False,
+                 cache_dates=False,
 
                  # Iteration
                  iterator=False,
@@ -683,6 +691,7 @@ def parser_f(filepath_or_buffer: FilePathOrBuffer,
                     keep_date_col=keep_date_col,
                     dayfirst=dayfirst,
                     date_parser=date_parser,
+                    cache_dates=cache_dates,
 
                     nrows=nrows,
                     iterator=iterator,
@@ -1385,11 +1394,13 @@ def __init__(self, kwds):
         self.tupleize_cols = kwds.get('tupleize_cols', False)
         self.mangle_dupe_cols = kwds.get('mangle_dupe_cols', True)
         self.infer_datetime_format = kwds.pop('infer_datetime_format', False)
+        self.cache_dates = kwds.pop('cache_dates', False)
 
         self._date_conv = _make_date_converter(
             date_parser=self.date_parser,
             dayfirst=self.dayfirst,
-            infer_datetime_format=self.infer_datetime_format
+            infer_datetime_format=self.infer_datetime_format,
+            cache_dates = self.cache_dates
         )
 
         # validate header options for mi
@@ -3179,7 +3190,7 @@ def _get_lines(self, rows=None):
 
 
 def _make_date_converter(date_parser=None, dayfirst=False,
-                         infer_datetime_format=False):
+                         infer_datetime_format=False, cache_dates=False):
     def converter(*date_cols):
         if date_parser is None:
             strs = _concat_date_cols(date_cols)
@@ -3190,16 +3201,22 @@ def converter(*date_cols):
                     utc=None,
                     dayfirst=dayfirst,
                     errors='ignore',
-                    infer_datetime_format=infer_datetime_format
+                    infer_datetime_format=infer_datetime_format,
+                    cache=cache_dates
                 ).to_numpy()
 
             except ValueError:
                 return tools.to_datetime(
-                    parsing.try_parse_dates(strs, dayfirst=dayfirst))
+                    parsing.try_parse_dates(strs, dayfirst=dayfirst),
+                    cache=cache_dates
+                )
         else:
             try:
                 result = tools.to_datetime(
-                    date_parser(*date_cols), errors='ignore')
+                    date_parser(*date_cols),
+                    errors='ignore',
+                    cache=cache_dates
+                )
                 if isinstance(result, datetime.datetime):
                     raise Exception('scalar parser')
                 return result
@@ -3209,6 +3226,7 @@ def converter(*date_cols):
                         parsing.try_parse_dates(_concat_date_cols(date_cols),
                                                 parser=date_parser,
                                                 dayfirst=dayfirst),
+                        cache=cache_dates,
                         errors='ignore')
                 except Exception:
                     return generic_parser(date_parser, *date_cols)