PERF: Add infer_datetime_format to read_csv() pandas-dev#5490

danbirken · danbirken · commit 879f270c120a · 2014-01-23T18:43:12.000-08:00
This allows read_csv() to attempt to infer the datetime format for any
columns where parse_dates is enabled.  In cases where the datetime
format can be inferred, this should speed up processing datetimes
by ~10x.

Additionally add documentation and benchmarks for read_csv().
diff --git a/doc/source/io.rst b/doc/source/io.rst
@@ -500,6 +500,40 @@ a single date rather than the entire array.
 
 .. _io.dayfirst:
 
+
+Inferring Datetime Format
+~~~~~~~~~~~~~~~~~~~~~~~~~
+If you have `parse_dates` enabled for some or all of your columns, and your
+datetime strings are all formatted the same way, you may get a large speed
+up by setting `infer_datetime_format=True`.  If set, pandas will attempt
+to guess the format of your datetime strings, and then use a faster means
+of parsing the strings.  Speed-ups of 5-10x have been observed.  Pandas
+will fall back to the usual parsing if either the format cannot be guessed
+or the format that was guessed cannot properly parse the entire column
+of strings.  So in general, `infer_datetime_format` should not have any
+negative consequences if enabled.
+
+Here are some examples of datetime strings that can be guessed (all
+representing December 30th, 2011 at 00:00:00):
+
+- "20111230"
+- "2011/12/30"
+- "20111230 00:00:00"
+- "12/30/2011 00:00:00"
+- "30/Dec/2011 00:00:00"
+- "30/December/2011 00:00:00"
+
+`infer_datetime_format` is sensitive to `dayfirst`.  With `dayfirst=True`, it
+will guess "01/12/2011" to be December 1st.  With `dayfirst=False` (default)
+it will guess "01/12/2011" to be January 12th.
+
+.. ipython:: python
+
+   # Try to infer the format for the index column
+   df = pd.read_csv('foo.csv', index_col=0, parse_dates=True,
+                    infer_datetime_format=True)
+
+
 International Date Formats
 ~~~~~~~~~~~~~~~~~~~~~~~~~~
 While US date formats tend to be MM/DD/YYYY, many international formats use
diff --git a/doc/source/v0.13.1.txt b/doc/source/v0.13.1.txt
@@ -107,6 +107,20 @@ Enhancements
      result
      result.loc[:,:,'ItemA']
 
+- Added optional `infer_datetime_format` to `read_csv`, `Series.from_csv` and
+  `DataFrame.from_csv` (:issue:`5490`)
+
+  If `parse_dates` is enabled and this flag is set, pandas will attempt to
+  infer the format of the datetime strings in the columns, and if it can
+  be inferred, switch to a faster method of parsing them.  In some cases
+  this can increase the parsing speed by ~5-10x.
+
+  .. ipython:: python
+
+     # Try to infer the format for the index column
+     df = pd.read_csv('foo.csv', index_col=0, parse_dates=True,
+                      infer_datetime_format=True)
+
 Experimental
 ~~~~~~~~~~~~
 
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -947,7 +947,8 @@ def _from_arrays(cls, arrays, columns, index, dtype=None):
 
     @classmethod
     def from_csv(cls, path, header=0, sep=',', index_col=0,
-                 parse_dates=True, encoding=None, tupleize_cols=False):
+                 parse_dates=True, encoding=None, tupleize_cols=False,
+                 infer_datetime_format=False):
         """
         Read delimited file into DataFrame
 
@@ -966,6 +967,10 @@ def from_csv(cls, path, header=0, sep=',', index_col=0,
         tupleize_cols : boolean, default False
             write multi_index columns as a list of tuples (if True)
             or new (expanded format) if False)
+        infer_datetime_format : boolean, default False
+            If True and `parse_dates` is True for a column, try to infer the
+            datetime format based on the first datetime string. If the format
+            can be inferred, there often will be a large parsing speed-up.
 
         Notes
         -----
@@ -980,7 +985,8 @@ def from_csv(cls, path, header=0, sep=',', index_col=0,
         from pandas.io.parsers import read_table
         return read_table(path, header=header, sep=sep,
                           parse_dates=parse_dates, index_col=index_col,
-                          encoding=encoding, tupleize_cols=tupleize_cols)
+                          encoding=encoding, tupleize_cols=tupleize_cols,
+                          infer_datetime_format=infer_datetime_format)
 
     def to_sparse(self, fill_value=None, kind='block'):
         """
diff --git a/pandas/core/series.py b/pandas/core/series.py
@@ -2178,7 +2178,7 @@ def between(self, left, right, inclusive=True):
 
     @classmethod
     def from_csv(cls, path, sep=',', parse_dates=True, header=None,
-                 index_col=0, encoding=None):
+                 index_col=0, encoding=None, infer_datetime_format=False):
         """
         Read delimited file into Series
 
@@ -2197,6 +2197,10 @@ def from_csv(cls, path, sep=',', parse_dates=True, header=None,
         encoding : string, optional
             a string representing the encoding to use if the contents are
             non-ascii, for python versions prior to 3
+        infer_datetime_format : boolean, default False
+            If True and `parse_dates` is True for a column, try to infer the
+            datetime format based on the first datetime string. If the format
+            can be inferred, there often will be a large parsing speed-up.
 
         Returns
         -------
@@ -2205,7 +2209,8 @@ def from_csv(cls, path, sep=',', parse_dates=True, header=None,
         from pandas.core.frame import DataFrame
         df = DataFrame.from_csv(path, header=header, index_col=index_col,
                                 sep=sep, parse_dates=parse_dates,
-                                encoding=encoding)
+                                encoding=encoding,
+                                infer_datetime_format=infer_datetime_format)
         result = df.icol(0)
         result.index.name = result.name = None
         return result
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
@@ -16,6 +16,7 @@
 from pandas.core.config import get_option
 from pandas.io.date_converters import generic_parser
 from pandas.io.common import get_filepath_or_buffer
+from pandas.tseries import tools
 
 from pandas.util.decorators import Appender
 
@@ -143,6 +144,9 @@
 warn_bad_lines: boolean, default True
     If error_bad_lines is False, and warn_bad_lines is True, a warning for each
     "bad line" will be output. (Only valid with C parser).
+infer_datetime_format : boolean, default False
+    If True and parse_dates is enabled for a column, attempt to infer
+    the datetime format to speed up the processing.
 
 Returns
 -------
@@ -262,6 +266,7 @@ def _read(filepath_or_buffer, kwds):
     'compression': None,
     'mangle_dupe_cols': True,
     'tupleize_cols': False,
+    'infer_datetime_format': False,
 }
 
 
@@ -349,7 +354,8 @@ def parser_f(filepath_or_buffer,
                  encoding=None,
                  squeeze=False,
                  mangle_dupe_cols=True,
-                 tupleize_cols=False):
+                 tupleize_cols=False,
+                 infer_datetime_format=False):
 
         # Alias sep -> delimiter.
         if delimiter is None:
@@ -408,7 +414,8 @@ def parser_f(filepath_or_buffer,
                     low_memory=low_memory,
                     buffer_lines=buffer_lines,
                     mangle_dupe_cols=mangle_dupe_cols,
-                    tupleize_cols=tupleize_cols)
+                    tupleize_cols=tupleize_cols,
+                    infer_datetime_format=infer_datetime_format)
 
         return _read(filepath_or_buffer, kwds)
 
@@ -665,9 +672,13 @@ def __init__(self, kwds):
         self.true_values = kwds.get('true_values')
         self.false_values = kwds.get('false_values')
         self.tupleize_cols = kwds.get('tupleize_cols', False)
+        self.infer_datetime_format = kwds.pop('infer_datetime_format', False)
 
-        self._date_conv = _make_date_converter(date_parser=self.date_parser,
-                                               dayfirst=self.dayfirst)
+        self._date_conv = _make_date_converter(
+            date_parser=self.date_parser,
+            dayfirst=self.dayfirst,
+            infer_datetime_format=self.infer_datetime_format
+        )
 
         # validate header options for mi
         self.header = kwds.get('header')
@@ -1178,6 +1189,10 @@ def TextParser(*args, **kwds):
         Encoding to use for UTF when reading/writing (ex. 'utf-8')
     squeeze : boolean, default False
         returns Series if only one column
+    infer_datetime_format : boolean, default False
+        If True and `parse_dates` is True for a column, try to infer the
+        datetime format based on the first datetime string. If the format
+        can be inferred, there often will be a large parsing speed-up.
     """
     kwds['engine'] = 'python'
     return TextFileReader(*args, **kwds)
@@ -1870,13 +1885,19 @@ def _get_lines(self, rows=None):
         return self._check_thousands(lines)
 
 
-def _make_date_converter(date_parser=None, dayfirst=False):
+def _make_date_converter(date_parser=None, dayfirst=False,
+                         infer_datetime_format=False):
     def converter(*date_cols):
         if date_parser is None:
             strs = _concat_date_cols(date_cols)
             try:
-                return tslib.array_to_datetime(com._ensure_object(strs),
-                                               utc=None, dayfirst=dayfirst)
+                return tools.to_datetime(
+                    com._ensure_object(strs),
+                    utc=None,
+                    box=False,
+                    dayfirst=dayfirst,
+                    infer_datetime_format=infer_datetime_format
+                )
             except:
                 return lib.try_parse_dates(strs, dayfirst=dayfirst)
         else:
diff --git a/vb_suite/io_bench.py b/vb_suite/io_bench.py
@@ -98,3 +98,36 @@ def create_cols(name):
 
 frame_to_csv_date_formatting = Benchmark(stmt, setup,
                                      start_date=datetime(2013, 9, 1))
+
+#----------------------------------------------------------------------
+# infer datetime format
+
+setup = common_setup + """
+rng = date_range('1/1/2000', periods=1000)
+data = '\\n'.join(rng.map(lambda x: x.strftime("%Y-%m-%d %H:%M:%S")))
+"""
+
+stmt = ("read_csv(StringIO(data), header=None, names=['foo'], "
+        "         parse_dates=['foo'], infer_datetime_format=True)")
+
+read_csv_infer_datetime_format_iso8601 = Benchmark(stmt, setup)
+
+setup = common_setup + """
+rng = date_range('1/1/2000', periods=1000)
+data = '\\n'.join(rng.map(lambda x: x.strftime("%Y%m%d")))
+"""
+
+stmt = ("read_csv(StringIO(data), header=None, names=['foo'], "
+        "         parse_dates=['foo'], infer_datetime_format=True)")
+
+read_csv_infer_datetime_format_ymd = Benchmark(stmt, setup)
+
+setup = common_setup + """
+rng = date_range('1/1/2000', periods=1000)
+data = '\\n'.join(rng.map(lambda x: x.strftime("%m/%d/%Y %H:%M:%S.%f")))
+"""
+
+stmt = ("read_csv(StringIO(data), header=None, names=['foo'], "
+        "         parse_dates=['foo'], infer_datetime_format=True)")
+
+read_csv_infer_datetime_format_custom = Benchmark(stmt, setup)