Skip to content

Commit 879f270

Browse files
committed
PERF: Add infer_datetime_format to read_csv() pandas-dev#5490
This allows read_csv() to attempt to infer the datetime format for any columns where parse_dates is enabled. In cases where the datetime format can be inferred, this should speed up processing datetimes by ~10x. Additionally add documentation and benchmarks for read_csv().
1 parent 78bb467 commit 879f270

File tree

6 files changed

+124
-11
lines changed

6 files changed

+124
-11
lines changed

doc/source/io.rst

+34
Original file line numberDiff line numberDiff line change
@@ -500,6 +500,40 @@ a single date rather than the entire array.
500500
501501
.. _io.dayfirst:
502502

503+
504+
Inferring Datetime Format
505+
~~~~~~~~~~~~~~~~~~~~~~~~~
506+
If you have `parse_dates` enabled for some or all of your columns, and your
507+
datetime strings are all formatted the same way, you may get a large speed
508+
up by setting `infer_datetime_format=True`. If set, pandas will attempt
509+
to guess the format of your datetime strings, and then use a faster means
510+
of parsing the strings. Parsing speed-ups of 5-10x have been observed. Pandas
511+
will fall back to the usual parsing if either the format cannot be guessed
512+
or the format that was guessed cannot properly parse the entire column
513+
of strings. So in general, `infer_datetime_format` should not have any
514+
negative consequences if enabled.
515+
516+
Here are some examples of datetime strings that can be guessed (All
517+
representing December 30th, 2011 at 00:00:00):
518+
519+
"20111230"
520+
"2011/12/30"
521+
"20111230 00:00:00"
522+
"12/30/2011 00:00:00"
523+
"30/Dec/2011 00:00:00"
524+
"30/December/2011 00:00:00"
525+
526+
`infer_datetime_format` is sensitive to `dayfirst`. With `dayfirst=True`, it
527+
will guess "01/12/2011" to be December 1st. With `dayfirst=False` (default)
528+
it will guess "01/12/2011" to be January 12th.
529+
530+
.. ipython:: python
531+
532+
# Try to infer the format for the index column
533+
df = pd.read_csv('foo.csv', index_col=0, parse_dates=True,
534+
infer_datetime_format=True)
535+
536+
503537
International Date Formats
504538
~~~~~~~~~~~~~~~~~~~~~~~~~~
505539
While US date formats tend to be MM/DD/YYYY, many international formats use

doc/source/v0.13.1.txt

+14
Original file line numberDiff line numberDiff line change
@@ -107,6 +107,20 @@ Enhancements
107107
result
108108
result.loc[:,:,'ItemA']
109109

110+
- Added optional `infer_datetime_format` to `read_csv`, `Series.from_csv` and
111+
`DataFrame.from_csv` (:issue:`5490`)
112+
113+
If `parse_dates` is enabled and this flag is set, pandas will attempt to
114+
infer the format of the datetime strings in the columns, and if it can
115+
be inferred, switch to a faster method of parsing them. In some cases
116+
this can increase the parsing speed by ~5-10x.
117+
118+
.. ipython:: python
119+
120+
# Try to infer the format for the index column
121+
df = pd.read_csv('foo.csv', index_col=0, parse_dates=True,
122+
infer_datetime_format=True)
123+
110124
Experimental
111125
~~~~~~~~~~~~
112126

pandas/core/frame.py

+8-2
Original file line numberDiff line numberDiff line change
@@ -947,7 +947,8 @@ def _from_arrays(cls, arrays, columns, index, dtype=None):
947947

948948
@classmethod
949949
def from_csv(cls, path, header=0, sep=',', index_col=0,
950-
parse_dates=True, encoding=None, tupleize_cols=False):
950+
parse_dates=True, encoding=None, tupleize_cols=False,
951+
infer_datetime_format=False):
951952
"""
952953
Read delimited file into DataFrame
953954
@@ -966,6 +967,10 @@ def from_csv(cls, path, header=0, sep=',', index_col=0,
966967
tupleize_cols : boolean, default False
967968
write multi_index columns as a list of tuples (if True)
968969
or new (expanded format) (if False)
970+
infer_datetime_format : boolean, default False
971+
If True and `parse_dates` is True for a column, try to infer the
972+
datetime format based on the first datetime string. If the format
973+
can be inferred, there often will be a large parsing speed-up.
969974
970975
Notes
971976
-----
@@ -980,7 +985,8 @@ def from_csv(cls, path, header=0, sep=',', index_col=0,
980985
from pandas.io.parsers import read_table
981986
return read_table(path, header=header, sep=sep,
982987
parse_dates=parse_dates, index_col=index_col,
983-
encoding=encoding, tupleize_cols=tupleize_cols)
988+
encoding=encoding, tupleize_cols=tupleize_cols,
989+
infer_datetime_format=infer_datetime_format)
984990

985991
def to_sparse(self, fill_value=None, kind='block'):
986992
"""

pandas/core/series.py

+7-2
Original file line numberDiff line numberDiff line change
@@ -2178,7 +2178,7 @@ def between(self, left, right, inclusive=True):
21782178

21792179
@classmethod
21802180
def from_csv(cls, path, sep=',', parse_dates=True, header=None,
2181-
index_col=0, encoding=None):
2181+
index_col=0, encoding=None, infer_datetime_format=False):
21822182
"""
21832183
Read delimited file into Series
21842184
@@ -2197,6 +2197,10 @@ def from_csv(cls, path, sep=',', parse_dates=True, header=None,
21972197
encoding : string, optional
21982198
a string representing the encoding to use if the contents are
21992199
non-ascii, for python versions prior to 3
2200+
infer_datetime_format : boolean, default False
2201+
If True and `parse_dates` is True for a column, try to infer the
2202+
datetime format based on the first datetime string. If the format
2203+
can be inferred, there often will be a large parsing speed-up.
22002204
22012205
Returns
22022206
-------
@@ -2205,7 +2209,8 @@ def from_csv(cls, path, sep=',', parse_dates=True, header=None,
22052209
from pandas.core.frame import DataFrame
22062210
df = DataFrame.from_csv(path, header=header, index_col=index_col,
22072211
sep=sep, parse_dates=parse_dates,
2208-
encoding=encoding)
2212+
encoding=encoding,
2213+
infer_datetime_format=infer_datetime_format)
22092214
result = df.icol(0)
22102215
result.index.name = result.name = None
22112216
return result

pandas/io/parsers.py

+28-7
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
from pandas.core.config import get_option
1717
from pandas.io.date_converters import generic_parser
1818
from pandas.io.common import get_filepath_or_buffer
19+
from pandas.tseries import tools
1920

2021
from pandas.util.decorators import Appender
2122

@@ -143,6 +144,9 @@
143144
warn_bad_lines: boolean, default True
144145
If error_bad_lines is False, and warn_bad_lines is True, a warning for each
145146
"bad line" will be output. (Only valid with C parser).
147+
infer_datetime_format : boolean, default False
148+
If True and parse_dates is enabled for a column, attempt to infer
149+
the datetime format to speed up the processing.
146150
147151
Returns
148152
-------
@@ -262,6 +266,7 @@ def _read(filepath_or_buffer, kwds):
262266
'compression': None,
263267
'mangle_dupe_cols': True,
264268
'tupleize_cols': False,
269+
'infer_datetime_format': False,
265270
}
266271

267272

@@ -349,7 +354,8 @@ def parser_f(filepath_or_buffer,
349354
encoding=None,
350355
squeeze=False,
351356
mangle_dupe_cols=True,
352-
tupleize_cols=False):
357+
tupleize_cols=False,
358+
infer_datetime_format=False):
353359

354360
# Alias sep -> delimiter.
355361
if delimiter is None:
@@ -408,7 +414,8 @@ def parser_f(filepath_or_buffer,
408414
low_memory=low_memory,
409415
buffer_lines=buffer_lines,
410416
mangle_dupe_cols=mangle_dupe_cols,
411-
tupleize_cols=tupleize_cols)
417+
tupleize_cols=tupleize_cols,
418+
infer_datetime_format=infer_datetime_format)
412419

413420
return _read(filepath_or_buffer, kwds)
414421

@@ -665,9 +672,13 @@ def __init__(self, kwds):
665672
self.true_values = kwds.get('true_values')
666673
self.false_values = kwds.get('false_values')
667674
self.tupleize_cols = kwds.get('tupleize_cols', False)
675+
self.infer_datetime_format = kwds.pop('infer_datetime_format', False)
668676

669-
self._date_conv = _make_date_converter(date_parser=self.date_parser,
670-
dayfirst=self.dayfirst)
677+
self._date_conv = _make_date_converter(
678+
date_parser=self.date_parser,
679+
dayfirst=self.dayfirst,
680+
infer_datetime_format=self.infer_datetime_format
681+
)
671682

672683
# validate header options for mi
673684
self.header = kwds.get('header')
@@ -1178,6 +1189,10 @@ def TextParser(*args, **kwds):
11781189
Encoding to use for UTF when reading/writing (ex. 'utf-8')
11791190
squeeze : boolean, default False
11801191
returns Series if only one column
1192+
infer_datetime_format : boolean, default False
1193+
If True and `parse_dates` is True for a column, try to infer the
1194+
datetime format based on the first datetime string. If the format
1195+
can be inferred, there often will be a large parsing speed-up.
11811196
"""
11821197
kwds['engine'] = 'python'
11831198
return TextFileReader(*args, **kwds)
@@ -1870,13 +1885,19 @@ def _get_lines(self, rows=None):
18701885
return self._check_thousands(lines)
18711886

18721887

1873-
def _make_date_converter(date_parser=None, dayfirst=False):
1888+
def _make_date_converter(date_parser=None, dayfirst=False,
1889+
infer_datetime_format=False):
18741890
def converter(*date_cols):
18751891
if date_parser is None:
18761892
strs = _concat_date_cols(date_cols)
18771893
try:
1878-
return tslib.array_to_datetime(com._ensure_object(strs),
1879-
utc=None, dayfirst=dayfirst)
1894+
return tools.to_datetime(
1895+
com._ensure_object(strs),
1896+
utc=None,
1897+
box=False,
1898+
dayfirst=dayfirst,
1899+
infer_datetime_format=infer_datetime_format
1900+
)
18801901
except:
18811902
return lib.try_parse_dates(strs, dayfirst=dayfirst)
18821903
else:

vb_suite/io_bench.py

+33
Original file line numberDiff line numberDiff line change
@@ -98,3 +98,36 @@ def create_cols(name):
9898

9999
frame_to_csv_date_formatting = Benchmark(stmt, setup,
100100
start_date=datetime(2013, 9, 1))
101+
102+
#----------------------------------------------------------------------
103+
# infer datetime format
104+
105+
setup = common_setup + """
106+
rng = date_range('1/1/2000', periods=1000)
107+
data = '\\n'.join(rng.map(lambda x: x.strftime("%Y-%m-%d %H:%M:%S")))
108+
"""
109+
110+
stmt = ("read_csv(StringIO(data), header=None, names=['foo'], "
111+
" parse_dates=['foo'], infer_datetime_format=True)")
112+
113+
read_csv_infer_datetime_format_iso8601 = Benchmark(stmt, setup)
114+
115+
setup = common_setup + """
116+
rng = date_range('1/1/2000', periods=1000)
117+
data = '\\n'.join(rng.map(lambda x: x.strftime("%Y%m%d")))
118+
"""
119+
120+
stmt = ("read_csv(StringIO(data), header=None, names=['foo'], "
121+
" parse_dates=['foo'], infer_datetime_format=True)")
122+
123+
read_csv_infer_datetime_format_ymd = Benchmark(stmt, setup)
124+
125+
setup = common_setup + """
126+
rng = date_range('1/1/2000', periods=1000)
127+
data = '\\n'.join(rng.map(lambda x: x.strftime("%m/%d/%Y %H:%M:%S.%f")))
128+
"""
129+
130+
stmt = ("read_csv(StringIO(data), header=None, names=['foo'], "
131+
" parse_dates=['foo'], infer_datetime_format=True)")
132+
133+
read_csv_infer_datetime_format_custom = Benchmark(stmt, setup)

0 commit comments

Comments
 (0)