From 85044565b15cc3c11e1bb57b4ae2ac7bbe4b9332 Mon Sep 17 00:00:00 2001 From: Christer van der Meeren Date: Fri, 30 Jan 2015 12:12:33 +0100 Subject: [PATCH] DOC: Clarify how date_parser is called (GH9376) --- doc/source/io.rst | 27 ++++++++++++++++++++++++++- pandas/io/parsers.py | 7 ++++++- 2 files changed, 32 insertions(+), 2 deletions(-) diff --git a/doc/source/io.rst b/doc/source/io.rst index d5bbddfeb7d37..e39798434d96c 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -563,7 +563,7 @@ writing to a file). For example: Date Parsing Functions ~~~~~~~~~~~~~~~~~~~~~~ -Finally, the parser allows you can specify a custom ``date_parser`` function to +Finally, the parser allows you to specify a custom ``date_parser`` function to take full advantage of the flexibility of the date parsing API: .. ipython:: python @@ -573,6 +573,31 @@ take full advantage of the flexibility of the date parsing API: date_parser=conv.parse_date_time) df +Pandas will try to call the ``date_parser`` function in three different ways. If +an exception is raised, the next one is tried: + +1. ``date_parser`` is first called with one or more arrays as arguments, + as defined using ``parse_dates`` (e.g., ``date_parser(['2013', '2013'], ['1', '2'])``) + +2. If #1 fails, ``date_parser`` is called with all the columns + concatenated row-wise into a single array (e.g., ``date_parser(['2013 1', '2013 2'])``) + +3. If #2 fails, ``date_parser`` is called once for every row with one or more + string arguments from the columns indicated with ``parse_dates`` + (e.g., ``date_parser('2013', '1')`` for the first row, ``date_parser('2013', '2')`` + for the second, etc.) + +Note that performance-wise, you should try these methods of parsing dates in order: + +1. Try to infer the format using ``infer_datetime_format=True`` (see section below) + +2. If you know the format, use ``pd.to_datetime()``: + ``date_parser=lambda x: pd.to_datetime(x, format=...)`` + +3. 
If you have a really non-standard format, use a custom ``date_parser`` function. + For optimal performance, this should be vectorized, i.e., it should accept arrays + as arguments. + You can explore the date parsing functionality in ``date_converters.py`` and add your own. We would love to turn this module into a community supported set of date/time parsers. To get you started, ``date_converters.py`` contains diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index b23aa017138e1..637612d5fb09d 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -104,7 +104,12 @@ class ParserWarning(Warning): date_parser : function Function to use for converting a sequence of string columns to an array of datetime instances. The default uses dateutil.parser.parser - to do the conversion. + to do the conversion. Pandas will try to call date_parser in three different + ways, advancing to the next if an exception occurs: 1) pass one or more arrays + (as defined by parse_dates) as arguments; 2) concatenate (row-wise) the string + values from the columns defined by parse_dates into a single array and pass + that; and 3) call date_parser once for each row using one or more strings + (corresponding to the columns defined by parse_dates) as arguments. dayfirst : boolean, default False DD/MM format dates, international and European format thousands : str, default None