diff --git a/doc/source/io.rst b/doc/source/io.rst index faeea9d448cf2..e72224c6fa1fe 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -2767,6 +2767,20 @@ indices to be parsed. read_excel('path_to_file.xls', 'Sheet1', parse_cols=[0, 2, 3]) + +Parsing Dates ++++++++++++++ + +Datetime-like values are normally automatically converted to the appropriate +dtype when reading the excel file. But if you have a column of strings that +*look* like dates (but are not actually formatted as dates in excel), you can +use the `parse_dates` keyword to parse those strings to datetimes: + +.. code-block:: python + + read_excel('path_to_file.xls', 'Sheet1', parse_dates=['date_strings']) + + Cell Converters +++++++++++++++ diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 3ab69e1ff409b..fdf34e0d11572 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -270,7 +270,7 @@ To convert a ``SparseDataFrame`` back to sparse SciPy matrix in COO format, you .. _whatsnew_0200.enhancements.other: -Other enhancements +Other Enhancements ^^^^^^^^^^^^^^^^^^ - Integration with the ``feather-format``, including a new top-level ``pd.read_feather()`` and ``DataFrame.to_feather()`` method, see :ref:`here `. @@ -314,6 +314,7 @@ Other enhancements - ``pd.types.concat.union_categoricals`` gained the ``ignore_ordered`` argument to allow ignoring the ordered attribute of unioned categoricals (:issue:`13410`). See the :ref:`categorical union docs ` for more information. - ``pandas.io.json.json_normalize()`` with an empty ``list`` will return an empty ``DataFrame`` (:issue:`15534`) - ``pd.DataFrame.to_latex`` and ``pd.DataFrame.to_string`` now allow optional header aliases. (:issue:`15536`) +- Re-enable the ``parse_dates`` keyword of ``read_excel`` to parse string columns as dates (:issue:`14326`) .. _ISO 8601 duration: https://en.wikipedia.org/wiki/ISO_8601#Durations diff --git a/pandas/io/excel.py b/pandas/io/excel.py index 82ea2e8a46592..d324855bc2f4d 100644 --- a/pandas/io/excel.py +++ b/pandas/io/excel.py @@ -343,13 +343,10 @@ def _parse_excel(self, sheetname=0, header=0, skiprows=None, names=None, if 'chunksize' in kwds: raise NotImplementedError("chunksize keyword of read_excel " "is not implemented") - if parse_dates: - raise NotImplementedError("parse_dates keyword of read_excel " - "is not implemented") - if date_parser is not None: - raise NotImplementedError("date_parser keyword of read_excel " - "is not implemented") + if parse_dates is True and index_col is None: + warn("The 'parse_dates=True' keyword of read_excel was provided" + " without an 'index_col' keyword value.") import xlrd from xlrd import (xldate, XL_CELL_DATE, diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 45c62b224ef4e..30b88de91ef76 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -1176,13 +1176,18 @@ def _should_parse_dates(self, i): if isinstance(self.parse_dates, bool): return self.parse_dates else: - name = self.index_names[i] + if self.index_names is not None: + name = self.index_names[i] + else: + name = None j = self.index_col[i] if is_scalar(self.parse_dates): - return (j == self.parse_dates) or (name == self.parse_dates) + return ((j == self.parse_dates) or + (name is not None and name == self.parse_dates)) else: - return (j in self.parse_dates) or (name in self.parse_dates) + return ((j in self.parse_dates) or + (name is not None and name in self.parse_dates)) def _extract_multi_indexer_columns(self, header, index_names, col_names, passed_names=False): @@ -1352,6 +1357,7 @@ def _get_name(icol): def _agg_index(self, index, try_parse_dates=True): arrays = [] + for i, arr in enumerate(index): if (try_parse_dates and self._should_parse_dates(i)): @@ -1512,6 +1518,7 @@ def _cast_types(self, values, cast_type, column): def _do_date_conversions(self, names, data): # returns data, columns + if self.parse_dates is not None: data, names = _process_date_conversion( data, self._date_conv, self.parse_dates, self.index_col, diff --git a/pandas/tests/io/test_excel.py b/pandas/tests/io/test_excel.py index b66cb24bf44d8..256a37e922177 100644 --- a/pandas/tests/io/test_excel.py +++ b/pandas/tests/io/test_excel.py @@ -924,17 +924,32 @@ def test_read_excel_chunksize(self): chunksize=100) def test_read_excel_parse_dates(self): - # GH 11544 - with tm.assertRaises(NotImplementedError): - pd.read_excel(os.path.join(self.dirpath, 'test1' + self.ext), - parse_dates=True) + # GH 11544, 12051 - def test_read_excel_date_parser(self): - # GH 11544 - with tm.assertRaises(NotImplementedError): - dateparse = lambda x: pd.datetime.strptime(x, '%Y-%m-%d %H:%M:%S') - pd.read_excel(os.path.join(self.dirpath, 'test1' + self.ext), - date_parser=dateparse) + df = DataFrame( + {'col': [1, 2, 3], + 'date_strings': pd.date_range('2012-01-01', periods=3)}) + df2 = df.copy() + df2['date_strings'] = df2['date_strings'].dt.strftime('%m/%d/%Y') + + with ensure_clean(self.ext) as pth: + df2.to_excel(pth) + + res = read_excel(pth) + tm.assert_frame_equal(df2, res) + + # no index_col specified when parse_dates is True + with tm.assert_produces_warning(): + res = read_excel(pth, parse_dates=True) + tm.assert_frame_equal(df2, res) + + res = read_excel(pth, parse_dates=['date_strings'], index_col=0) + tm.assert_frame_equal(df, res) + + dateparser = lambda x: pd.datetime.strptime(x, '%m/%d/%Y') + res = read_excel(pth, parse_dates=['date_strings'], + date_parser=dateparser, index_col=0) + tm.assert_frame_equal(df, res) def test_read_excel_skiprows_list(self): # GH 4903 @@ -1382,8 +1397,7 @@ def test_to_excel_multiindex(self): # round trip frame.to_excel(path, 'test1', merge_cells=self.merge_cells) reader = ExcelFile(path) - df = read_excel(reader, 'test1', index_col=[0, 1], - parse_dates=False) + df = read_excel(reader, 'test1', index_col=[0, 1]) tm.assert_frame_equal(frame, df) # GH13511 @@ -1424,8 +1438,7 @@ def test_to_excel_multiindex_cols(self): frame.to_excel(path, 'test1', merge_cells=self.merge_cells) reader = ExcelFile(path) df = read_excel(reader, 'test1', header=header, - index_col=[0, 1], - parse_dates=False) + index_col=[0, 1]) if not self.merge_cells: fm = frame.columns.format(sparsify=False, adjoin=False, names=False)