diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 043394ded42e9..dd774417fc5f9 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -294,9 +294,9 @@ date_parser : function, default ``None`` .. deprecated:: 2.0.0 Use ``date_format`` instead, or read in as ``object`` and then apply :func:`to_datetime` as-needed. -date_format : str, default ``None`` +date_format : str or dict of column -> format, default ``None`` If used in conjunction with ``parse_dates``, will parse dates according to this - format. For anything more complex (e.g. different formats for different columns), + format. For anything more complex, please read in as ``object`` and then apply :func:`to_datetime` as-needed. .. versionadded:: 2.0.0 @@ -912,7 +912,7 @@ Finally, the parser allows you to specify a custom ``date_format``. Performance-wise, you should try these methods of parsing dates in order: 1. If you know the format, use ``date_format``, e.g.: - ``date_format="%d/%m/%Y"``. + ``date_format="%d/%m/%Y"`` or ``date_format={column_name: "%d/%m/%Y"}``. 2. If you different formats for different columns, or want to pass any extra options (such as ``utc``) to ``to_datetime``, then you should read in your data as ``object`` dtype, and diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index e2ff1f61abcc2..4463dc5d672a9 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -254,9 +254,9 @@ .. deprecated:: 2.0.0 Use ``date_format`` instead, or read in as ``object`` and then apply :func:`to_datetime` as-needed. -date_format : str, default ``None`` +date_format : str or dict of column -> format, default ``None`` If used in conjunction with ``parse_dates``, will parse dates according to this - format. For anything more complex (e.g. different formats for different columns), + format. For anything more complex, please read in as ``object`` and then apply :func:`to_datetime` as-needed. .. versionadded:: 2.0.0 @@ -397,7 +397,7 @@ def read_excel( verbose: bool = ..., parse_dates: list | dict | bool = ..., date_parser: Callable | lib.NoDefault = ..., - date_format: str | None = ..., + date_format: dict[Hashable, str] | str | None = ..., thousands: str | None = ..., decimal: str = ..., comment: str | None = ..., @@ -437,7 +437,7 @@ def read_excel( verbose: bool = ..., parse_dates: list | dict | bool = ..., date_parser: Callable | lib.NoDefault = ..., - date_format: str | None = ..., + date_format: dict[Hashable, str] | str | None = ..., thousands: str | None = ..., decimal: str = ..., comment: str | None = ..., @@ -477,7 +477,7 @@ def read_excel( verbose: bool = False, parse_dates: list | dict | bool = False, date_parser: Callable | lib.NoDefault = lib.no_default, - date_format: str | None = None, + date_format: dict[Hashable, str] | str | None = None, thousands: str | None = None, decimal: str = ".", comment: str | None = None, @@ -726,7 +726,7 @@ def parse( verbose: bool = False, parse_dates: list | dict | bool = False, date_parser: Callable | lib.NoDefault = lib.no_default, - date_format: str | None = None, + date_format: dict[Hashable, str] | str | None = None, thousands: str | None = None, decimal: str = ".", comment: str | None = None, @@ -1554,7 +1554,7 @@ def parse( na_values=None, parse_dates: list | dict | bool = False, date_parser: Callable | lib.NoDefault = lib.no_default, - date_format: str | None = None, + date_format: str | dict[Hashable, str] | None = None, thousands: str | None = None, comment: str | None = None, skipfooter: int = 0, diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index a7abaeba5766e..090f62b932a2b 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -455,7 +455,10 @@ def _agg_index(self, index, try_parse_dates: bool = True) -> Index: for i, arr in enumerate(index): if try_parse_dates and self._should_parse_dates(i): - arr = self._date_conv(arr) + arr = self._date_conv( + arr, + col=self.index_names[i] if self.index_names is not None else None, + ) if self.na_filter: col_na_values = self.na_values @@ -1094,7 +1097,7 @@ def _make_date_converter( date_parser=lib.no_default, dayfirst: bool = False, cache_dates: bool = True, - date_format: str | None = None, + date_format: dict[Hashable, str] | str | None = None, ): if date_parser is not lib.no_default: warnings.warn( @@ -1108,13 +1111,16 @@ def _make_date_converter( if date_parser is not lib.no_default and date_format is not None: raise TypeError("Cannot use both 'date_parser' and 'date_format'") - def converter(*date_cols): + def converter(*date_cols, col: Hashable): if date_parser is lib.no_default: strs = parsing.concat_date_cols(date_cols) + date_fmt = ( + date_format.get(col) if isinstance(date_format, dict) else date_format + ) return tools.to_datetime( ensure_object(strs), - format=date_format, + format=date_fmt, utc=False, dayfirst=dayfirst, errors="ignore", @@ -1218,7 +1224,9 @@ def _isindex(colspec): continue # Pyarrow engine returns Series which we need to convert to # numpy array before converter, its a no-op for other parsers - data_dict[colspec] = converter(np.asarray(data_dict[colspec])) + data_dict[colspec] = converter( + np.asarray(data_dict[colspec]), col=colspec + ) else: new_name, col, old_names = _try_convert_dates( converter, colspec, data_dict, orig_names @@ -1279,7 +1287,7 @@ def _try_convert_dates(parser: Callable, colspec, data_dict, columns): new_name = "_".join([str(x) for x in colnames]) to_parse = [np.asarray(data_dict[c]) for c in colnames if c in data_dict] - new_col = parser(*to_parse) + new_col = parser(*to_parse, col=new_name) return new_name, new_col, colnames diff --git a/pandas/io/parsers/c_parser_wrapper.py b/pandas/io/parsers/c_parser_wrapper.py index dbf36b830971b..c10f1811751cd 100644 --- a/pandas/io/parsers/c_parser_wrapper.py +++ b/pandas/io/parsers/c_parser_wrapper.py @@ -355,7 +355,10 @@ def _get_index_names(self): def _maybe_parse_dates(self, values, index: int, try_parse_dates: bool = True): if try_parse_dates and self._should_parse_dates(index): - values = self._date_conv(values) + values = self._date_conv( + values, + col=self.index_names[index] if self.index_names is not None else None, + ) return values diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index 28a005df19442..92d0256c242d1 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -265,9 +265,9 @@ .. deprecated:: 2.0.0 Use ``date_format`` instead, or read in as ``object`` and then apply :func:`to_datetime` as-needed. -date_format : str, default ``None`` +date_format : str or dict of column -> format, default ``None`` If used in conjunction with ``parse_dates``, will parse dates according to this - format. For anything more complex (e.g. different formats for different columns), + format. For anything more complex, please read in as ``object`` and then apply :func:`to_datetime` as-needed. .. versionadded:: 2.0.0 @@ -1794,7 +1794,7 @@ def TextParser(*args, **kwds) -> TextFileReader: date_parser : function, optional .. deprecated:: 2.0.0 - date_format : str, default ``None`` + date_format : str or dict of column -> format, default ``None`` .. versionadded:: 2.0.0 skiprows : list of integers diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py index f9e152a25081d..1106f699b80f8 100644 --- a/pandas/tests/io/parser/test_parse_dates.py +++ b/pandas/tests/io/parser/test_parse_dates.py @@ -2155,3 +2155,66 @@ def test_parse_dot_separated_dates(all_parsers): ) expected = DataFrame({"b": [1, 2]}, index=expected_index) tm.assert_frame_equal(result, expected) + + +def test_parse_dates_dict_format(all_parsers): + # GH#51240 + parser = all_parsers + data = """a,b +2019-12-31,31-12-2019 +2020-12-31,31-12-2020""" + + result = parser.read_csv( + StringIO(data), + date_format={"a": "%Y-%m-%d", "b": "%d-%m-%Y"}, + parse_dates=["a", "b"], + ) + expected = DataFrame( + { + "a": [Timestamp("2019-12-31"), Timestamp("2020-12-31")], + "b": [Timestamp("2019-12-31"), Timestamp("2020-12-31")], + } + ) + tm.assert_frame_equal(result, expected) + + +@skip_pyarrow +@pytest.mark.parametrize( + "key, parse_dates", [("a_b", [[0, 1]]), ("foo", {"foo": [0, 1]})] +) +def test_parse_dates_dict_format_two_columns(all_parsers, key, parse_dates): + # GH#51240 + parser = all_parsers + data = """a,b +31-,12-2019 +31-,12-2020""" + + result = parser.read_csv( + StringIO(data), date_format={key: "%d- %m-%Y"}, parse_dates=parse_dates + ) + expected = DataFrame( + { + key: [Timestamp("2019-12-31"), Timestamp("2020-12-31")], + } + ) + tm.assert_frame_equal(result, expected) + + +@skip_pyarrow +def test_parse_dates_dict_format_index(all_parsers): + # GH#51240 + parser = all_parsers + data = """a,b +2019-12-31,31-12-2019 +2020-12-31,31-12-2020""" + + result = parser.read_csv( + StringIO(data), date_format={"a": "%Y-%m-%d"}, parse_dates=True, index_col=0 + ) + expected = DataFrame( + { + "b": ["31-12-2019", "31-12-2020"], + }, + index=Index([Timestamp("2019-12-31"), Timestamp("2020-12-31")], name="a"), + ) + tm.assert_frame_equal(result, expected)