Skip to content

Commit 7598f71

Browse files
authored
ENH: Add dict support for read_csv date_format (#51418)
* ENH: Add dict support for read_csv date_format
* Fix converter
* fixup mypy / some docs stuff
* Add gh ref
---------
Co-authored-by: MarcoGorelli <>
1 parent 6057d7a commit 7598f71

File tree

6 files changed

+94
-20
lines changed

6 files changed

+94
-20
lines changed

doc/source/user_guide/io.rst

+3-3
Original file line numberDiff line numberDiff line change
@@ -294,9 +294,9 @@ date_parser : function, default ``None``
294294
.. deprecated:: 2.0.0
295295
Use ``date_format`` instead, or read in as ``object`` and then apply
296296
:func:`to_datetime` as-needed.
297-
date_format : str, default ``None``
297+
date_format : str or dict of column -> format, default ``None``
298298
If used in conjunction with ``parse_dates``, will parse dates according to this
299-
format. For anything more complex (e.g. different formats for different columns),
299+
format. For anything more complex,
300300
please read in as ``object`` and then apply :func:`to_datetime` as-needed.
301301

302302
.. versionadded:: 2.0.0
@@ -912,7 +912,7 @@ Finally, the parser allows you to specify a custom ``date_format``.
912912
Performance-wise, you should try these methods of parsing dates in order:
913913

914914
1. If you know the format, use ``date_format``, e.g.:
915-
``date_format="%d/%m/%Y"``.
915+
``date_format="%d/%m/%Y"`` or ``date_format={column_name: "%d/%m/%Y"}``.
916916

917917
2. If you have different formats for different columns, or want to pass any extra options (such
918918
as ``utc``) to ``to_datetime``, then you should read in your data as ``object`` dtype, and

pandas/io/excel/_base.py

+7-7
Original file line numberDiff line numberDiff line change
@@ -254,9 +254,9 @@
254254
.. deprecated:: 2.0.0
255255
Use ``date_format`` instead, or read in as ``object`` and then apply
256256
:func:`to_datetime` as-needed.
257-
date_format : str, default ``None``
257+
date_format : str or dict of column -> format, default ``None``
258258
If used in conjunction with ``parse_dates``, will parse dates according to this
259-
format. For anything more complex (e.g. different formats for different columns),
259+
format. For anything more complex,
260260
please read in as ``object`` and then apply :func:`to_datetime` as-needed.
261261
262262
.. versionadded:: 2.0.0
@@ -397,7 +397,7 @@ def read_excel(
397397
verbose: bool = ...,
398398
parse_dates: list | dict | bool = ...,
399399
date_parser: Callable | lib.NoDefault = ...,
400-
date_format: str | None = ...,
400+
date_format: dict[Hashable, str] | str | None = ...,
401401
thousands: str | None = ...,
402402
decimal: str = ...,
403403
comment: str | None = ...,
@@ -437,7 +437,7 @@ def read_excel(
437437
verbose: bool = ...,
438438
parse_dates: list | dict | bool = ...,
439439
date_parser: Callable | lib.NoDefault = ...,
440-
date_format: str | None = ...,
440+
date_format: dict[Hashable, str] | str | None = ...,
441441
thousands: str | None = ...,
442442
decimal: str = ...,
443443
comment: str | None = ...,
@@ -477,7 +477,7 @@ def read_excel(
477477
verbose: bool = False,
478478
parse_dates: list | dict | bool = False,
479479
date_parser: Callable | lib.NoDefault = lib.no_default,
480-
date_format: str | None = None,
480+
date_format: dict[Hashable, str] | str | None = None,
481481
thousands: str | None = None,
482482
decimal: str = ".",
483483
comment: str | None = None,
@@ -726,7 +726,7 @@ def parse(
726726
verbose: bool = False,
727727
parse_dates: list | dict | bool = False,
728728
date_parser: Callable | lib.NoDefault = lib.no_default,
729-
date_format: str | None = None,
729+
date_format: dict[Hashable, str] | str | None = None,
730730
thousands: str | None = None,
731731
decimal: str = ".",
732732
comment: str | None = None,
@@ -1554,7 +1554,7 @@ def parse(
15541554
na_values=None,
15551555
parse_dates: list | dict | bool = False,
15561556
date_parser: Callable | lib.NoDefault = lib.no_default,
1557-
date_format: str | None = None,
1557+
date_format: str | dict[Hashable, str] | None = None,
15581558
thousands: str | None = None,
15591559
comment: str | None = None,
15601560
skipfooter: int = 0,

pandas/io/parsers/base_parser.py

+14-6
Original file line numberDiff line numberDiff line change
@@ -455,7 +455,10 @@ def _agg_index(self, index, try_parse_dates: bool = True) -> Index:
455455

456456
for i, arr in enumerate(index):
457457
if try_parse_dates and self._should_parse_dates(i):
458-
arr = self._date_conv(arr)
458+
arr = self._date_conv(
459+
arr,
460+
col=self.index_names[i] if self.index_names is not None else None,
461+
)
459462

460463
if self.na_filter:
461464
col_na_values = self.na_values
@@ -1094,7 +1097,7 @@ def _make_date_converter(
10941097
date_parser=lib.no_default,
10951098
dayfirst: bool = False,
10961099
cache_dates: bool = True,
1097-
date_format: str | None = None,
1100+
date_format: dict[Hashable, str] | str | None = None,
10981101
):
10991102
if date_parser is not lib.no_default:
11001103
warnings.warn(
@@ -1108,13 +1111,16 @@ def _make_date_converter(
11081111
if date_parser is not lib.no_default and date_format is not None:
11091112
raise TypeError("Cannot use both 'date_parser' and 'date_format'")
11101113

1111-
def converter(*date_cols):
1114+
def converter(*date_cols, col: Hashable):
11121115
if date_parser is lib.no_default:
11131116
strs = parsing.concat_date_cols(date_cols)
1117+
date_fmt = (
1118+
date_format.get(col) if isinstance(date_format, dict) else date_format
1119+
)
11141120

11151121
return tools.to_datetime(
11161122
ensure_object(strs),
1117-
format=date_format,
1123+
format=date_fmt,
11181124
utc=False,
11191125
dayfirst=dayfirst,
11201126
errors="ignore",
@@ -1218,7 +1224,9 @@ def _isindex(colspec):
12181224
continue
12191225
# Pyarrow engine returns Series which we need to convert to
12201226
# numpy array before converter, it's a no-op for other parsers
1221-
data_dict[colspec] = converter(np.asarray(data_dict[colspec]))
1227+
data_dict[colspec] = converter(
1228+
np.asarray(data_dict[colspec]), col=colspec
1229+
)
12221230
else:
12231231
new_name, col, old_names = _try_convert_dates(
12241232
converter, colspec, data_dict, orig_names
@@ -1279,7 +1287,7 @@ def _try_convert_dates(parser: Callable, colspec, data_dict, columns):
12791287
new_name = "_".join([str(x) for x in colnames])
12801288
to_parse = [np.asarray(data_dict[c]) for c in colnames if c in data_dict]
12811289

1282-
new_col = parser(*to_parse)
1290+
new_col = parser(*to_parse, col=new_name)
12831291
return new_name, new_col, colnames
12841292

12851293

pandas/io/parsers/c_parser_wrapper.py

+4-1
Original file line numberDiff line numberDiff line change
@@ -355,7 +355,10 @@ def _get_index_names(self):
355355

356356
def _maybe_parse_dates(self, values, index: int, try_parse_dates: bool = True):
357357
if try_parse_dates and self._should_parse_dates(index):
358-
values = self._date_conv(values)
358+
values = self._date_conv(
359+
values,
360+
col=self.index_names[index] if self.index_names is not None else None,
361+
)
359362
return values
360363

361364

pandas/io/parsers/readers.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -265,9 +265,9 @@
265265
.. deprecated:: 2.0.0
266266
Use ``date_format`` instead, or read in as ``object`` and then apply
267267
:func:`to_datetime` as-needed.
268-
date_format : str, default ``None``
268+
date_format : str or dict of column -> format, default ``None``
269269
If used in conjunction with ``parse_dates``, will parse dates according to this
270-
format. For anything more complex (e.g. different formats for different columns),
270+
format. For anything more complex,
271271
please read in as ``object`` and then apply :func:`to_datetime` as-needed.
272272
273273
.. versionadded:: 2.0.0
@@ -1794,7 +1794,7 @@ def TextParser(*args, **kwds) -> TextFileReader:
17941794
date_parser : function, optional
17951795
17961796
.. deprecated:: 2.0.0
1797-
date_format : str, default ``None``
1797+
date_format : str or dict of column -> format, default ``None``
17981798
17991799
.. versionadded:: 2.0.0
18001800
skiprows : list of integers

pandas/tests/io/parser/test_parse_dates.py

+63
Original file line numberDiff line numberDiff line change
@@ -2155,3 +2155,66 @@ def test_parse_dot_separated_dates(all_parsers):
21552155
)
21562156
expected = DataFrame({"b": [1, 2]}, index=expected_index)
21572157
tm.assert_frame_equal(result, expected)
2158+
2159+
2160+
def test_parse_dates_dict_format(all_parsers):
2161+
# GH#51240
2162+
parser = all_parsers
2163+
data = """a,b
2164+
2019-12-31,31-12-2019
2165+
2020-12-31,31-12-2020"""
2166+
2167+
result = parser.read_csv(
2168+
StringIO(data),
2169+
date_format={"a": "%Y-%m-%d", "b": "%d-%m-%Y"},
2170+
parse_dates=["a", "b"],
2171+
)
2172+
expected = DataFrame(
2173+
{
2174+
"a": [Timestamp("2019-12-31"), Timestamp("2020-12-31")],
2175+
"b": [Timestamp("2019-12-31"), Timestamp("2020-12-31")],
2176+
}
2177+
)
2178+
tm.assert_frame_equal(result, expected)
2179+
2180+
2181+
@skip_pyarrow
2182+
@pytest.mark.parametrize(
2183+
"key, parse_dates", [("a_b", [[0, 1]]), ("foo", {"foo": [0, 1]})]
2184+
)
2185+
def test_parse_dates_dict_format_two_columns(all_parsers, key, parse_dates):
2186+
# GH#51240
2187+
parser = all_parsers
2188+
data = """a,b
2189+
31-,12-2019
2190+
31-,12-2020"""
2191+
2192+
result = parser.read_csv(
2193+
StringIO(data), date_format={key: "%d- %m-%Y"}, parse_dates=parse_dates
2194+
)
2195+
expected = DataFrame(
2196+
{
2197+
key: [Timestamp("2019-12-31"), Timestamp("2020-12-31")],
2198+
}
2199+
)
2200+
tm.assert_frame_equal(result, expected)
2201+
2202+
2203+
@skip_pyarrow
2204+
def test_parse_dates_dict_format_index(all_parsers):
2205+
# GH#51240
2206+
parser = all_parsers
2207+
data = """a,b
2208+
2019-12-31,31-12-2019
2209+
2020-12-31,31-12-2020"""
2210+
2211+
result = parser.read_csv(
2212+
StringIO(data), date_format={"a": "%Y-%m-%d"}, parse_dates=True, index_col=0
2213+
)
2214+
expected = DataFrame(
2215+
{
2216+
"b": ["31-12-2019", "31-12-2020"],
2217+
},
2218+
index=Index([Timestamp("2019-12-31"), Timestamp("2020-12-31")], name="a"),
2219+
)
2220+
tm.assert_frame_equal(result, expected)

0 commit comments

Comments
 (0)