From 0109fcb2f88b1cae9e5f9707aa64ebcf18502a59 Mon Sep 17 00:00:00 2001 From: phofl Date: Fri, 12 Nov 2021 15:11:40 +0100 Subject: [PATCH 1/2] BUG: read_csv raising if parse_dates is used with MultiIndex columns --- doc/source/whatsnew/v1.4.0.rst | 1 + pandas/io/parsers/base_parser.py | 10 +++++-- pandas/tests/io/parser/test_parse_dates.py | 33 ++++++++++++++++++++++ 3 files changed, 41 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index d1e209adb1b8f..d3df785c23544 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -574,6 +574,7 @@ I/O - Bug in :func:`json_normalize` where multi-character ``sep`` parameter is incorrectly prefixed to every key (:issue:`43831`) - Bug in :func:`read_csv` with :code:`float_precision="round_trip"` which did not skip initial/trailing whitespace (:issue:`43713`) - Bug in dumping/loading a :class:`DataFrame` with ``yaml.dump(frame)`` (:issue:`42748`) +- Bug in :func:`read_csv` raising ``ValueError`` when ``parse_dates`` was used with ``MultiIndex`` columns (:issue:`8991`) - Period diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index 339585810bec1..b67995593d947 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -259,7 +259,8 @@ def _validate_parse_dates_presence(self, columns: list[str]) -> None: # ParseDates = Union[DateGroups, List[DateGroups], # Dict[ColReference, DateGroups]] cols_needed = itertools.chain.from_iterable( - col if is_list_like(col) else [col] for col in self.parse_dates + col if is_list_like(col) and not isinstance(col, tuple) else [col] + for col in self.parse_dates ) else: cols_needed = [] @@ -1091,7 +1092,7 @@ def _isindex(colspec): if isinstance(parse_spec, list): # list of column lists for colspec in parse_spec: - if is_scalar(colspec): + if is_scalar(colspec) or isinstance(colspec, tuple): if isinstance(colspec, int) and colspec not in data_dict: colspec = orig_names[colspec] if _isindex(colspec): @@ -1146,7 +1147,10 @@ def _try_convert_dates(parser: Callable, colspec, data_dict, columns): else: colnames.append(c) - new_name = "_".join([str(x) for x in colnames]) + if all(isinstance(x, tuple) for x in colnames): + new_name = tuple(map("_".join, zip(*colnames))) + else: + new_name = "_".join([str(x) for x in colnames]) to_parse = [np.asarray(data_dict[c]) for c in colnames if c in data_dict] new_col = parser(*to_parse) diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py index c8bea9592e82a..470440290016d 100644 --- a/pandas/tests/io/parser/test_parse_dates.py +++ b/pandas/tests/io/parser/test_parse_dates.py @@ -1732,6 +1732,39 @@ def test_date_parser_and_names(all_parsers): tm.assert_frame_equal(result, expected) +@skip_pyarrow +def test_date_parser_multiindex_columns(all_parsers): + parser = all_parsers + data = """a,b +1,2 +2019-12-31,6""" + result = parser.read_csv(StringIO(data), parse_dates=[("a", "1")], header=[0, 1]) + expected = DataFrame({("a", "1"): Timestamp("2019-12-31"), ("b", "2"): [6]}) + tm.assert_frame_equal(result, expected) + + +@skip_pyarrow +@pytest.mark.parametrize( + "parse_spec, col_name", + [ + ([[("a", "1"), ("b", "2")]], ("a_b", "1_2")), + ({("foo", "1"): [("a", "1"), ("b", "2")]}, ("foo", "1")), + ], +) +def test_date_parser_multiindex_columns_combine_cols(all_parsers, parse_spec, col_name): + parser = all_parsers + data = """a,b,c +1,2,3 +2019-12,-31,6""" + result = parser.read_csv( + StringIO(data), + parse_dates=parse_spec, + header=[0, 1], + ) + expected = DataFrame({col_name: Timestamp("2019-12-31"), ("c", "3"): [6]}) + tm.assert_frame_equal(result, expected) + + @skip_pyarrow def test_date_parser_usecols_thousands(all_parsers): # GH#39365 From ac973fbe5bbe66ac6d57ad0ac34c99db2882d211 Mon Sep 17 00:00:00 2001 From: phofl Date: Fri, 12 Nov 2021 16:53:58 +0100 Subject: [PATCH 2/2] Add type hint to silence mypy --- pandas/io/parsers/base_parser.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index b67995593d947..ba39b6a933a81 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -1147,6 +1147,7 @@ def _try_convert_dates(parser: Callable, colspec, data_dict, columns): else: colnames.append(c) + new_name: tuple | str if all(isinstance(x, tuple) for x in colnames): new_name = tuple(map("_".join, zip(*colnames))) else: