diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst
index d1e209adb1b8f..d3df785c23544 100644
--- a/doc/source/whatsnew/v1.4.0.rst
+++ b/doc/source/whatsnew/v1.4.0.rst
@@ -574,6 +574,7 @@ I/O
 - Bug in :func:`json_normalize` where multi-character ``sep`` parameter is incorrectly prefixed to every key (:issue:`43831`)
 - Bug in :func:`read_csv` with :code:`float_precision="round_trip"` which did not skip initial/trailing whitespace (:issue:`43713`)
 - Bug in dumping/loading a :class:`DataFrame` with ``yaml.dump(frame)`` (:issue:`42748`)
+- Bug in :func:`read_csv` raising ``ValueError`` when ``parse_dates`` was used with ``MultiIndex`` columns (:issue:`8991`)
 -
 
 Period
diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py
index 339585810bec1..ba39b6a933a81 100644
--- a/pandas/io/parsers/base_parser.py
+++ b/pandas/io/parsers/base_parser.py
@@ -259,7 +259,8 @@ def _validate_parse_dates_presence(self, columns: list[str]) -> None:
             # ParseDates = Union[DateGroups, List[DateGroups],
             #     Dict[ColReference, DateGroups]]
             cols_needed = itertools.chain.from_iterable(
-                col if is_list_like(col) else [col] for col in self.parse_dates
+                col if is_list_like(col) and not isinstance(col, tuple) else [col]
+                for col in self.parse_dates
             )
         else:
             cols_needed = []
@@ -1091,7 +1092,7 @@ def _isindex(colspec):
     if isinstance(parse_spec, list):
         # list of column lists
         for colspec in parse_spec:
-            if is_scalar(colspec):
+            if is_scalar(colspec) or isinstance(colspec, tuple):
                 if isinstance(colspec, int) and colspec not in data_dict:
                     colspec = orig_names[colspec]
                 if _isindex(colspec):
@@ -1146,7 +1147,11 @@ def _try_convert_dates(parser: Callable, colspec, data_dict, columns):
         else:
             colnames.append(c)
 
-    new_name = "_".join([str(x) for x in colnames])
+    new_name: tuple | str
+    if all(isinstance(x, tuple) for x in colnames):
+        new_name = tuple(map("_".join, zip(*colnames)))
+    else:
+        new_name = "_".join([str(x) for x in colnames])
     to_parse = [np.asarray(data_dict[c]) for c in colnames if c in data_dict]
 
     new_col = parser(*to_parse)
diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py
index c8bea9592e82a..470440290016d 100644
--- a/pandas/tests/io/parser/test_parse_dates.py
+++ b/pandas/tests/io/parser/test_parse_dates.py
@@ -1732,6 +1732,39 @@ def test_date_parser_and_names(all_parsers):
     tm.assert_frame_equal(result, expected)
 
 
+@skip_pyarrow
+def test_date_parser_multiindex_columns(all_parsers):
+    parser = all_parsers
+    data = """a,b
+1,2
+2019-12-31,6"""
+    result = parser.read_csv(StringIO(data), parse_dates=[("a", "1")], header=[0, 1])
+    expected = DataFrame({("a", "1"): Timestamp("2019-12-31"), ("b", "2"): [6]})
+    tm.assert_frame_equal(result, expected)
+
+
+@skip_pyarrow
+@pytest.mark.parametrize(
+    "parse_spec, col_name",
+    [
+        ([[("a", "1"), ("b", "2")]], ("a_b", "1_2")),
+        ({("foo", "1"): [("a", "1"), ("b", "2")]}, ("foo", "1")),
+    ],
+)
+def test_date_parser_multiindex_columns_combine_cols(all_parsers, parse_spec, col_name):
+    parser = all_parsers
+    data = """a,b,c
+1,2,3
+2019-12,-31,6"""
+    result = parser.read_csv(
+        StringIO(data),
+        parse_dates=parse_spec,
+        header=[0, 1],
+    )
+    expected = DataFrame({col_name: Timestamp("2019-12-31"), ("c", "3"): [6]})
+    tm.assert_frame_equal(result, expected)
+
+
 @skip_pyarrow
 def test_date_parser_usecols_thousands(all_parsers):
     # GH#39365