def _is_potential_multi_index(
    columns,
    index_col: Optional[Union[bool, int, str, Sequence[Union[int, str]]]] = None,
) -> bool:
    """
    Check whether or not the `columns` parameter
    could be converted into a MultiIndex.

    Parameters
    ----------
    columns : array-like
        Object which may or may not be convertible into a MultiIndex
    index_col : None, bool, scalar or list, optional
        Column or columns to use as the (possibly hierarchical) index.
        Entries of `columns` that compare equal to one of these values are
        skipped when checking for tuples. ``None`` and booleans mean that
        nothing is skipped. A single string or integer is treated as a
        one-element list.

    Returns
    -------
    boolean : Whether or not columns could become a MultiIndex
    """
    if index_col is None or isinstance(index_col, bool):
        excluded = []
    elif isinstance(index_col, str):
        # Iterating a string would split it into single characters and
        # wrongly exclude unrelated one-character column labels.
        excluded = [index_col]
    else:
        try:
            # Materialize once instead of once per column.
            excluded = list(index_col)
        except TypeError:
            # Non-iterable scalar, e.g. a single integer.
            excluded = [index_col]

    # bool() so that empty `columns` yields False rather than the integer 0.
    return bool(
        len(columns)
        and not isinstance(columns, MultiIndex)
        and all(isinstance(c, tuple) for c in columns if c not in excluded)
    )
# --- pandas/tests/io/excel/test_readers.py ---
# NOTE: in its own file this is a method of the Excel reader test class
# (class header not visible in this hunk), hence the `self` parameter.
@pytest.mark.parametrize("filename", ["df_empty.xlsx", "df_equals.xlsx"])
def test_header_with_index_col(self, engine, filename):
    """Two header rows combined with ``index_col=0`` (GH 33476)."""
    # GH 33476
    # The repeated ("A", "B") header is expected back as ("A", "B.1").
    header_labels = [("A", "B"), ("A", "B.1")]
    expected = pd.DataFrame(
        [[1, 3]],
        index=pd.Index(["Z"], name="I2"),
        columns=pd.MultiIndex.from_tuples(header_labels, names=["I11", "I12"]),
        dtype="int64",
    )

    result = pd.read_excel(filename, sheet_name="Sheet1", index_col=0, header=[0, 1])
    tm.assert_frame_equal(expected, result)


# --- pandas/tests/io/parser/test_index_col.py ---
def test_header_with_index_col(all_parsers):
    """Header handling together with ``index_col`` for CSV input (GH 33476)."""
    # GH 33476
    parser = all_parsers
    data = """
I11,A,A
I12,B,B
I2,1,3
"""

    # Case 1: two header rows, index taken from the first column by position.
    expected = DataFrame(
        [[1, 3]],
        index=Index(["I2"]),
        columns=MultiIndex.from_tuples(
            [("A", "B"), ("A", "B.1")], names=["I11", "I12"]
        ),
    )
    result = parser.read_csv(StringIO(data), index_col=0, header=[0, 1])
    tm.assert_frame_equal(result, expected)

    # Case 2: single header row, index column selected by its label.
    expected = DataFrame(
        [["B", "B"], ["1", "3"]],
        index=Index(["I12", "I2"], name="I11"),
        columns=Index(["A", "A.1"]),
    )
    result = parser.read_csv(StringIO(data), index_col="I11", header=0)
    tm.assert_frame_equal(result, expected)