diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 6fc1ec9c6ff90..a86436fac3487 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -934,6 +934,7 @@ I/O - Bug in :func:`read_parquet` when ``engine="fastparquet"`` where the file was not closed on error (:issue:`46555`) - :meth:`to_html` now excludes the ``border`` attribute from ``<table>`` elements when ``border`` keyword is set to ``False``. - Bug in :func:`read_sas` with certain types of compressed SAS7BDAT files (:issue:`35545`) +- Bug in :func:`read_excel` not forward filling :class:`MultiIndex` when no names were given (:issue:`47487`) - Bug in :func:`read_sas` returned ``None`` rather than an empty DataFrame for SAS7BDAT files with zero rows (:issue:`18198`) - Bug in :class:`StataWriter` where value labels were always written with default encoding (:issue:`46750`) - Bug in :class:`StataWriterUTF8` where some valid characters were removed from variable names (:issue:`47276`) diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 0c4268626099a..a0abddc82e6c8 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -788,9 +788,27 @@ def parse( # If there is a MultiIndex header and an index then there is also # a row containing just the index name(s) - has_index_names = ( - is_list_header and not is_len_one_list_header and index_col is not None - ) + has_index_names = False + if is_list_header and not is_len_one_list_header and index_col is not None: + + index_col_list: Sequence[int] + if isinstance(index_col, int): + index_col_list = [index_col] + else: + assert isinstance(index_col, Sequence) + index_col_list = index_col + + # We have to handle mi without names. 
If any of the entries in the data + # columns are not empty, this is a regular row + assert isinstance(header, Sequence) + if len(header) < len(data): + potential_index_names = data[len(header)] + potential_data = [ + x + for i, x in enumerate(potential_index_names) + if not control_row[i] and i not in index_col_list + ] + has_index_names = all(x == "" or x is None for x in potential_data) if is_list_like(index_col): # Forward fill values for MultiIndex index. diff --git a/pandas/tests/io/data/excel/multiindex_no_index_names.xlsx b/pandas/tests/io/data/excel/multiindex_no_index_names.xlsx new file mode 100755 index 0000000000000..3913ffce5befb Binary files /dev/null and b/pandas/tests/io/data/excel/multiindex_no_index_names.xlsx differ diff --git a/pandas/tests/io/excel/test_openpyxl.py b/pandas/tests/io/excel/test_openpyxl.py index ea9a45cf829f2..3b122c8572751 100644 --- a/pandas/tests/io/excel/test_openpyxl.py +++ b/pandas/tests/io/excel/test_openpyxl.py @@ -396,3 +396,17 @@ def test_ints_spelled_with_decimals(datapath, ext): result = pd.read_excel(path) expected = DataFrame(range(2, 12), columns=[1]) tm.assert_frame_equal(result, expected) + + +def test_read_multiindex_header_no_index_names(datapath, ext): + # GH#47487 + path = datapath("io", "data", "excel", f"multiindex_no_index_names{ext}") + result = pd.read_excel(path, index_col=[0, 1, 2], header=[0, 1, 2]) + expected = DataFrame( + [[np.nan, "x", "x", "x"], ["x", np.nan, np.nan, np.nan]], + columns=pd.MultiIndex.from_tuples( + [("X", "Y", "A1"), ("X", "Y", "A2"), ("XX", "YY", "B1"), ("XX", "YY", "B2")] + ), + index=pd.MultiIndex.from_tuples([("A", "AA", "AAA"), ("A", "BB", "BBB")]), + ) + tm.assert_frame_equal(result, expected)