Skip to content

BUG: Handling columns from index_col in _is_potential_multi_index #33982

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
13 commits merged on Jun 4, 2020
11 changes: 8 additions & 3 deletions pandas/io/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -1168,7 +1168,7 @@ def _is_index_col(col):
return col is not None and col is not False


def _is_potential_multi_index(columns):
def _is_potential_multi_index(columns, index_col=None):
"""
Check whether or not the `columns` parameter
could be converted into a MultiIndex.
Expand All @@ -1177,15 +1177,20 @@ def _is_potential_multi_index(columns):
----------
columns : array-like
Object which may or may not be convertible into a MultiIndex
index_col : None, bool or list, optional
Column or columns to use as the (possibly hierarchical) index

Returns
-------
boolean : Whether or not columns could become a MultiIndex
"""
if index_col is None or isinstance(index_col, bool):
index_col = []

return (
len(columns)
and not isinstance(columns, MultiIndex)
and all(isinstance(c, tuple) for c in columns)
and all(isinstance(c, tuple) for c in columns if c not in list(index_col))
)


Expand Down Expand Up @@ -1570,7 +1575,7 @@ def _maybe_dedup_names(self, names):
if self.mangle_dupe_cols:
names = list(names) # so we can index
counts = defaultdict(int)
is_potential_mi = _is_potential_multi_index(names)
is_potential_mi = _is_potential_multi_index(names, self.index_col)

for i, col in enumerate(names):
cur_count = counts[col]
Expand Down
Binary file added pandas/tests/io/data/excel/df_empty.xlsx
Binary file not shown.
Binary file added pandas/tests/io/data/excel/df_equals.xlsx
Binary file not shown.
4 changes: 4 additions & 0 deletions pandas/tests/io/excel/test_readers.py
Original file line number Diff line number Diff line change
Expand Up @@ -1084,3 +1084,7 @@ def test_excel_high_surrogate(self, engine):
# should not produce a segmentation violation
actual = pd.read_excel("high_surrogate.xlsx")
tm.assert_frame_equal(expected, actual)

@pytest.mark.parametrize("filename", ["df_empty.xlsx", "df_equals.xlsx"])
def test_header_empty_cells(self, engine, filename):
pd.read_excel(filename, sheet_name="Sheet1", index_col=0, header=[0, 1])