Skip to content

BUG: Handling columns from index_col in _is_potential_multi_index #33982

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 13 commits into from
Jun 4, 2020
17 changes: 12 additions & 5 deletions pandas/io/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
import re
import sys
from textwrap import fill
from typing import Any, Dict, Iterable, List, Set
from typing import Any, Dict, Iterable, List, Optional, Sequence, Set
import warnings

import numpy as np
Expand All @@ -20,7 +20,7 @@
import pandas._libs.parsers as parsers
from pandas._libs.parsers import STR_NA_VALUES
from pandas._libs.tslibs import parsing
from pandas._typing import FilePathOrBuffer
from pandas._typing import FilePathOrBuffer, Union
from pandas.errors import (
AbstractMethodError,
EmptyDataError,
Expand Down Expand Up @@ -1168,7 +1168,9 @@ def _is_index_col(col):
return col is not None and col is not False


def _is_potential_multi_index(
    columns, index_col: Optional[Union[bool, Sequence[int]]] = None
):
    """
    Check whether or not the `columns` parameter
    could be converted into a MultiIndex.

    Parameters
    ----------
    columns : array-like
        Object which may or may not be convertible into a MultiIndex
    index_col : None, bool or list, optional
        Column or columns to use as the (possibly hierarchical) index.
        Entries of `columns` equal to one of these values are skipped by
        the tuple check. ``None`` and booleans mean nothing is skipped; a
        lone int or str is treated as a one-element list.

    Returns
    -------
    bool : Whether or not columns could become a MultiIndex
    """
    # Normalise once, outside the generator below, so the exclusion list is
    # not rebuilt for every column.
    if index_col is None or isinstance(index_col, bool):
        index_cols = []
    elif isinstance(index_col, (int, str)):
        # A bare int would make ``list(...)`` raise and a bare str would be
        # split into single characters; both mean "this one column".
        index_cols = [index_col]
    else:
        index_cols = list(index_col)

    # ``bool(...)`` keeps the documented return type: for empty ``columns``
    # the ``and`` chain would otherwise short-circuit to the int ``0``.
    return bool(
        len(columns)
        and not isinstance(columns, MultiIndex)
        and all(isinstance(c, tuple) for c in columns if c not in index_cols)
    )


Expand Down Expand Up @@ -1570,7 +1577,7 @@ def _maybe_dedup_names(self, names):
if self.mangle_dupe_cols:
names = list(names) # so we can index
counts = defaultdict(int)
is_potential_mi = _is_potential_multi_index(names)
is_potential_mi = _is_potential_multi_index(names, self.index_col)

for i, col in enumerate(names):
cur_count = counts[col]
Expand Down
Binary file added pandas/tests/io/data/excel/df_empty.xlsx
Binary file not shown.
Binary file added pandas/tests/io/data/excel/df_equals.xlsx
Binary file not shown.
13 changes: 13 additions & 0 deletions pandas/tests/io/excel/test_readers.py
Original file line number Diff line number Diff line change
Expand Up @@ -1084,3 +1084,16 @@ def test_excel_high_surrogate(self, engine):
# should not produce a segmentation violation
actual = pd.read_excel("high_surrogate.xlsx")
tm.assert_frame_equal(expected, actual)

@pytest.mark.parametrize("filename", ["df_empty.xlsx", "df_equals.xlsx"])
def test_header_with_index_col(self, engine, filename):
    # GH 33476
    # Reading with a two-row header and the first column as index must give
    # MultiIndex columns and a named single-level index.
    expected_columns = pd.MultiIndex.from_tuples(
        [("A", "B"), ("A", "B.1")], names=["I11", "I12"]
    )
    expected_index = pd.Index(["Z"], name="I2")
    expected = pd.DataFrame(
        [[1, 3]], index=expected_index, columns=expected_columns, dtype="int64"
    )

    result = pd.read_excel(
        filename, sheet_name="Sheet1", index_col=0, header=[0, 1]
    )

    tm.assert_frame_equal(expected, result)
23 changes: 23 additions & 0 deletions pandas/tests/io/parser/test_index_col.py
Original file line number Diff line number Diff line change
Expand Up @@ -184,3 +184,26 @@ def test_no_multi_index_level_names_empty(all_parsers):
expected.to_csv(path)
result = parser.read_csv(path, index_col=[0, 1, 2])
tm.assert_frame_equal(result, expected)


def test_header_with_index_col(all_parsers):
    # GH 33476
    parser = all_parsers
    data = """
I11,A,A
I12,B,B
I2,1,3
"""

    # Case 1: two header rows, index taken from the first column by position.
    multi_columns = MultiIndex.from_tuples(
        [("A", "B"), ("A", "B.1")], names=["I11", "I12"]
    )
    expected_multi = DataFrame(
        [[1, 3]], index=Index(["I2"]), columns=multi_columns
    )
    result_multi = parser.read_csv(StringIO(data), index_col=0, header=[0, 1])
    tm.assert_frame_equal(result_multi, expected_multi)

    # Case 2: single header row, index selected by its column name.
    flat_columns = Index(["A", "A.1"])
    named_index = Index(["I12", "I2"], name="I11")
    expected_flat = DataFrame(
        [["B", "B"], ["1", "3"]], index=named_index, columns=flat_columns
    )
    result_flat = parser.read_csv(StringIO(data), index_col="I11", header=0)
    tm.assert_frame_equal(result_flat, expected_flat)