Skip to content

Commit c71bfc3

Browse files
authored
BUG: Handling columns from index_col in _is_potential_multi_index (#33982)
1 parent 89c5a59 commit c71bfc3

File tree

5 files changed

+48
-5
lines changed

5 files changed

+48
-5
lines changed

pandas/io/parsers.py

+12 -5
Original file line number | Diff line number | Diff line change
@@ -10,7 +10,7 @@
1010
import re
1111
import sys
1212
from textwrap import fill
13-
from typing import Any, Dict, Iterable, List, Set
13+
from typing import Any, Dict, Iterable, List, Optional, Sequence, Set
1414
import warnings
1515

1616
import numpy as np
@@ -20,7 +20,7 @@
2020
import pandas._libs.parsers as parsers
2121
from pandas._libs.parsers import STR_NA_VALUES
2222
from pandas._libs.tslibs import parsing
23-
from pandas._typing import FilePathOrBuffer
23+
from pandas._typing import FilePathOrBuffer, Union
2424
from pandas.errors import (
2525
AbstractMethodError,
2626
EmptyDataError,
@@ -1168,7 +1168,9 @@ def _is_index_col(col):
11681168
return col is not None and col is not False
11691169

11701170

1171-
def _is_potential_multi_index(columns):
1171+
def _is_potential_multi_index(
1172+
columns, index_col: Optional[Union[bool, Sequence[int]]] = None
1173+
):
11721174
"""
11731175
Check whether or not the `columns` parameter
11741176
could be converted into a MultiIndex.
@@ -1177,15 +1179,20 @@ def _is_potential_multi_index(columns):
11771179
----------
11781180
columns : array-like
11791181
Object which may or may not be convertible into a MultiIndex
1182+
index_col : None, bool or list, optional
1183+
Column or columns to use as the (possibly hierarchical) index
11801184
11811185
Returns
11821186
-------
11831187
boolean : Whether or not columns could become a MultiIndex
11841188
"""
1189+
if index_col is None or isinstance(index_col, bool):
1190+
index_col = []
1191+
11851192
return (
11861193
len(columns)
11871194
and not isinstance(columns, MultiIndex)
1188-
and all(isinstance(c, tuple) for c in columns)
1195+
and all(isinstance(c, tuple) for c in columns if c not in list(index_col))
11891196
)
11901197

11911198

@@ -1570,7 +1577,7 @@ def _maybe_dedup_names(self, names):
15701577
if self.mangle_dupe_cols:
15711578
names = list(names) # so we can index
15721579
counts = defaultdict(int)
1573-
is_potential_mi = _is_potential_multi_index(names)
1580+
is_potential_mi = _is_potential_multi_index(names, self.index_col)
15741581

15751582
for i, col in enumerate(names):
15761583
cur_count = counts[col]
5.46 KB
Binary file not shown.
5.46 KB
Binary file not shown.

pandas/tests/io/excel/test_readers.py

+13
Original file line number | Diff line number | Diff line change
@@ -1130,3 +1130,16 @@ def test_excel_high_surrogate(self, engine):
11301130
# should not produce a segmentation violation
11311131
actual = pd.read_excel("high_surrogate.xlsx")
11321132
tm.assert_frame_equal(expected, actual)
1133+
1134+
@pytest.mark.parametrize("filename", ["df_empty.xlsx", "df_equals.xlsx"])
1135+
def test_header_with_index_col(self, engine, filename):
1136+
# GH 33476
1137+
idx = pd.Index(["Z"], name="I2")
1138+
cols = pd.MultiIndex.from_tuples(
1139+
[("A", "B"), ("A", "B.1")], names=["I11", "I12"]
1140+
)
1141+
expected = pd.DataFrame([[1, 3]], index=idx, columns=cols, dtype="int64")
1142+
result = pd.read_excel(
1143+
filename, sheet_name="Sheet1", index_col=0, header=[0, 1]
1144+
)
1145+
tm.assert_frame_equal(expected, result)

pandas/tests/io/parser/test_index_col.py

+23
Original file line number | Diff line number | Diff line change
@@ -184,3 +184,26 @@ def test_no_multi_index_level_names_empty(all_parsers):
184184
expected.to_csv(path)
185185
result = parser.read_csv(path, index_col=[0, 1, 2])
186186
tm.assert_frame_equal(result, expected)
187+
188+
189+
def test_header_with_index_col(all_parsers):
190+
# GH 33476
191+
parser = all_parsers
192+
data = """
193+
I11,A,A
194+
I12,B,B
195+
I2,1,3
196+
"""
197+
midx = MultiIndex.from_tuples([("A", "B"), ("A", "B.1")], names=["I11", "I12"])
198+
idx = Index(["I2"])
199+
expected = DataFrame([[1, 3]], index=idx, columns=midx)
200+
201+
result = parser.read_csv(StringIO(data), index_col=0, header=[0, 1])
202+
tm.assert_frame_equal(result, expected)
203+
204+
col_idx = Index(["A", "A.1"])
205+
idx = Index(["I12", "I2"], name="I11")
206+
expected = DataFrame([["B", "B"], ["1", "3"]], index=idx, columns=col_idx)
207+
208+
result = parser.read_csv(StringIO(data), index_col="I11", header=0)
209+
tm.assert_frame_equal(result, expected)

0 commit comments

Comments
 (0)