Skip to content

Commit 980f650

Browse files
WillAyd authored and gfyoung committed
BUG: MultiIndex mangling during parsing (#18094)
Closes gh-18062.
1 parent f17aa26 commit 980f650

File tree

3 files changed

+52
-4
lines changed

3 files changed

+52
-4
lines changed

doc/source/whatsnew/v0.22.0.txt

+1
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,7 @@ Bug Fixes
8989

9090
- Bug in ``pd.read_msgpack()`` when a non-existent file is passed in Python 2 (:issue:`15296`)
9191
- Bug in ``DataFrame.groupby`` where a tuple key in a ``MultiIndex`` was interpreted as a list of keys (:issue:`17979`)
92+
- Bug in :func:`pd.read_csv` where a ``MultiIndex`` with duplicate columns was not being mangled appropriately (:issue:`18062`)
9293

9394
Conversion
9495
^^^^^^^^^^

pandas/io/parsers.py

+24-4
Original file line numberDiff line numberDiff line change
@@ -1106,6 +1106,24 @@ def _is_index_col(col):
11061106
return col is not None and col is not False
11071107

11081108

1109+
def _is_potential_multi_index(columns):
1110+
"""
1111+
Check whether or not the `columns` parameter
1112+
could be converted into a MultiIndex.
1113+
1114+
Parameters
1115+
----------
1116+
columns : array-like
1117+
Object which may or may not be convertible into a MultiIndex
1118+
1119+
Returns
1120+
-------
1121+
boolean : Whether or not columns could become a MultiIndex
1122+
"""
1123+
return (len(columns) and not isinstance(columns, MultiIndex) and
1124+
all([isinstance(c, tuple) for c in columns]))
1125+
1126+
11091127
def _evaluate_usecols(usecols, names):
11101128
"""
11111129
Check whether or not the 'usecols' parameter
@@ -1374,14 +1392,18 @@ def _maybe_dedup_names(self, names):
13741392
if self.mangle_dupe_cols:
13751393
names = list(names) # so we can index
13761394
counts = defaultdict(int)
1395+
is_potential_mi = _is_potential_multi_index(names)
13771396

13781397
for i, col in enumerate(names):
13791398
cur_count = counts[col]
13801399

13811400
while cur_count > 0:
13821401
counts[col] = cur_count + 1
13831402

1384-
col = '%s.%d' % (col, cur_count)
1403+
if is_potential_mi:
1404+
col = col[:-1] + ('%s.%d' % (col[-1], cur_count),)
1405+
else:
1406+
col = '%s.%d' % (col, cur_count)
13851407
cur_count = counts[col]
13861408

13871409
names[i] = col
@@ -1391,9 +1413,7 @@ def _maybe_dedup_names(self, names):
13911413

13921414
def _maybe_make_multi_index_columns(self, columns, col_names=None):
13931415
# possibly create a column mi here
1394-
if (not self.tupleize_cols and len(columns) and
1395-
not isinstance(columns, MultiIndex) and
1396-
all([isinstance(c, tuple) for c in columns])):
1416+
if _is_potential_multi_index(columns):
13971417
columns = MultiIndex.from_tuples(columns, names=col_names)
13981418
return columns
13991419

pandas/tests/io/parser/header.py

+27
Original file line numberDiff line numberDiff line change
@@ -290,3 +290,30 @@ def test_singleton_header(self):
290290
df = self.read_csv(StringIO(data), header=[0])
291291
expected = DataFrame({"a": [0, 1], "b": [1, 2], "c": [2, 3]})
292292
tm.assert_frame_equal(df, expected)
293+
294+
def test_mangles_multi_index(self):
    """
    Check that duplicated entries in a two-row header are de-duplicated
    by suffixing the last level of the column tuple. See GH 18062.
    """
    # Each scenario: (raw csv text, the single data row, expected columns)
    scenarios = [
        # plain duplicates receive increasing ``.N`` suffixes
        ("A,A,A,B\none,one,one,two\n0,40,34,0.1",
         [0, 40, 34, 0.1],
         [('A', 'one'), ('A', 'one.1'),
          ('A', 'one.2'), ('B', 'two')]),
        # a literal 'one.1' that clashes with a generated name
        # is suffixed again
        ("A,A,A,B\none,one,one.1,two\n0,40,34,0.1",
         [0, 40, 34, 0.1],
         [('A', 'one'), ('A', 'one.1'),
          ('A', 'one.1.1'), ('B', 'two')]),
        # same clash, plus a second duplicated group under 'B'
        ("A,A,A,B,B\none,one,one.1,two,two\n0,40,34,0.1,0.1",
         [0, 40, 34, 0.1, 0.1],
         [('A', 'one'), ('A', 'one.1'),
          ('A', 'one.1.1'), ('B', 'two'),
          ('B', 'two.1')]),
    ]

    for raw, row, col_tuples in scenarios:
        result = self.read_csv(StringIO(raw), header=[0, 1])
        wanted = DataFrame([row],
                           columns=MultiIndex.from_tuples(col_tuples))
        tm.assert_frame_equal(result, wanted)

0 commit comments

Comments
 (0)