Skip to content

[BUG] Reading multiindex, incorrectly names columns without name. #13115

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 29 additions & 18 deletions pandas/formats/format.py
Original file line number Diff line number Diff line change
Expand Up @@ -1277,29 +1277,41 @@ def _write_hierarchical_rows(self, fmt_values, indent):


def _get_level_lengths(levels, sentinel=''):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

While you have rewritten this function a bit, would you like to add a docstring at once as well?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@brandys11 ok, if you can update this doc-string. ping on green. ty.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@jreback ping, everything is green

from itertools import groupby
"""For each index in each level the function returns lengths of indexes.

def _make_grouper():
record = {'count': 0}
Parameters
----------
levels : list of lists
List of values for each level.
sentinel : string, optional
Value which indicates that no new index starts at that position.

def grouper(x):
if x != sentinel:
record['count'] += 1
return record['count']
Returns
-------
Returns list of maps. For each level returns a map of indexes (key is the
position in the row where an index starts and value is the length of that
index).
"""
if len(levels) == 0:
return []

return grouper
control = [True for x in levels[0]]

result = []
for lev in levels:
i = 0
f = _make_grouper()
recs = {}
for key, gpr in groupby(lev, f):
values = list(gpr)
recs[i] = len(values)
i += len(values)
for level in levels:
last_index = 0

result.append(recs)
lengths = {}
for i, key in enumerate(level):
if control[i] and key == sentinel:
pass
else:
control[i] = False
lengths[last_index] = i - last_index
last_index = i

lengths[last_index] = len(level) - last_index

result.append(lengths)

return result

Expand Down Expand Up @@ -1762,7 +1774,6 @@ def _format_value(self, val):
return val

def _format_header_mi(self):

if self.columns.nlevels > 1:
if not self.index:
raise NotImplementedError("Writing to Excel with MultiIndex"
Expand Down
32 changes: 27 additions & 5 deletions pandas/io/excel.py
Original file line number Diff line number Diff line change
Expand Up @@ -431,10 +431,13 @@ def _parse_cell(cell_contents, cell_typ):
if header is not None:
if com.is_list_like(header):
header_names = []
control_row = [True for x in data[0]]
for row in header:
if com.is_integer(skiprows):
row += skiprows
data[row] = _fill_mi_header(data[row])

data[row], control_row = _fill_mi_header(
data[row], control_row)
header_name, data[row] = _pop_header_name(
data[row], index_col)
header_names.append(header_name)
Expand Down Expand Up @@ -511,16 +514,35 @@ def _trim_excel_header(row):
return row


def _fill_mi_header(row):
# forward fill blanks entries
# from headers if parsing as MultiIndex
def _fill_mi_header(row, control_row):
"""Forward fills blank entries in row, but only inside the same parent index

Used for creating headers in Multiindex.
Parameters
----------
row : list
List of items in a single row.
constrol_row : list of boolean
Helps to determine if particular column is in same parent index as the
previous value. Used to stop propagation of empty cells between
different indexes.

Returns
----------
Returns changed row and control_row
"""
last = row[0]
for i in range(1, len(row)):
if not control_row[i]:
last = row[i]

if row[i] == '' or row[i] is None:
row[i] = last
else:
control_row[i] = False
last = row[i]
return row

return row, control_row

# fill blank if index_col not None

Expand Down
40 changes: 40 additions & 0 deletions pandas/io/tests/test_excel.py
Original file line number Diff line number Diff line change
Expand Up @@ -726,6 +726,46 @@ def test_read_excel_multiindex(self):
header=[0, 1], skiprows=2)
tm.assert_frame_equal(actual, expected)

def test_read_excel_multiindex_empty_level(self):
# GH 12453
# Writes frames whose two-level column labels include a blank ('')
# second-level entry to an .xlsx file, reads them back with
# header=[0, 1], and compares the result with an expected frame.
# NOTE(review): _skip_if_no_xlsxwriter and ensure_clean are defined outside
# this view; presumably they skip when xlsxwriter is missing and provide a
# temporary file path -- confirm against their definitions.
_skip_if_no_xlsxwriter()
with ensure_clean('.xlsx') as path:
# Case 1: the blank second-level label sits under 'Zero'.
df = DataFrame({
('Zero', ''): {0: 0},
('One', 'x'): {0: 1},
('Two', 'X'): {0: 3},
('Two', 'Y'): {0: 7}
})

# After the round trip the blank label is expected to come back as an
# 'Unnamed: 3_level_1' placeholder; the other labels are unchanged.
expected = DataFrame({
('Zero', 'Unnamed: 3_level_1'): {0: 0},
('One', u'x'): {0: 1},
('Two', u'X'): {0: 3},
('Two', u'Y'): {0: 7}
})

df.to_excel(path)
actual = pd.read_excel(path, header=[0, 1])
tm.assert_frame_equal(actual, expected)

# Case 2: same shape with different top-level labels; the blank label
# sits under 'Beg'.
df = pd.DataFrame({
('Beg', ''): {0: 0},
('Middle', 'x'): {0: 1},
('Tail', 'X'): {0: 3},
('Tail', 'Y'): {0: 7}
})

# Here the expected placeholder is 'Unnamed: 0_level_1'.
# NOTE(review): presumably the number differs from case 1 because the
# blank column lands at a different position -- confirm.
expected = pd.DataFrame({
('Beg', 'Unnamed: 0_level_1'): {0: 0},
('Middle', u'x'): {0: 1},
('Tail', u'X'): {0: 3},
('Tail', u'Y'): {0: 7}
})

df.to_excel(path)
actual = pd.read_excel(path, header=[0, 1])
tm.assert_frame_equal(actual, expected)

def test_excel_multindex_roundtrip(self):
# GH 4679
_skip_if_no_xlsxwriter()
Expand Down