Skip to content

Commit 1121fd5

Browse files
committed
ENH: read_excel MultiIndex pandas-dev#4679
1 parent 2531503 commit 1121fd5

File tree

7 files changed

+229
-25
lines changed

7 files changed

+229
-25
lines changed

doc/source/io.rst

+35
Original file line numberDiff line numberDiff line change
@@ -1989,6 +1989,41 @@ advanced strategies
19891989
Reading Excel Files
19901990
'''''''''''''''''''
19911991

1992+
.. versionadded:: 0.17
1993+
1994+
``read_excel`` can read a ``MultiIndex`` index, by passing a list of columns to ``index_col``
1995+
and a ``MultiIndex`` column by passing a list of rows to ``header``. If either the ``index``
1996+
or ``columns`` have serialized level names those will be read in as well by specifying
1997+
the rows/columns that make up the levels.
1998+
1999+
.. ipython:: python
2000+
2001+
# MultiIndex index - no names
2002+
df = pd.DataFrame({'a':[1,2,3,4], 'b':[5,6,7,8]},
2003+
index=pd.MultiIndex.from_product([['a','b'],['c','d']]))
2004+
df.to_excel('path_to_file.xlsx')
2005+
df = pd.read_excel('path_to_file.xlsx', index_col=[0,1])
2006+
df
2007+
2008+
# MultiIndex index - with names
2009+
df.index = df.index.set_names(['lvl1', 'lvl2'])
2010+
df.to_excel('path_to_file.xlsx')
2011+
df = pd.read_excel('path_to_file.xlsx', index_col=[0,1])
2012+
df
2013+
2014+
# MultiIndex index and column - with names
2015+
df.columns = pd.MultiIndex.from_product([['a'],['b', 'd']], names=['c1', 'c2'])
2016+
df.to_excel('path_to_file.xlsx')
2017+
df = pd.read_excel('path_to_file.xlsx',
2018+
index_col=[0,1], header=[0,1])
2019+
df
2020+
2021+
.. ipython:: python
2022+
:suppress:
2023+
2024+
import os
2025+
os.remove('path_to_file.xlsx')
2026+
19922027
.. versionadded:: 0.16
19932028

19942029
``read_excel`` can read more than one sheet, by setting ``sheetname`` to either

doc/source/whatsnew/v0.17.0.txt

+30-2
Original file line numberDiff line numberDiff line change
@@ -186,6 +186,36 @@ incrementally.
186186

187187
See the :ref:`docs <io.sas>` for more details.
188188

189+
190+
Changes to Excel with ``MultiIndex``
191+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
192+
In version 0.16.2 a ``DataFrame`` with ``MultiIndex`` columns could not be written to Excel via `to_excel`.
193+
That functionality has been added (:issue:`10564`), along with updating `read_excel` so that the data can
194+
be read back with no loss of information by specifying which columns/rows make up the ``MultiIndex``
195+
in the `header` and `index_col` parameters (:issue:`4679`).
196+
197+
.. ipython:: python
198+
199+
df = pd.DataFrame([[1,2,3,4], [5,6,7,8]],
200+
columns = pd.MultiIndex.from_product([['foo','bar'],['a','b']],
201+
names = ['col1', 'col2']),
202+
index = pd.MultiIndex.from_product([['j'], ['l', 'k']],
203+
names = ['i1', 'i2']))
204+
205+
df
206+
df.to_excel('test.xlsx')
207+
208+
df = pd.read_excel('test.xlsx', header=[0,1], index_col=[0,1])
209+
df
210+
211+
.. ipython:: python
212+
:suppress:
213+
214+
import os
215+
os.remove('test.xlsx')
216+
217+
See the :ref:`documentation <io.excel>` for more details.
218+
189219
.. _whatsnew_0170.enhancements.other:
190220

191221
Other enhancements
@@ -739,7 +769,6 @@ Changes to ``Categorical.unique``
739769
cat
740770
cat.unique()
741771

742-
743772
.. _whatsnew_0170.api_breaking.other:
744773

745774
Other API Changes
@@ -749,7 +778,6 @@ Other API Changes
749778
- Calling the ``.value_counts`` method on a Series with ``categorical`` dtype now returns a Series with a ``CategoricalIndex`` (:issue:`10704`)
750779
- Allow passing `kwargs` to the interpolation methods (:issue:`10378`).
751780
- The metadata properties of subclasses of pandas objects will now be serialized (:issue:`10553`).
752-
- Allow ``DataFrame`` with ``MultiIndex`` columns to be written to Excel (:issue:`10564`). This was changed in 0.16.2 as the read-back method could not always guarantee perfect fidelity (:issue:`9794`).
753781
- ``groupby`` using ``Categorical`` follows the same rule as ``Categorical.unique`` described above (:issue:`10508`)
754782
- Improved error message when concatenating an empty iterable of dataframes (:issue:`9157`)
755783
- When constructing ``DataFrame`` with an array of ``complex64`` dtype that meant the corresponding column was automatically promoted to the ``complex128`` dtype. Pandas will now preserve the itemsize of the input for complex data (:issue:`10952`)

pandas/core/format.py

-4
Original file line numberDiff line numberDiff line change
@@ -1682,10 +1682,6 @@ def _format_header_mi(self):
16821682
raise NotImplementedError("Writing to Excel with MultiIndex"
16831683
" columns and no index ('index'=False) "
16841684
"is not yet implemented.")
1685-
elif self.index and self.verbose:
1686-
warnings.warn("Writing to Excel with MultiIndex columns is a"
1687-
" one way serializable operation. You will not"
1688-
" be able to re-read or parse the output file.")
16891685

16901686
has_aliases = isinstance(self.header, (tuple, list, np.ndarray, Index))
16911687
if not(has_aliases or self.header):

pandas/io/excel.py

+83-9
Original file line numberDiff line numberDiff line change
@@ -97,8 +97,10 @@ def read_excel(io, sheetname=0, **kwds):
9797
* [0,1,"Sheet5"] -> 1st, 2nd & 5th sheet as a dictionary of DataFrames
9898
* None -> All sheets as a dictionary of DataFrames
9999
100-
header : int, default 0
100+
header : int, list of ints, default 0
101101
Row to use for the column labels of the parsed DataFrame
102+
If a list of integers is passed those row positions will
103+
be combined into a ``MultiIndex``
102104
skiprows : list-like
103105
Rows to skip at the beginning (0-indexed)
104106
skip_footer : int, default 0
@@ -108,9 +110,10 @@ def read_excel(io, sheetname=0, **kwds):
108110
either be integers or column labels, values are functions that take one
109111
input argument, the Excel cell content, and return the transformed
110112
content.
111-
index_col : int, default None
113+
index_col : int, list of ints, default None
112114
Column to use as the row labels of the DataFrame. Pass None if
113-
there is no such column
115+
there is no such column. If a list is passed, those columns will be
116+
combined into a ``MultiIndex``
114117
parse_cols : int or list, default None
115118
* If None then parse all columns,
116119
* If int then indicates last column to be parsed
@@ -135,6 +138,9 @@ def read_excel(io, sheetname=0, **kwds):
135138
True if the cols defined in index_col have an index name and are
136139
not in the header. Index name will be placed on a separate line below
137140
the header.
141+
has_header_names : boolean, default False
142+
True if rows defined in header have names, in the leftmost data
143+
columns. Reads format output by `to_excel`
138144
139145
Returns
140146
-------
@@ -196,7 +202,8 @@ def __init__(self, io, **kwds):
196202
def parse(self, sheetname=0, header=0, skiprows=None, skip_footer=0,
197203
index_col=None, parse_cols=None, parse_dates=False,
198204
date_parser=None, na_values=None, thousands=None, chunksize=None,
199-
convert_float=True, has_index_names=False, converters=None, **kwds):
205+
convert_float=True, has_index_names=False, has_header_names=False,
206+
converters=None, **kwds):
200207
"""Read an Excel table into DataFrame
201208
202209
Parameters
@@ -220,7 +227,10 @@ def parse(self, sheetname=0, header=0, skiprows=None, skip_footer=0,
220227
* "Sheet1" -> 1st sheet as a DataFrame
221228
* [0,1,"Sheet5"] -> 1st, 2nd & 5th sheet as a dictionary of DataFrames
222229
* None -> All sheets as a dictionary of DataFrames
223-
header : int, default 0
230+
header : int, list of ints, default 0
231+
Row to use for the column labels of the parsed DataFrame
232+
If a list of integers is passed those row positions will
233+
be combined into a ``MultiIndex``
224234
Row to use for the column labels of the parsed DataFrame
225235
skiprows : list-like
226236
Rows to skip at the beginning (0-indexed)
@@ -229,9 +239,10 @@ def parse(self, sheetname=0, header=0, skiprows=None, skip_footer=0,
229239
converters : dict, default None
230240
Dict of functions for converting values in certain columns. Keys can
231241
either be integers or column labels
232-
index_col : int, default None
242+
index_col : int, list of ints, default None
233243
Column to use as the row labels of the DataFrame. Pass None if
234-
there is no such column
244+
there is no such column. If a list is passed, those columns will be
245+
combined into a ``MultiIndex``
235246
parse_cols : int or list, default None
236247
* If None then parse all columns
237248
* If int then indicates last column to be parsed
@@ -256,6 +267,9 @@ def parse(self, sheetname=0, header=0, skiprows=None, skip_footer=0,
256267
has_index_names : boolean, default False
257268
True if the cols defined in index_col have an index name and are
258269
not in the header
270+
has_header_names : boolean, default False
271+
True if rows defined in header have names, in the leftmost data
272+
columns. Reads format output by `to_excel`
259273
verbose : boolean, default False
260274
Set to True to print a single statement when reading each
261275
excel sheet.
@@ -270,10 +284,17 @@ def parse(self, sheetname=0, header=0, skiprows=None, skip_footer=0,
270284
if skipfooter is not None:
271285
skip_footer = skipfooter
272286

287+
if not com.is_list_like(header) and has_header_names:
288+
raise ValueError("column names can only be read when the file"
289+
"contains `MultIndex` columns with a list"
290+
"of columns that making up the index "
291+
"passed in the `header` parameter")
292+
273293
return self._parse_excel(sheetname=sheetname, header=header,
274294
skiprows=skiprows,
275295
index_col=index_col,
276296
has_index_names=has_index_names,
297+
has_header_names=has_header_names,
277298
parse_cols=parse_cols,
278299
parse_dates=parse_dates,
279300
date_parser=date_parser, na_values=na_values,
@@ -320,7 +341,7 @@ def _parse_excel(self, sheetname=0, header=0, skiprows=None, skip_footer=0,
320341
index_col=None, has_index_names=None, parse_cols=None,
321342
parse_dates=False, date_parser=None, na_values=None,
322343
thousands=None, chunksize=None, convert_float=True,
323-
verbose=False, **kwds):
344+
has_header_names=False, verbose=False, **kwds):
324345
import xlrd
325346
from xlrd import (xldate, XL_CELL_DATE,
326347
XL_CELL_ERROR, XL_CELL_BOOLEAN,
@@ -418,8 +439,37 @@ def _parse_cell(cell_contents,cell_typ):
418439
if sheet.nrows == 0:
419440
return DataFrame()
420441

442+
# forward fill and pull out names for MultiIndex column
443+
header_names = None
421444
if header is not None:
422-
data[header] = _trim_excel_header(data[header])
445+
if com.is_list_like(header):
446+
header_names = []
447+
for row in header:
448+
if com.is_integer(skiprows):
449+
row += skiprows
450+
data[row] = _fill_mi_header(data[row])
451+
header_name, data[row] = _pop_header_name(data[row], index_col)
452+
header_names.append(header_name)
453+
else:
454+
data[header] = _trim_excel_header(data[header])
455+
456+
# forward fill values for MultiIndex index
457+
if com.is_list_like(index_col):
458+
if not com.is_list_like(header):
459+
offset = 1 + header
460+
else:
461+
offset = 1 + max(header)
462+
463+
for col in index_col:
464+
last = data[offset][col]
465+
for row in range(offset + 1, len(data)):
466+
if data[row][col] == '' or data[row][col] is None:
467+
data[row][col] = last
468+
else:
469+
last = data[row][col]
470+
471+
if index_col is not None:
472+
has_index_names = True
423473

424474
parser = TextParser(data, header=header, index_col=index_col,
425475
has_index_names=has_index_names,
@@ -433,6 +483,7 @@ def _parse_cell(cell_contents,cell_typ):
433483
**kwds)
434484

435485
output[asheetname] = parser.read()
486+
output[asheetname].columns = output[asheetname].columns.set_names(header_names)
436487

437488
if ret_dict:
438489
return output
@@ -463,6 +514,29 @@ def _trim_excel_header(row):
463514
row = row[1:]
464515
return row
465516

517+
def _fill_mi_header(row):
518+
# forward fill blanks entries
519+
# from headers if parsing as MultiIndex
520+
last = row[0]
521+
for i in range(1, len(row)):
522+
if row[i] == '' or row[i] is None:
523+
row[i] = last
524+
else:
525+
last = row[i]
526+
return row
527+
528+
# fill blank if index_col not None
529+
def _pop_header_name(row, index_col):
530+
""" (header, new_data) for header rows in MultiIndex parsing"""
531+
none_fill = lambda x: None if x == '' else x
532+
533+
if index_col is None:
534+
# no index col specified, trim data for inference path
535+
return none_fill(row[0]), row[1:]
536+
else:
537+
# pop out header name and fill w/ blank
538+
i = index_col if not com.is_list_like(index_col) else max(index_col)
539+
return none_fill(row[i]), row[:i] + [''] + row[i+1:]
466540

467541
def _conv_value(val):
468542
# Convert numpy types to Python types for the Excel writers.
29.5 KB
Binary file not shown.
14 KB
Binary file not shown.

0 commit comments

Comments
 (0)