Merge pull request #10570 from flamingbear/10564-allow-multiindex-excel-writing

jorisvandenbossche · jorisvandenbossche · commit b63206b87587 · 2015-08-20T22:33:14.000+02:00
COMPAT: Allow multi-indexes to be written to excel
diff --git a/doc/source/whatsnew/v0.17.0.txt b/doc/source/whatsnew/v0.17.0.txt
@@ -521,6 +521,7 @@ Other API Changes
 - Enable serialization of lists and dicts to strings in ExcelWriter (:issue:`8188`)
 - Allow passing `kwargs` to the interpolation methods (:issue:`10378`).
 - Serialize metadata properties of subclasses of pandas objects (:issue:`10553`).
+- Allow ``DataFrame`` with ``MultiIndex`` columns to be written to Excel (:issue:`10564`). This was changed in 0.16.2 as the read-back method could not always guarantee perfect fidelity (:issue:`9794`).
 - ``Categorical.unique`` now returns new ``Categorical`` which ``categories`` and ``codes`` are unique, rather than returning ``np.array`` (:issue:`10508`)
 
    - unordered category: values and categories are sorted by appearance order.
diff --git a/pandas/core/format.py b/pandas/core/format.py
@@ -4,6 +4,7 @@
 # pylint: disable=W0141
 
 import sys
+import warnings
 
 from pandas.core.base import PandasObject
 from pandas.core.common import adjoin, notnull
@@ -1640,11 +1641,14 @@ class ExcelFormatter(object):
     inf_rep : string, default `'inf'`
         representation for np.inf values (which aren't representable in Excel)
         A `'-'` sign will be added in front of -inf.
+    verbose : boolean, default True
+        If True, warn user that the resulting output file may not be
+        re-read or parsed directly by pandas.
     """
 
     def __init__(self, df, na_rep='', float_format=None, cols=None,
                  header=True, index=True, index_label=None, merge_cells=False,
-                 inf_rep='inf'):
+                 inf_rep='inf', verbose=True):
         self.df = df
         self.rowcounter = 0
         self.na_rep = na_rep
@@ -1657,6 +1661,7 @@ def __init__(self, df, na_rep='', float_format=None, cols=None,
         self.header = header
         self.merge_cells = merge_cells
         self.inf_rep = inf_rep
+        self.verbose = verbose
 
     def _format_value(self, val):
         if lib.checknull(val):
@@ -1671,6 +1676,17 @@ def _format_value(self, val):
         return val
 
     def _format_header_mi(self):
+
+        if self.columns.nlevels > 1:
+            if not self.index:
+                raise NotImplementedError("Writing to Excel with MultiIndex"
+                                          " columns and no index ('index'=False) "
+                                          "is not yet implemented.")
+            elif self.index and self.verbose:
+                warnings.warn("Writing to Excel with MultiIndex columns is a"
+                              " one way serializable operation. You will not"
+                              " be able to re-read or parse the output file.")
+
         has_aliases = isinstance(self.header, (tuple, list, np.ndarray, Index))
         if not(has_aliases or self.header):
             return
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -1254,7 +1254,8 @@ def to_csv(self, path_or_buf=None, sep=",", na_rep='', float_format=None,
     def to_excel(self, excel_writer, sheet_name='Sheet1', na_rep='',
                  float_format=None, columns=None, header=True, index=True,
                  index_label=None, startrow=0, startcol=0, engine=None,
-                 merge_cells=True, encoding=None, inf_rep='inf'):
+                 merge_cells=True, encoding=None, inf_rep='inf',
+                 verbose=True):
         """
         Write DataFrame to a excel sheet
 
@@ -1295,6 +1296,9 @@ def to_excel(self, excel_writer, sheet_name='Sheet1', na_rep='',
         inf_rep : string, default 'inf'
             Representation for infinity (there is no native representation for
             infinity in Excel)
+        verbose : boolean, default True
+            If True, warn user that the resulting output file may not be
+            re-read or parsed directly by pandas.
 
         Notes
         -----
@@ -1311,12 +1315,8 @@ def to_excel(self, excel_writer, sheet_name='Sheet1', na_rep='',
         strings before writing.
         """
         from pandas.io.excel import ExcelWriter
-        if self.columns.nlevels > 1:
-            raise NotImplementedError("Writing as Excel with a MultiIndex is "
-                                      "not yet implemented.")
-
         need_save = False
-        if encoding == None:
+        if encoding is None:
             encoding = 'ascii'
 
         if isinstance(excel_writer, compat.string_types):
@@ -1331,7 +1331,7 @@ def to_excel(self, excel_writer, sheet_name='Sheet1', na_rep='',
                                        index=index,
                                        index_label=index_label,
                                        merge_cells=merge_cells,
-                                       inf_rep=inf_rep)
+                                       inf_rep=inf_rep, verbose=verbose)
         formatted_cells = formatter.get_formatted_cells()
         excel_writer.write_cells(formatted_cells, sheet_name,
                                  startrow=startrow, startcol=startcol)
diff --git a/pandas/io/tests/test_excel.py b/pandas/io/tests/test_excel.py
@@ -316,7 +316,6 @@ def test_read_from_file_url(self):
 
         tm.assert_frame_equal(url_table, local_table)
 
-
     def test_xlsx_table(self):
         _skip_if_no_xlrd()
         _skip_if_no_openpyxl()
@@ -1145,10 +1144,10 @@ def test_excel_010_hemstring(self):
         # ensure limited functionality in 0.10
         # override of #2370 until sorted out in 0.11
 
-        def roundtrip(df, header=True, parser_hdr=0):
+        def roundtrip(df, header=True, parser_hdr=0, index=True):
 
             with ensure_clean(self.ext) as path:
-                df.to_excel(path, header=header, merge_cells=self.merge_cells)
+                df.to_excel(path, header=header, merge_cells=self.merge_cells, index=index)
                 xf = pd.ExcelFile(path)
                 res = xf.parse(xf.sheet_names[0], header=parser_hdr)
                 return res
@@ -1164,7 +1163,7 @@ def roundtrip(df, header=True, parser_hdr=0):
                     #is implemented for now fixing #9794
                     if j>1:
                         with tm.assertRaises(NotImplementedError):
-                            res = roundtrip(df, use_headers)
+                            res = roundtrip(df, use_headers, index=False)
                     else:
                         res = roundtrip(df, use_headers)
 
@@ -1187,6 +1186,33 @@ def roundtrip(df, header=True, parser_hdr=0):
         self.assertEqual(res.shape, (1, 2))
         self.assertTrue(res.ix[0, 0] is not np.nan)
 
+    def test_excel_010_hemstring_raises_NotImplementedError(self):
+        # The original test failed only for j>1 and header=False,
+        # so that case is reproduced here as a minimal standalone test.
+        _skip_if_no_xlrd()
+
+        if self.merge_cells:
+            raise nose.SkipTest('Skip tests for merged MI format.')
+
+        from pandas.util.testing import makeCustomDataframe as mkdf
+        # ensure limited functionality in 0.10
+        # override of #2370 until sorted out in 0.11
+
+        def roundtrip2(df, header=True, parser_hdr=0, index=True):
+
+            with ensure_clean(self.ext) as path:
+                df.to_excel(path, header=header, merge_cells=self.merge_cells, index=index)
+                xf = pd.ExcelFile(path)
+                res = xf.parse(xf.sheet_names[0], header=parser_hdr)
+                return res
+
+        nrows = 5; ncols = 3
+        j = 2; i = 1
+        df = mkdf(nrows, ncols, r_idx_nlevels=i, c_idx_nlevels=j)
+        with tm.assertRaises(NotImplementedError):
+            res = roundtrip2(df, header=False, index=False)
+
+
     def test_duplicated_columns(self):
         # Test for issue #5235.
         _skip_if_no_xlrd()
@@ -1439,29 +1465,37 @@ class XlwtTests(ExcelWriterBase, tm.TestCase):
     engine_name = 'xlwt'
     check_skip = staticmethod(_skip_if_no_xlwt)
 
-    def test_excel_raise_not_implemented_error_on_multiindex_columns(self):
+    def test_excel_raise_error_on_multiindex_columns_and_no_index(self):
         _skip_if_no_xlwt()
-        #MultiIndex as columns is not yet implemented 9794
-        cols = pd.MultiIndex.from_tuples([('site',''),
-                                          ('2014','height'),
-                                          ('2014','weight')])
-        df = pd.DataFrame(np.random.randn(10,3), columns=cols)
+        # Writing MultiIndex columns with index=False is not yet implemented (GH 9794)
+        cols = pd.MultiIndex.from_tuples([('site', ''),
+                                          ('2014', 'height'),
+                                          ('2014', 'weight')])
+        df = pd.DataFrame(np.random.randn(10, 3), columns=cols)
         with tm.assertRaises(NotImplementedError):
             with ensure_clean(self.ext) as path:
                 df.to_excel(path, index=False)
 
+    def test_excel_warns_verbosely_on_multiindex_columns_and_index_true(self):
+        _skip_if_no_xlwt()
+        cols = pd.MultiIndex.from_tuples([('site', ''),
+                                          ('2014', 'height'),
+                                          ('2014', 'weight')])
+        df = pd.DataFrame(np.random.randn(10, 3), columns=cols)
+        with tm.assert_produces_warning(UserWarning):
+            with ensure_clean(self.ext) as path:
+                df.to_excel(path, index=True)
+
     def test_excel_multiindex_index(self):
         _skip_if_no_xlwt()
-        #MultiIndex as index works so assert no error #9794
-        cols = pd.MultiIndex.from_tuples([('site',''),
-                                          ('2014','height'),
-                                          ('2014','weight')])
-        df = pd.DataFrame(np.random.randn(3,10), index=cols)
+        # MultiIndex as index works so assert no error #9794
+        cols = pd.MultiIndex.from_tuples([('site', ''),
+                                          ('2014', 'height'),
+                                          ('2014', 'weight')])
+        df = pd.DataFrame(np.random.randn(3, 10), index=cols)
         with ensure_clean(self.ext) as path:
             df.to_excel(path, index=False)
 
-
-
     def test_to_excel_styleconverter(self):
         _skip_if_no_xlwt()