
Commit b0dadc5

BUG: unnamed columns in a MultiIndex will be named like ``Unnamed: 2_level_0``, so they are not duplicated across levels
ENH: add option ``multi_index_columns_compat`` to both ``to_csv`` and ``read_csv`` (default False); when True, it forces the previous behavior of writing MultiIndex columns as a list of tuples, and of reading them back as a list of tuples (NOT as a MultiIndex)
DOC: add the compat flags to io.rst
1 parent d6573f5 commit b0dadc5

File tree

6 files changed: +69 -16 lines changed


doc/source/io.rst

+7

@@ -115,6 +115,10 @@ They can take a number of arguments:
   - ``error_bad_lines``: if False then any lines causing an error will be skipped :ref:`bad lines <io.bad_lines>`
   - ``usecols``: a subset of columns to return, results in much faster parsing
     time and lower memory usage.
+  - ``mangle_dupe_cols``: boolean, default True, duplicate columns will be specified
+    as 'X.0'...'X.N', rather than 'X'...'X'
+  - ``multi_index_columns_compat``: boolean, default False, leave a list of tuples on the columns
+    as is (default is to convert them to a MultiIndex on the columns)

   .. ipython:: python
      :suppress:

@@ -271,6 +275,9 @@ specified.
    data = 'C0,C_l0_g0,C_l0_g1\nC1,C_l1_g0,C_l1_g1\nR0,,\nR_l0_g0,R0C0,R0C1\nR_l0_g1,R1C0,R1C1\nR_l0_g2,R2C0,R2C1\n'
    pd.read_csv(StringIO(data), header=[0,1], index_col=[0])

+You can pass ``multi_index_columns_compat=True`` to preserve the pre-0.12 behavior of
+not converting a list of tuples in the columns to a MultiIndex.
+
 .. _io.usecols:

 Filtering columns (``usecols``)
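For illustration, a minimal sketch of the two read modes described above, reusing the data string from the example in this section (assumes this commit's 0.12-dev API; the StringIO import shown is for Python 3):

import pandas as pd
from io import StringIO  # on Python 2, use: from StringIO import StringIO

data = 'C0,C_l0_g0,C_l0_g1\nC1,C_l1_g0,C_l1_g1\nR0,,\nR_l0_g0,R0C0,R0C1\nR_l0_g1,R1C0,R1C1\nR_l0_g2,R2C0,R2C1\n'

# default: the two header rows are combined into a MultiIndex on the columns
df = pd.read_csv(StringIO(data), header=[0, 1], index_col=[0])

# compat mode: the tuple column labels are left as-is (pre-0.12 behavior)
df_compat = pd.read_csv(StringIO(data), header=[0, 1], index_col=[0],
                        multi_index_columns_compat=True)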

pandas/core/format.py

+7 -4

@@ -772,9 +772,10 @@ def grouper(x):
 class CSVFormatter(object):

     def __init__(self, obj, path_or_buf, sep=",", na_rep='', float_format=None,
-                 cols=None, header=True, index=True, index_label=None,
-                 mode='w', nanRep=None, encoding=None, quoting=None,
-                 line_terminator='\n', chunksize=None, engine=None):
+                 cols=None, header=True, index=True, index_label=None,
+                 mode='w', nanRep=None, encoding=None, quoting=None,
+                 line_terminator='\n', chunksize=None, engine=None,
+                 multi_index_columns_compat=False):

         self.engine = engine  # remove for 0.12

@@ -803,6 +804,7 @@ def __init__(self, obj, path_or_buf, sep=",", na_rep='', float_format=None,
             msg = "columns.is_unique == False not supported with engine='python'"
             raise NotImplementedError(msg)

+        self.multi_index_columns_compat = multi_index_columns_compat
         if cols is not None:
             if isinstance(cols, Index):
                 cols = cols.to_native_types(na_rep=na_rep, float_format=float_format)
@@ -959,7 +961,8 @@ def _save_header(self):
         index_label = self.index_label
         cols = self.cols
         header = self.header
-        has_mi_columns = isinstance(obj.columns, MultiIndex)
+        has_mi_columns = (isinstance(obj.columns, MultiIndex)
+                          and not self.multi_index_columns_compat)
         encoded_labels = []

         has_aliases = isinstance(header, (tuple, list, np.ndarray))
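To make the gating above concrete, here is a small self-contained sketch mirroring the condition added to CSVFormatter._save_header (the helper name is invented for the example):

from pandas import MultiIndex

def uses_expanded_header(columns, multi_index_columns_compat):
    # mirror of the new check: only write one header row per column level
    # when the columns are a MultiIndex and the compat flag is off
    return isinstance(columns, MultiIndex) and not multi_index_columns_compat

cols = MultiIndex.from_tuples([('A', 'x'), ('A', 'y')])
print(uses_expanded_header(cols, False))  # True  -> expanded multi-row header
print(uses_expanded_header(cols, True))   # False -> legacy single row of tuple labels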

pandas/core/frame.py

+7 -2

@@ -1391,7 +1391,8 @@ def to_panel(self):
     def to_csv(self, path_or_buf, sep=",", na_rep='', float_format=None,
                cols=None, header=True, index=True, index_label=None,
                mode='w', nanRep=None, encoding=None, quoting=None,
-               line_terminator='\n', chunksize=None, **kwds):
+               line_terminator='\n', chunksize=None,
+               multi_index_columns_compat=False, **kwds):
         """
         Write DataFrame to a comma-separated values (csv) file

@@ -1429,6 +1430,9 @@ def to_csv(self, path_or_buf, sep=",", na_rep='', float_format=None,
         quoting : optional constant from csv module
             defaults to csv.QUOTE_MINIMAL
         chunksize : rows to write at a time
+        multi_index_columns_compat : boolean, default False
+            write MultiIndex columns as a list of tuples (if True) or in the
+            new expanded format (if False)
         """
         if nanRep is not None:  # pragma: no cover
             import warnings
@@ -1445,7 +1449,8 @@ def to_csv(self, path_or_buf, sep=",", na_rep='', float_format=None,
                                    float_format=float_format, cols=cols,
                                    header=header, index=index,
                                    index_label=index_label, mode=mode,
-                                   chunksize=chunksize, engine=kwds.get("engine"))
+                                   chunksize=chunksize, engine=kwds.get("engine"),
+                                   multi_index_columns_compat=multi_index_columns_compat)
         formatter.save()

     def to_excel(self, excel_writer, sheet_name='sheet1', na_rep='',
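A hedged usage sketch of the new to_csv keyword documented above (the file names are placeholders; the output formats follow the commit message):

import pandas as pd

df = pd.DataFrame([[1, 2], [3, 4]],
                  columns=pd.MultiIndex.from_tuples([('A', 'x'), ('A', 'y')]))

# default: each column level is written as its own header row (expanded format)
df.to_csv('expanded.csv')

# compat mode: a single header row containing the tuple labels, as before 0.12
df.to_csv('compat.csv', multi_index_columns_compat=True)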

pandas/io/parsers.py

+19 -5

@@ -127,6 +127,11 @@ class DateConversionError(Exception):
 usecols : array-like
     Return a subset of the columns.
     Results in much faster parsing time and lower memory usage.
+mangle_dupe_cols : boolean, default True
+    Duplicate columns will be specified as 'X.0'...'X.N', rather than 'X'...'X'
+multi_index_columns_compat : boolean, default False
+    Leave a list of tuples on the columns as is (default is to convert to
+    a MultiIndex on the columns)

 Returns
 -------
@@ -294,6 +299,7 @@ def _read(filepath_or_buffer, kwds):
     'squeeze': False,
     'compression': None,
     'mangle_dupe_cols': True,
+    'multi_index_columns_compat': False,
 }


@@ -380,7 +386,8 @@ def parser_f(filepath_or_buffer,
              verbose=False,
              encoding=None,
              squeeze=False,
-             mangle_dupe_cols=True
+             mangle_dupe_cols=True,
+             multi_index_columns_compat=False,
              ):

     # Alias sep -> delimiter.
@@ -438,7 +445,7 @@ def parser_f(filepath_or_buffer,
                 error_bad_lines=error_bad_lines,
                 low_memory=low_memory,
                 buffer_lines=buffer_lines,
-                mangle_dupe_cols=mangle_dupe_cols
+                mangle_dupe_cols=mangle_dupe_cols,
                 )

     return _read(filepath_or_buffer, kwds)
@@ -730,6 +737,7 @@ def __init__(self, kwds):
         self.na_values = kwds.get('na_values')
         self.true_values = kwds.get('true_values')
         self.false_values = kwds.get('false_values')
+        self.multi_index_columns_compat = kwds.get('multi_index_columns_compat', False)

         self._date_conv = _make_date_converter(date_parser=self.date_parser,
                                                dayfirst=self.dayfirst)
@@ -786,7 +794,8 @@ def extract(r):

     def _maybe_make_multi_index_columns(self, columns, col_names=None):
         # possibly create a column mi here
-        if len(columns) and not isinstance(columns, MultiIndex) and all([isinstance(c, tuple) for c in columns]):
+        if not self.multi_index_columns_compat and len(columns) and not isinstance(
+                columns, MultiIndex) and all([isinstance(c, tuple) for c in columns]):
             columns = MultiIndex.from_tuples(columns, names=col_names)
         return columns

@@ -1430,12 +1439,14 @@ def _infer_columns(self):

         # we have mi columns, so read an extra line
         if isinstance(header, (list, tuple, np.ndarray)):
+            have_mi_columns = True
             header = list(header) + [header[-1] + 1]
         else:
+            have_mi_columns = False
             header = [header]

         columns = []
-        for hr in header:
+        for level, hr in enumerate(header):

             if len(self.buf) > 0:
                 line = self.buf[0]
@@ -1448,7 +1459,10 @@ def _infer_columns(self):
                 this_columns = []
                 for i, c in enumerate(line):
                     if c == '':
-                        this_columns.append('Unnamed: %d' % i)
+                        if have_mi_columns:
+                            this_columns.append('Unnamed: %d_level_%d' % (i, level))
+                        else:
+                            this_columns.append('Unnamed: %d' % i)
                     else:
                         this_columns.append(c)
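To illustrate the new placeholder naming added to _infer_columns above, a stand-alone sketch of the rule (the function exists only for this example):

def placeholder_name(i, level, have_mi_columns):
    # blank header cells get level-qualified names when there are multiple
    # header rows, so placeholders no longer collide across levels
    if have_mi_columns:
        return 'Unnamed: %d_level_%d' % (i, level)
    return 'Unnamed: %d' % i

print(placeholder_name(2, 0, True))   # Unnamed: 2_level_0
print(placeholder_name(2, 1, True))   # Unnamed: 2_level_1
print(placeholder_name(2, 0, False))  # Unnamed: 2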

pandas/src/parser.pyx

+12 -4

@@ -232,7 +232,7 @@ cdef class TextReader:
     cdef:
         parser_t *parser
         object file_handle
-        bint factorize, na_filter, verbose, has_usecols
+        bint factorize, na_filter, verbose, has_usecols, has_mi_columns
         int parser_start
         list clocks
         char *c_encoding
@@ -252,6 +252,7 @@ cdef class TextReader:
         object encoding
         object compression
        object mangle_dupe_cols
+        object multi_index_columns_compat
         set noconvert, usecols

     def __cinit__(self, source,
@@ -304,12 +305,14 @@ cdef class TextReader:
                  skiprows=None,
                  skip_footer=0,
                  verbose=False,
-                 mangle_dupe_cols=True):
+                 mangle_dupe_cols=True,
+                 multi_index_columns_compat=False):

         self.parser = parser_new()
         self.parser.chunksize = tokenize_chunksize

         self.mangle_dupe_cols = mangle_dupe_cols
+        self.multi_index_columns_compat = multi_index_columns_compat

         # For timekeeping
         self.clocks = []
@@ -437,6 +440,7 @@ cdef class TextReader:
         self.leading_cols = 0

         # TODO: no header vs. header is not the first row
+        self.has_mi_columns = 0
         if header is None:
             # sentinel value
             self.parser.header_start = -1
@@ -454,6 +458,7 @@ cdef class TextReader:
                 self.parser.header_end = header[-1]
                 self.parser.header = header[0]
                 self.parser_start = header[-1] + 1
+                self.has_mi_columns = 1
                 self.header = header
             else:
                 self.parser.header_start = header
@@ -570,7 +575,7 @@ cdef class TextReader:
         if self.parser.header_start >= 0:

             # Header is in the file
-            for hr in self.header:
+            for level, hr in enumerate(self.header):

                 this_header = []

@@ -600,7 +605,10 @@ cdef class TextReader:
                                          self.c_encoding, errors)

                 if name == '':
-                    name = 'Unnamed: %d' % i
+                    if self.has_mi_columns:
+                        name = 'Unnamed: %d_level_%d' % (i, level)
+                    else:
+                        name = 'Unnamed: %d' % i

                 count = counts.get(name, 0)
                 if count > 0 and self.mangle_dupe_cols:
pandas/tests/test_frame.py

+17 -1

@@ -4991,7 +4991,7 @@ def test_to_csv_multiindex(self):
         with ensure_clean(pname) as path:
             # GH3571, GH1651, GH3141

-            # column & index are multi-iindex
+            # column & index are multi-index
             df = mkdf(5, 3, r_idx_nlevels=2, c_idx_nlevels=4)
             df.to_csv(path)
             result = read_csv(path, header=[0, 1, 2, 3], index_col=[0, 1])
@@ -5003,6 +5003,22 @@ def test_to_csv_multiindex(self):
             result = read_csv(path, header=[0, 1, 2, 3], index_col=0)
             assert_frame_equal(df, result)

+            # dup column names?
+            df = mkdf(5, 3, r_idx_nlevels=3, c_idx_nlevels=4)
+            df.to_csv(path)
+            result = read_csv(path, header=[0, 1, 2, 3], index_col=[0, 1])
+            result.columns = ['R2', 'A', 'B', 'C']
+            new_result = result.reset_index().set_index(['R0', 'R1', 'R2'])
+            new_result.columns = df.columns
+            assert_frame_equal(df, new_result)
+
+            # column & index are multi-index (compatibility)
+            df = mkdf(5, 3, r_idx_nlevels=2, c_idx_nlevels=4)
+            df.to_csv(path, multi_index_columns_compat=True)
+            result = read_csv(path, header=0, index_col=[0, 1], multi_index_columns_compat=True)
+            result.columns = df.columns
+            assert_frame_equal(df, result)
+
         with ensure_clean(pname) as path:
             # empty
             tsframe[:0].to_csv(path)
