
Commit e067b61

Merge pull request #5298 from jreback/mi_csv

BUG: parser can handle a common-format multi-column index (no row index cols) (GH4702)

2 parents: 1d411b9 + 3edc336

6 files changed: +169 −31 lines

doc/source/io.rst  (+24 −7)

@@ -890,6 +890,22 @@ of tupleizing columns, specify ``tupleize_cols=True``.
 
     print(open('mi.csv').read())
     pd.read_csv('mi.csv', header=[0,1,2,3], index_col=[0,1])
 
+Starting in 0.13.0, ``read_csv`` will be able to interpret a more common format
+of multi-column indices.
+
+.. ipython:: python
+   :suppress:
+
+   data = ",a,a,a,b,c,c\n,q,r,s,t,u,v\none,1,2,3,4,5,6\ntwo,7,8,9,10,11,12"
+   fh = open('mi2.csv', 'w')
+   fh.write(data)
+   fh.close()
+
+.. ipython:: python
+
+   print(open('mi2.csv').read())
+   pd.read_csv('mi2.csv', header=[0,1], index_col=0)
+
 Note: If an ``index_col`` is not specified (e.g. you don't have an index, or wrote it
 with ``df.to_csv(..., index=False)``), then any ``names`` on the columns index will be *lost*.

@@ -898,6 +914,7 @@ with ``df.to_csv(..., index=False)``), then any ``names`` on the columns index wi
 
    import os
    os.remove('mi.csv')
+   os.remove('mi2.csv')
 
 .. _io.sniff:

(The remaining hunks in this file only strip trailing whitespace; each ``-``/``+`` pair below differs only by a trailing space.)

@@ -1069,7 +1086,7 @@ Note ``NaN``'s, ``NaT``'s and ``None`` will be converted to ``null`` and ``datet
 Orient Options
 ++++++++++++++
 
-There are a number of different options for the format of the resulting JSON 
+There are a number of different options for the format of the resulting JSON
 file / string. Consider the following DataFrame and Series:
 
 .. ipython:: python

@@ -1080,7 +1097,7 @@ file / string. Consider the following DataFrame and Series:
     sjo = Series(dict(x=15, y=16, z=17), name='D')
     sjo
 
-**Column oriented** (the default for ``DataFrame``) serialises the data as 
+**Column oriented** (the default for ``DataFrame``) serialises the data as
 nested JSON objects with column labels acting as the primary index:
 
 .. ipython:: python

@@ -1113,7 +1130,7 @@ values only, column and index labels are not included:
     dfjo.to_json(orient="values")
     # Not available for Series
 
-**Split oriented** serialises to a JSON object containing separate entries for 
+**Split oriented** serialises to a JSON object containing separate entries for
 values, index and columns. Name is also included for ``Series``:
 
 .. ipython:: python

@@ -1123,7 +1140,7 @@ values, index and columns. Name is also included for ``Series``:
 
 .. note::
 
-  Any orient option that encodes to a JSON object will not preserve the ordering of 
+  Any orient option that encodes to a JSON object will not preserve the ordering of
   index and column labels during round-trip serialisation. If you wish to preserve
   label ordering use the `split` option as it uses ordered containers.

@@ -1351,7 +1368,7 @@ The Numpy Parameter
 
 If ``numpy=True`` is passed to ``read_json`` an attempt will be made to sniff
 an appropriate dtype during deserialisation and to subsequently decode directly
-to numpy arrays, bypassing the need for intermediate Python objects. 
+to numpy arrays, bypassing the need for intermediate Python objects.
 
 This can provide speedups if you are deserialising a large amount of numeric
 data:

@@ -1375,7 +1392,7 @@ data:
 The speedup is less noticeable for smaller datasets:
 
 .. ipython:: python
-
+
 
     jsonfloats = dffloats.head(100).to_json()
 
 .. ipython:: python

@@ -1399,7 +1416,7 @@ The speedup is less noticeable for smaller datasets:
 
 - labels are ordered. Labels are only read from the first container, it is assumed
   that each subsequent row / column has been encoded in the same order. This should be satisfied if the
-  data was encoded using ``to_json`` but may not be the case if the JSON 
+  data was encoded using ``to_json`` but may not be the case if the JSON
   is from another source.
 
 .. ipython:: python
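The new behaviour documented above is easy to try outside the docs build. A minimal sketch, assuming a pandas new enough to include this fix (the variable names are illustrative):

```python
from io import StringIO

import pandas as pd

# The "common" multi-column-index format: two header rows and no separate
# third row naming the index levels.
data = (",a,a,a,b,c,c\n"
        ",q,r,s,t,u,v\n"
        "one,1,2,3,4,5,6\n"
        "two,7,8,9,10,11,12")

df = pd.read_csv(StringIO(data), header=[0, 1], index_col=0)

print(df.columns.nlevels)         # 2
print(df.shape)                   # (2, 6)
print(df.loc['two', ('c', 'v')])  # 12
```

Note that the first data row (`one,1,...`) is no longer consumed as a row of index names, which was the bug tracked in GH4702.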

doc/source/release.rst  (+2 −0)

@@ -634,6 +634,8 @@ Bug Fixes
   - Fixed seg fault in C parser caused by passing more names than columns in
     the file. (:issue:`5156`)
   - Fix ``Series.isin`` with date/time-like dtypes (:issue:`5021`)
+  - C and Python Parser can now handle the more common multi-index column format
+    which doesn't have a row for index names (:issue:`4702`)
 
 pandas 0.12.0
 -------------

pandas/io/parsers.py  (+18 −3)

@@ -569,7 +569,6 @@ def _clean_options(self, options, engine):
         skiprows = set() if skiprows is None else set(skiprows)
 
         # put stuff back
-        result['index_col'] = index_col
         result['names'] = names
         result['converters'] = converters
         result['na_values'] = na_values

@@ -641,7 +640,7 @@ def __init__(self, kwds):
         self.orig_names = None
         self.prefix = kwds.pop('prefix', None)
 
-        self.index_col = kwds.pop('index_col', None)
+        self.index_col = kwds.get('index_col', None)
         self.index_names = None
         self.col_names = None

@@ -1455,6 +1454,7 @@ def _convert_data(self, data):
     def _infer_columns(self):
         names = self.names
         num_original_columns = 0
+        clear_buffer = True
         if self.header is not None:
             header = self.header

@@ -1473,13 +1473,15 @@ def _infer_columns(self):
             while self.pos <= hr:
                 line = self._next_line()
 
+                unnamed_count = 0
                 this_columns = []
                 for i, c in enumerate(line):
                     if c == '':
                         if have_mi_columns:
                             this_columns.append('Unnamed: %d_level_%d' % (i, level))
                         else:
                             this_columns.append('Unnamed: %d' % i)
+                        unnamed_count += 1
                     else:
                         this_columns.append(c)

@@ -1490,12 +1492,25 @@ def _infer_columns(self):
                         if cur_count > 0:
                             this_columns[i] = '%s.%d' % (col, cur_count)
                         counts[col] = cur_count + 1
+                elif have_mi_columns:
+
+                    # if we have grabbed an extra line, but it's not in our
+                    # format, save it in the buffer and create a blank extra
+                    # line for the rest of the parsing code
+                    if hr == header[-1]:
+                        lc = len(this_columns)
+                        ic = len(self.index_col) if self.index_col is not None else 0
+                        if lc != unnamed_count and lc - ic > unnamed_count:
+                            clear_buffer = False
+                            this_columns = [None] * lc
+                            self.buf = [self.buf[-1]]
 
                 columns.append(this_columns)
                 if len(columns) == 1:
                     num_original_columns = len(this_columns)
 
-        self._clear_buffer()
+        if clear_buffer:
+            self._clear_buffer()
 
         if names is not None:
             if (self.usecols is not None and len(names) != len(self.usecols)) \
pandas/io/tests/test_parsers.py  (+90 −6)

@@ -1215,29 +1215,113 @@ def test_header_multi_index(self):
 R_l0_g4,R_l1_g4,R4C0,R4C1,R4C2
 """
 
-        df = read_csv(StringIO(data), header=[0, 2, 3, 4], index_col=[0, 1], tupleize_cols=False)
+        df = self.read_csv(StringIO(data), header=[0, 2, 3, 4], index_col=[0, 1], tupleize_cols=False)
         tm.assert_frame_equal(df, expected)
 
         # skipping lines in the header
-        df = read_csv(StringIO(data), header=[0, 2, 3, 4], index_col=[0, 1], tupleize_cols=False)
+        df = self.read_csv(StringIO(data), header=[0, 2, 3, 4], index_col=[0, 1], tupleize_cols=False)
         tm.assert_frame_equal(df, expected)
 
         #### invalid options ####
 
         # no as_recarray
-        self.assertRaises(ValueError, read_csv, StringIO(data), header=[0,1,2,3],
+        self.assertRaises(ValueError, self.read_csv, StringIO(data), header=[0,1,2,3],
                           index_col=[0,1], as_recarray=True, tupleize_cols=False)
 
         # names
-        self.assertRaises(ValueError, read_csv, StringIO(data), header=[0,1,2,3],
+        self.assertRaises(ValueError, self.read_csv, StringIO(data), header=[0,1,2,3],
                           index_col=[0,1], names=['foo','bar'], tupleize_cols=False)
         # usecols
-        self.assertRaises(ValueError, read_csv, StringIO(data), header=[0,1,2,3],
+        self.assertRaises(ValueError, self.read_csv, StringIO(data), header=[0,1,2,3],
                           index_col=[0,1], usecols=['foo','bar'], tupleize_cols=False)
         # non-numeric index_col
-        self.assertRaises(ValueError, read_csv, StringIO(data), header=[0,1,2,3],
+        self.assertRaises(ValueError, self.read_csv, StringIO(data), header=[0,1,2,3],
                           index_col=['foo','bar'], tupleize_cols=False)
 
+    def test_header_multiindex_common_format(self):
+
+        df = DataFrame([[1,2,3,4,5,6],[7,8,9,10,11,12]],
+                       index=['one','two'],
+                       columns=MultiIndex.from_tuples([('a','q'),('a','r'),('a','s'),
+                                                       ('b','t'),('c','u'),('c','v')]))
+
+        # to_csv
+        data = """,a,a,a,b,c,c
+,q,r,s,t,u,v
+,,,,,,
+one,1,2,3,4,5,6
+two,7,8,9,10,11,12"""
+
+        result = self.read_csv(StringIO(data), header=[0,1], index_col=0)
+        tm.assert_frame_equal(df, result)
+
+        # common
+        data = """,a,a,a,b,c,c
+,q,r,s,t,u,v
+one,1,2,3,4,5,6
+two,7,8,9,10,11,12"""
+
+        result = self.read_csv(StringIO(data), header=[0,1], index_col=0)
+        tm.assert_frame_equal(df, result)
+
+        # common, no index_col
+        data = """a,a,a,b,c,c
+q,r,s,t,u,v
+1,2,3,4,5,6
+7,8,9,10,11,12"""
+
+        result = self.read_csv(StringIO(data), header=[0,1], index_col=None)
+        tm.assert_frame_equal(df.reset_index(drop=True), result)
+
+        # malformed case 1
+        expected = DataFrame(np.array([[2, 3, 4, 5, 6],
+                                       [8, 9, 10, 11, 12]]),
+                             index=Index([1, 7]),
+                             columns=MultiIndex(levels=[[u('a'), u('b'), u('c')],
+                                                        [u('r'), u('s'), u('t'), u('u'), u('v')]],
+                                                labels=[[0, 0, 1, 2, 2], [0, 1, 2, 3, 4]],
+                                                names=[u('a'), u('q')]))
+
+        data = """a,a,a,b,c,c
+q,r,s,t,u,v
+1,2,3,4,5,6
+7,8,9,10,11,12"""
+
+        result = self.read_csv(StringIO(data), header=[0,1], index_col=0)
+        tm.assert_frame_equal(expected, result)
+
+        # malformed case 2
+        expected = DataFrame(np.array([[2, 3, 4, 5, 6],
+                                       [8, 9, 10, 11, 12]]),
+                             index=Index([1, 7]),
+                             columns=MultiIndex(levels=[[u('a'), u('b'), u('c')],
+                                                        [u('r'), u('s'), u('t'), u('u'), u('v')]],
+                                                labels=[[0, 0, 1, 2, 2], [0, 1, 2, 3, 4]],
+                                                names=[None, u('q')]))
+
+        data = """,a,a,b,c,c
+q,r,s,t,u,v
+1,2,3,4,5,6
+7,8,9,10,11,12"""
+
+        result = self.read_csv(StringIO(data), header=[0,1], index_col=0)
+        tm.assert_frame_equal(expected, result)
+
+        # mi on columns and index (malformed)
+        expected = DataFrame(np.array([[3, 4, 5, 6],
+                                       [9, 10, 11, 12]]),
+                             index=MultiIndex(levels=[[1, 7], [2, 8]],
+                                              labels=[[0, 1], [0, 1]]),
+                             columns=MultiIndex(levels=[[u('a'), u('b'), u('c')],
+                                                        [u('s'), u('t'), u('u'), u('v')]],
+                                                labels=[[0, 1, 2, 2], [0, 1, 2, 3]],
+                                                names=[None, u('q')]))
+
+        data = """,a,a,b,c,c
+q,r,s,t,u,v
+1,2,3,4,5,6
+7,8,9,10,11,12"""
+
+        result = self.read_csv(StringIO(data), header=[0,1], index_col=[0, 1])
+        tm.assert_frame_equal(expected, result)
+
     def test_pass_names_with_index(self):
         lines = self.data1.split('\n')
         no_header = '\n'.join(lines[1:])
pandas/parser.pyx  (+20 −1)

@@ -250,6 +250,7 @@ cdef class TextReader:
         object memory_map
         object as_recarray
         object header, orig_header, names, header_start, header_end
+        object index_col
         object low_memory
         object skiprows
         object compact_ints, use_unsigned

@@ -266,6 +267,7 @@ cdef class TextReader:
                  header=0,
                  header_start=0,
                  header_end=0,
+                 index_col=None,
                  names=None,
 
                  memory_map=False,

@@ -439,6 +441,8 @@ cdef class TextReader:
         # XXX
         self.noconvert = set()
 
+        self.index_col = index_col
+
         #----------------------------------------
         # header stuff

@@ -574,7 +578,7 @@ cdef class TextReader:
         # header is now a list of lists, so field_count should use header[0]
 
         cdef:
-            size_t i, start, data_line, field_count, passed_count, hr
+            size_t i, start, data_line, field_count, passed_count, hr, unnamed_count
             char *word
             object name
             int status

@@ -606,6 +610,7 @@ cdef class TextReader:
 
             # TODO: Py3 vs. Py2
             counts = {}
+            unnamed_count = 0
            for i in range(field_count):
                word = self.parser.words[start + i]

@@ -623,6 +628,7 @@ cdef class TextReader:
                    name = 'Unnamed: %d_level_%d' % (i, level)
                else:
                    name = 'Unnamed: %d' % i
+                unnamed_count += 1
 
                count = counts.get(name, 0)
                if count > 0 and self.mangle_dupe_cols and not self.has_mi_columns:

@@ -631,6 +637,19 @@ cdef class TextReader:
                this_header.append(name)
                counts[name] = count + 1
 
+            if self.has_mi_columns:
+
+                # if we have grabbed an extra line, but it's not in our format,
+                # back up one row and emit a blank extra header line for the
+                # rest of the parsing code
+                if hr == self.header[-1]:
+                    lc = len(this_header)
+                    ic = len(self.index_col) if self.index_col is not None else 0
+                    if lc != unnamed_count and lc - ic > unnamed_count:
+                        hr -= 1
+                        self.parser_start -= 1
+                        this_header = [None] * lc
+
            data_line = hr + 1
            header.append(this_header)