TST: more test cases

jreback · jreback · commit c64555b006fe · 2013-05-19T10:20:03.000-04:00
ENH: catching some invalid option combinations

BUG: fix as_recarray

DOC: io.rst updated
diff --git a/doc/source/io.rst b/doc/source/io.rst
@@ -57,7 +57,10 @@ They can take a number of arguments:
     specified, data types will be inferred.
   - ``header``: row number to use as the column names, and the start of the
     data.  Defaults to 0 if no ``names`` passed, otherwise ``None``. Explicitly
-    pass ``header=0`` to be able to replace existing names.
+    pass ``header=0`` to be able to replace existing names. The header can be
+    a list of integers that specify row locations for a multi-index on the columns,
+    e.g. ``[0,1,3]``. Intervening rows that are not specified will be skipped
+    (e.g. row 2 in this example is skipped).
   - ``skiprows``: A collection of numbers for rows in the file to skip. Can
     also be an integer to skip the first ``n`` rows
   - ``index_col``: column number, column name, or list of column numbers/names,
@@ -253,6 +256,21 @@ If the header is in a row other than the first, pass the row number to
     data = 'skip this skip it\na,b,c\n1,2,3\n4,5,6\n7,8,9'
     pd.read_csv(StringIO(data), header=1)
 
+.. _io.multi_index_columns:
+
+Specifying multi-index columns
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+By specifying a list of row locations for the ``header`` argument, you
+can read in a multi-index for the columns. Specifying non-consecutive
+rows will skip the intervening rows. The ``index_col`` must also be
+specified.
+
+.. ipython:: python
+
+    data = 'C0,C_l0_g0,C_l0_g1\nC1,C_l1_g0,C_l1_g1\nR0,,\nR_l0_g0,R0C0,R0C1\nR_l0_g1,R1C0,R1C1\nR_l0_g2,R2C0,R2C1\n'
+    pd.read_csv(StringIO(data), header=[0,1], index_col=[0])
+
 .. _io.usecols:
 
 Filtering columns (``usecols``)
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
@@ -52,9 +52,11 @@ class DateConversionError(Exception):
 dialect : string or csv.Dialect instance, default None
     If None defaults to Excel dialect. Ignored if sep longer than 1 char
     See csv.Dialect documentation for more details
-header : int, default 0 if names parameter not specified, otherwise None
+header : int or list of ints, default 0 if names parameter not specified, otherwise None
     Row to use for the column labels of the parsed DataFrame. Specify None if
-    there is no header row.
+    there is no header row. Can be a list of integers that specify row
+    locations for a multi-index on the columns, e.g. [0,1,3]. Intervening
+    rows that are not specified (e.g. 2 in this example) are skipped.
 skiprows : list-like or integer
     Row numbers to skip (0-indexed) or number of rows to skip (int)
     at the start of the file
@@ -531,6 +533,16 @@ def __init__(self, f, engine='python', **kwds):
         if kwds.get('header', 'infer') == 'infer':
             kwds['header'] = 0 if kwds.get('names') is None else None
 
+        # validate options that conflict with a multi-index (list-like) header
+        h = kwds['header']
+        if isinstance(h,(list,tuple,np.ndarray)):
+            if kwds.get('index_col') is None:
+                raise Exception("must have an index_col when have a "
+                                "multi-index header is specified")
+            if kwds.get('as_recarray'):
+                raise Exception("cannot specify as_recarray when "
+                                "specifying a multi-index header")
+
         self.orig_options = kwds
 
         # miscellanea
@@ -965,7 +977,8 @@ def extract(r):
                     self.col_names = [ r[0] if len(r[0]) else None for r in header ]
                     passed_names = True
                 else:
-                    raise Exception("must have an index_col when have a multi-index specified")
+                    raise Exception("must have an index_col when have a multi-index "
+                                    "header is specified")
             else:
                 self.names = list(self._reader.header[0])
 
@@ -1381,6 +1394,9 @@ def _infer_columns(self):
         names = self.names
 
         if self.header is not None:
+            if isinstance(self.header,(list,tuple,np.ndarray)):
+                raise Exception("PythonParser does not support a multi-index header")
+
             if len(self.buf) > 0:
                 line = self.buf[0]
             else:
diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py
@@ -20,6 +20,7 @@
                                TextFileReader, TextParser)
 from pandas.util.testing import (assert_almost_equal,
                                  assert_series_equal,
+                                 makeCustomDataframe as mkdf,
                                  network,
                                  ensure_clean)
 import pandas.util.testing as tm
@@ -994,6 +995,38 @@ def test_header_not_first_line(self):
         expected = self.read_csv(StringIO(data2), header=0, index_col=0)
         tm.assert_frame_equal(df, expected)
 
+    def test_header_multi_index(self):
+        expected = mkdf(5,3,r_idx_nlevels=2,c_idx_nlevels=4)
+
+        data = """\
+C0,,C_l0_g0,C_l0_g1,C_l0_g2
+
+C1,,C_l1_g0,C_l1_g1,C_l1_g2
+C2,,C_l2_g0,C_l2_g1,C_l2_g2
+C3,,C_l3_g0,C_l3_g1,C_l3_g2
+R0,R1,,,
+R_l0_g0,R_l1_g0,R0C0,R0C1,R0C2
+R_l0_g1,R_l1_g1,R1C0,R1C1,R1C2
+R_l0_g2,R_l1_g2,R2C0,R2C1,R2C2
+R_l0_g3,R_l1_g3,R3C0,R3C1,R3C2
+R_l0_g4,R_l1_g4,R4C0,R4C1,R4C2
+"""
+
+        # python-engine
+        self.assertRaises(Exception, read_csv, StringIO(data), header=[0,1,2,3], 
+                          index_col=[0,1], engine='python')
+
+        # must specify index_col
+        self.assertRaises(Exception, read_csv, StringIO(data), header=[0,1,2,3])
+
+        # no as_recarray
+        self.assertRaises(Exception, read_csv, StringIO(data), header=[0,1,2,3], 
+                          index_col=[0,1], as_recarray=True)
+
+        # skipping lines in the header
+        df = read_csv(StringIO(data), header=[0,2,3,4],index_col=[0,1])
+        tm.assert_frame_equal(df, expected)
+
     def test_pass_names_with_index(self):
         lines = self.data1.split('\n')
         no_header = '\n'.join(lines[1:])
diff --git a/pandas/src/parser.pyx b/pandas/src/parser.pyx
@@ -1789,6 +1789,9 @@ def _to_structured_array(dict columns, object names):
 
     if names is None:
         names = ['%d' % i for i in range(len(columns))]
+    else:
+        # single line header
+        names = names[0]
 
     dt = np.dtype([(str(name), columns[i].dtype)
                    for i, name in enumerate(names)])