From 4038cfb12fd0d62b0f64d961c3416460e22e6eec Mon Sep 17 00:00:00 2001 From: "Richard T. Guy" Date: Mon, 30 Sep 2013 21:30:34 -0400 Subject: [PATCH] ENH: Add usecols option to python parser. Closes #4335 Added release note and fixed py3 compat Updated docs for consistency --- doc/source/io.rst | 8 +- doc/source/v0.13.0.txt | 16 +-- pandas/io/parsers.py | 209 +++++++++++++++++++++----------- pandas/io/tests/test_parsers.py | 128 +++++++++---------- 4 files changed, 215 insertions(+), 146 deletions(-) diff --git a/doc/source/io.rst b/doc/source/io.rst index e75de91582b49..37227edc83fe2 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -85,11 +85,11 @@ They can take a number of arguments: ways to specify the file format - ``dtype``: A data type name or a dict of column name to data type. If not specified, data types will be inferred. - - ``header``: row number to use as the column names, and the start of the + - ``header``: row number(s) to use as the column names, and the start of the data. Defaults to 0 if no ``names`` passed, otherwise ``None``. Explicitly pass ``header=0`` to be able to replace existing names. The header can be a list of integers that specify row locations for a multi-index on the columns - E.g. [0,1,3]. Interveaning rows that are not specified will be skipped. + E.g. [0,1,3]. Intervening rows that are not specified will be skipped. (E.g. 2 in this example are skipped) - ``skiprows``: A collection of numbers for rows in the file to skip. Can also be an integer to skip the first ``n`` rows @@ -2938,7 +2938,7 @@ into BigQuery and pull it into a DataFrame. .. code-block:: python from pandas.io import gbq - + # Insert your BigQuery Project ID Here # Can be found in the web console, or # using the command line tool `bq ls` @@ -2998,7 +2998,7 @@ To add more rows to this, simply: To use this module, you will need a BigQuery account. See for details. - + As of 10/10/13, there is a bug in Google's API preventing result sets from being larger than 100,000 rows. A patch is scheduled for the week of 10/14/13. diff --git a/doc/source/v0.13.0.txt b/doc/source/v0.13.0.txt index b2c78f38140b4..603cffcc1b76b 100644 --- a/doc/source/v0.13.0.txt +++ b/doc/source/v0.13.0.txt @@ -505,11 +505,13 @@ Enhancements - :meth:`~pandas.io.json.json_normalize` is a new method to allow you to create a flat table from semi-structured JSON data. See :ref:`the docs` (:issue:`1067`) - - Added PySide support for the qtpandas DataFrameModel and DataFrameWidget. +- Python csv parser now supports usecols (:issue:`4335`) + - DataFrame has a new ``interpolate`` method, similar to Series (:issue:`4434`, :issue:`1892`) + .. ipython:: python df = DataFrame({'A': [1, 2.1, np.nan, 4.7, 5.6, 6.8], @@ -654,7 +656,7 @@ Experimental against extremely large datasets. :ref:`See the docs ` .. 
code-block:: python
-        
+
        from pandas.io import gbq

        # A query to select the average monthly temperatures in the
@@ -665,8 +667,8 @@ Experimental
        query = """SELECT station_number as STATION,
                          month as MONTH, AVG(mean_temp) as MEAN_TEMP
                   FROM publicdata:samples.gsod
-                  WHERE YEAR = 2000 
-                  GROUP BY STATION, MONTH 
+                  WHERE YEAR = 2000
+                  GROUP BY STATION, MONTH
                   ORDER BY STATION, MONTH ASC"""

        # Fetch the result set for this query
@@ -675,7 +677,7 @@ Experimental
        # To find this, see your dashboard:
        # https://code.google.com/apis/console/b/0/?noredirect
        projectid = xxxxxxxxx;
-        
+
        df = gbq.read_gbq(query, project_id = projectid)

        # Use pandas to process and reshape the dataset
@@ -686,9 +688,9 @@ Experimental

    The resulting dataframe is::

-       > df3 
+       > df3
                   Min Tem  Mean Temp    Max Temp
-       MONTH                                     
+       MONTH
        1        -53.336667  39.827892   89.770968
        2        -49.837500  43.685219   93.437932
        3        -77.926087  48.708355   96.099998
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
index 76d6a3909f89f..e9e82824326a7 100644
--- a/pandas/io/parsers.py
+++ b/pandas/io/parsers.py
@@ -52,11 +52,12 @@
 dialect : string or csv.Dialect instance, default None
     If None defaults to Excel dialect. Ignored if sep longer than 1 char
     See csv.Dialect documentation for more details
-header : int, default 0 if names parameter not specified,
-    Row to use for the column labels of the parsed DataFrame. Specify None if
-    there is no header row. Can be a list of integers that specify row
-    locations for a multi-index on the columns E.g. [0,1,3]. Interveaning
-    rows that are not specified (E.g. 2 in this example are skipped)
+header : int or list of ints, default 0 if no ``names`` passed, otherwise None
+    Row number(s) to use as the column names, and the start of the data.
+    Explicitly pass ``header=0`` to be able to replace existing names. The
+    header can be a list of integers that specify row locations for a
+    multi-index on the columns, e.g. [0,1,3]. Intervening rows that are not
+    specified will be skipped (e.g. 2 in this example is skipped).
 skiprows : list-like or integer
     Row numbers to skip (0-indexed) or number of rows to skip (int)
     at the start of the file
@@ -917,22 +918,6 @@ def _do_date_conversions(self, names, data):
 
         return names, data
 
-    def _exclude_implicit_index(self, alldata):
-
-        if self._implicit_index:
-            excl_indices = self.index_col
-
-            data = {}
-            offset = 0
-            for i, col in enumerate(self.orig_names):
-                while i + offset in excl_indices:
-                    offset += 1
-                data[col] = alldata[i + offset]
-        else:
-            data = dict((k, v) for k, v in zip(self.orig_names, alldata))
-
-        return data
-
 
 class CParserWrapper(ParserBase):
     """
@@ -1173,22 +1158,6 @@ def TextParser(*args, **kwds):
     return TextFileReader(*args, **kwds)
 
 
-# delimiter=None, dialect=None, names=None, header=0,
-# index_col=None,
-# na_values=None,
-# na_filter=True,
-# thousands=None,
-# quotechar='"',
-# escapechar=None,
-# doublequote=True,
-# skipinitialspace=False,
-# quoting=csv.QUOTE_MINIMAL,
-# comment=None, parse_dates=False, keep_date_col=False,
-# date_parser=None, dayfirst=False,
-# chunksize=None, skiprows=None, skip_footer=0, converters=None,
-# verbose=False, encoding=None, squeeze=False):
-
-
 def count_empty_vals(vals):
     return sum([1 for v in vals if v == '' or v is None])
 
@@ -1242,10 +1211,6 @@ def __init__(self, f, **kwds):
         self.buf = []
         self.pos = 0
 
-        if kwds['usecols'] is not None:
-            raise Exception("usecols not supported with engine='python'"
-                            " or multicharacter separators (yet).")
-
         self.encoding = kwds['encoding']
         self.compression = kwds['compression']
         self.skiprows = kwds['skiprows']
@@ -1259,7 +1224,10 @@ def __init__(self, f, **kwds):
         self.skipinitialspace = kwds['skipinitialspace']
         self.lineterminator = kwds['lineterminator']
         self.quoting = kwds['quoting']
-        self.mangle_dupe_cols = kwds.get('mangle_dupe_cols',True)
+        self.mangle_dupe_cols = kwds.get('mangle_dupe_cols', True)
+        self.usecols = kwds['usecols']
+
+        self.names_passed = kwds['names'] or None
 
         self.has_index_names = False
         if 'has_index_names' in kwds:
@@ -1283,17 +1251,25 @@ def __init__(self, f, **kwds):
 
             f = TextIOWrapper(f, encoding=self.encoding)
 
+        # Set self.data to something that can read lines.
        if hasattr(f, 'readline'):
            self._make_reader(f)
        else:
            self.data = f
-        self.columns = self._infer_columns()
 
+        # Get columns in two steps: infer them from the data, then
+        # infer the column indices from self.usecols if it is specified.
+        self._col_indices = None
+        self.columns, self.num_original_columns = self._infer_columns()
 
-        # we are processing a multi index column
+        # Now self.columns holds the set of columns we will process;
+        # the original column count is kept in self.num_original_columns.
        if len(self.columns) > 1:
+            # we are processing a multi index column
            self.columns, self.index_names, self.col_names, _ = self._extract_multi_indexer_columns(
                self.columns, self.index_names, self.col_names)
+            # Update list of original names to include all indices.
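+            # num_original_columns is consulted later by _get_index_name and
+            # _rows_to_cols to validate row lengths after usecols has
+            # trimmed self.columns.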
+            self.num_original_columns = len(self.columns)
        else:
            self.columns = self.columns[0]
 
@@ -1304,7 +1280,7 @@ def __init__(self, f, **kwds):
         # multiple date column thing turning into a real spaghetti factory
         if not self._has_complex_date_col:
             (index_names,
-             self.orig_names, _) = self._get_index_name(self.columns)
+             self.orig_names, columns_) = self._get_index_name(self.columns)
             self._name_processed = True
             if self.index_names is None:
                 self.index_names = index_names
@@ -1442,6 +1418,22 @@ def read(self, rows=None):
 
         return index, columns, data
 
+    def _exclude_implicit_index(self, alldata):
+
+        if self._implicit_index:
+            excl_indices = self.index_col
+
+            data = {}
+            offset = 0
+            for i, col in enumerate(self.orig_names):
+                while i + offset in excl_indices:
+                    offset += 1
+                data[col] = alldata[i + offset]
+        else:
+            data = dict((k, v) for k, v in zip(self.orig_names, alldata))
+
+        return data
+
     # legacy
     def get_chunk(self, size=None):
         if size is None:
@@ -1462,7 +1454,7 @@ def _convert_data(self, data):
 
     def _infer_columns(self):
         names = self.names
-
+        num_original_columns = 0
         if self.header is not None:
             header = self.header
 
@@ -1476,10 +1468,7 @@ def _infer_columns(self):
 
             columns = []
             for level, hr in enumerate(header):
-                if len(self.buf) > 0:
-                    line = self.buf[0]
-                else:
-                    line = self._next_line()
+                line = self._buffered_line()
 
                 while self.pos <= hr:
                     line = self._next_line()
@@ -1488,51 +1477,103 @@ def _infer_columns(self):
                 for i, c in enumerate(line):
                     if c == '':
                         if have_mi_columns:
-                            this_columns.append('Unnamed: %d_level_%d' % (i,level))
+                            this_columns.append('Unnamed: %d_level_%d' % (i, level))
                         else:
                             this_columns.append('Unnamed: %d' % i)
                     else:
                         this_columns.append(c)
 
-                if not have_mi_columns:
-                    if self.mangle_dupe_cols:
-                        counts = {}
-                        for i, col in enumerate(this_columns):
-                            cur_count = counts.get(col, 0)
-                            if cur_count > 0:
-                                this_columns[i] = '%s.%d' % (col, cur_count)
-                            counts[col] = cur_count + 1
+                if not have_mi_columns and self.mangle_dupe_cols:
+                    counts = {}
+                    for i, col in enumerate(this_columns):
+                        cur_count = counts.get(col, 0)
+                        if cur_count > 0:
+                            this_columns[i] = '%s.%d' % (col, cur_count)
+                        counts[col] = cur_count + 1
 
                 columns.append(this_columns)
+                if len(columns) == 1:
+                    num_original_columns = len(this_columns)
 
             self._clear_buffer()
 
             if names is not None:
-                if len(names) != len(columns[0]):
+                if (self.usecols is not None and len(names) != len(self.usecols)) \
+                        or (self.usecols is None and len(names) != len(columns[0])):
+
                     raise ValueError('Number of passed names did not match '
-                                     'number of header fields in the file')
+                                     'number of header fields in the file')
                 if len(columns) > 1:
                     raise TypeError('Cannot pass names with multi-index '
                                     'columns')
-                columns = [ names ]
-            else:
-                if len(self.buf) > 0:
-                    line = self.buf[0]
+                if self.usecols is not None:
+                    # Set self._col_indices. We don't store the trimmed
+                    # columns because the passed names overwrite them below.
+                    self._handle_usecols(columns, names)
+                else:
+                    self._col_indices = None
+                    num_original_columns = len(names)
+                columns = [names]
             else:
-                line = self._next_line()
-
+                columns = self._handle_usecols(columns, columns[0])
+        else:
+            # header is None
+            line = self._buffered_line()
             ncols = len(line)
+            num_original_columns = ncols
             if not names:
                 if self.prefix:
                     columns = [ ['X%d' % i for i in range(ncols)] ]
                 else:
                     columns = [ lrange(ncols) ]
+                columns = self._handle_usecols(columns, columns[0])
             else:
-                columns = [ names ]
+                if self.usecols is None or len(names) == num_original_columns:
+                    columns = self._handle_usecols([names], names)
+                    num_original_columns = len(names)
+                else:
+                    if self.usecols and len(names) != len(self.usecols):
+                        raise ValueError('Number of passed names did not match '
+                                         'number of header fields in the file')
+                    # Ignore the trimmed columns, but record the indices
+                    # of the used columns.
+                    self._handle_usecols([names], names)
+                    columns = [names]
+                    num_original_columns = ncols
+        return columns, num_original_columns
+
+    def _handle_usecols(self, columns, usecols_key):
+        """
+        Sets self._col_indices and returns columns trimmed to the
+        fields named in self.usecols.
+
+        usecols_key is used to resolve string values in usecols.
+        """
+        if self.usecols is not None:
+            if any([isinstance(u, string_types) for u in self.usecols]):
+                if len(columns) > 1:
+                    raise ValueError("If using multiple headers, usecols "
+                                     "must be integers.")
+                col_indices = []
+                for u in self.usecols:
+                    if isinstance(u, string_types):
+                        col_indices.append(usecols_key.index(u))
+                    else:
+                        col_indices.append(u)
+            else:
+                col_indices = self.usecols
+
+            columns = [[n for i, n in enumerate(column) if i in col_indices]
+                       for column in columns]
+            self._col_indices = col_indices
         return columns
 
+    def _buffered_line(self):
+        """
+        Return a line from buffer, filling buffer if required.
+        """
+        if len(self.buf) > 0:
+            return self.buf[0]
+        else:
+            return self._next_line()
+
     def _next_line(self):
         if isinstance(self.data, list):
             while self.pos in self.skiprows:
@@ -1598,6 +1639,17 @@ def _clear_buffer(self):
     _implicit_index = False
 
     def _get_index_name(self, columns):
+        """
+        Try several cases to get the index name(s) and columns:
+
+        0) There are headers on rows 0 and 1, and the sum of their
+           lengths equals the length of the next (data) line.
+           Treat row 0 as columns and row 1 as index names.
+        1) Look for an implicit index: there are more columns
+           on row 1 than on row 0. If so, assume that row 1 lists
+           the index columns and row 0 lists the normal columns.
+        2) Get the index from the columns if it was listed.
+        """
         orig_names = list(columns)
         columns = list(columns)
 
@@ -1615,29 +1667,34 @@ def _get_index_name(self, columns):
             implicit_first_cols = 0
 
         if line is not None:  # leave it 0, #2442
+            # Case 1
             if self.index_col is not False:
-                implicit_first_cols = len(line) - len(columns)
+                implicit_first_cols = len(line) - self.num_original_columns
 
+            # Case 0
             if next_line is not None:
-                if len(next_line) == len(line) + len(columns):
+                if len(next_line) == len(line) + self.num_original_columns:
                     # column and index names on diff rows
-                    implicit_first_cols = 0
-
                     self.index_col = lrange(len(line))
                     self.buf = self.buf[1:]
 
                     for c in reversed(line):
                         columns.insert(0, c)
 
+                    # Update list of original names to include all indices.
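+                    # (The first data row spans the index fields plus the
+                    # named header fields, so its length gives the original
+                    # column count.)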
+                    self.num_original_columns = len(next_line)
                     return line, columns, orig_names
 
         if implicit_first_cols > 0:
+            # Case 1
             self._implicit_index = True
             if self.index_col is None:
                 self.index_col = lrange(implicit_first_cols)
+
             index_name = None
 
         else:
+            # Case 2
             (index_name, columns,
              self.index_col) = _clean_index_names(columns, self.index_col)
 
@@ -1646,7 +1703,7 @@ def _get_index_name(self, columns):
     def _rows_to_cols(self, content):
         zipped_content = list(lib.to_object_array(content).T)
 
-        col_len = len(self.orig_names)
+        col_len = self.num_original_columns
         zip_len = len(zipped_content)
 
         if self._implicit_index:
@@ -1655,6 +1712,7 @@ def _rows_to_cols(self, content):
         if self.skip_footer < 0:
             raise ValueError('skip footer cannot be negative')
 
+        # Loop through rows to verify lengths are correct.
         if col_len != zip_len and self.index_col is not False:
             i = 0
             for (i, l) in enumerate(content):
@@ -1671,6 +1729,11 @@ def _rows_to_cols(self, content):
                        (col_len, row_num + 1, zip_len))
                 raise ValueError(msg)
 
+        if self.usecols:
+            if self._implicit_index:
+                zipped_content = [
+                    a for i, a in enumerate(zipped_content)
+                    if (i < len(self.index_col)
+                        or i - len(self.index_col) in self._col_indices)]
+            else:
+                zipped_content = [a for i, a in enumerate(zipped_content)
+                                  if i in self._col_indices]
         return zipped_content
 
     def _get_lines(self, rows=None):
diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py
index cf0c01c8dff50..b81feec6ab6f8 100644
--- a/pandas/io/tests/test_parsers.py
+++ b/pandas/io/tests/test_parsers.py
@@ -18,7 +18,7 @@
 from pandas.compat import(
     StringIO, BytesIO, PY3, range, long, lrange, lmap, u
 )
-from pandas.io.common import urlopen, URLError
+from pandas.io.common import URLError
 import pandas.io.parsers as parsers
 from pandas.io.parsers import (read_csv, read_table, read_fwf,
                                TextFileReader, TextParser)
@@ -761,8 +761,6 @@ def test_deep_skiprows(self):
         condensed_data = self.read_csv(StringIO(condensed_text))
         tm.assert_frame_equal(data, condensed_data)
 
-
-
     def test_detect_string_na(self):
         data = """A,B
 foo,bar
@@ -1217,14 +1215,11 @@ def test_header_multi_index(self):
 R_l0_g4,R_l1_g4,R4C0,R4C1,R4C2
 """
 
-        # basic test with both engines
-        for engine in ['c','python']:
-            df = read_csv(StringIO(data), header=[0,2,3,4],index_col=[0,1], tupleize_cols=False,
-                          engine=engine)
-            tm.assert_frame_equal(df, expected)
+        df = read_csv(StringIO(data), header=[0, 2, 3, 4], index_col=[0, 1], tupleize_cols=False)
+        tm.assert_frame_equal(df, expected)
 
         # skipping lines in the header
-        df = read_csv(StringIO(data), header=[0,2,3,4],index_col=[0,1], tupleize_cols=False)
+        df = read_csv(StringIO(data), header=[0, 2, 3, 4], index_col=[0, 1], tupleize_cols=False)
         tm.assert_frame_equal(df, expected)
 
         #### invalid options ####
@@ -1825,9 +1820,6 @@ def test_integer_overflow_bug(self):
         result = self.read_csv(StringIO(data), header=None, sep=' ')
         self.assertTrue(result[0].dtype == np.float64)
 
-        result = self.read_csv(StringIO(data), header=None, sep='\s+')
-        self.assertTrue(result[0].dtype == np.float64)
-
     def test_int64_min_issues(self):
         # #2599
         data = 'A,B\n0,0\n0,'
@@ -1908,6 +1900,61 @@ def test_warn_if_chunks_have_mismatched_type(self):
         df = self.read_csv(StringIO(data))
         self.assertEqual(df.a.dtype, np.object)
 
+    def test_usecols(self):
+        data = """\
+a,b,c
+1,2,3
+4,5,6
+7,8,9
+10,11,12"""
+
+        result = self.read_csv(StringIO(data), usecols=(1, 2))
+        result2 = self.read_csv(StringIO(data), usecols=('b', 'c'))
+        exp = self.read_csv(StringIO(data))
+
+        self.assertEquals(len(result.columns), 2)
+
self.assertTrue((result['b'] == exp['b']).all()) + self.assertTrue((result['c'] == exp['c']).all()) + + tm.assert_frame_equal(result, result2) + + result = self.read_csv(StringIO(data), usecols=[1, 2], header=0, + names=['foo', 'bar']) + expected = self.read_csv(StringIO(data), usecols=[1, 2]) + expected.columns = ['foo', 'bar'] + tm.assert_frame_equal(result, expected) + + data = """\ +1,2,3 +4,5,6 +7,8,9 +10,11,12""" + result = self.read_csv(StringIO(data), names=['b', 'c'], + header=None, usecols=[1, 2]) + + expected = self.read_csv(StringIO(data), names=['a', 'b', 'c'], + header=None) + expected = expected[['b', 'c']] + tm.assert_frame_equal(result, expected) + + result2 = self.read_csv(StringIO(data), names=['a', 'b', 'c'], + header=None, usecols=['b', 'c']) + tm.assert_frame_equal(result2, result) + + # length conflict, passed names and usecols disagree + self.assertRaises(ValueError, self.read_csv, StringIO(data), + names=['a', 'b'], usecols=[1], header=None) + + def test_integer_overflow_bug(self): + # #2601 + data = "65248E10 11\n55555E55 22\n" + + result = self.read_csv(StringIO(data), header=None, sep=' ') + self.assertTrue(result[0].dtype == np.float64) + + result = self.read_csv(StringIO(data), header=None, sep='\s+') + self.assertTrue(result[0].dtype == np.float64) + class TestPythonParser(ParserTests, unittest.TestCase): def test_negative_skipfooter_raises(self): @@ -2360,6 +2407,9 @@ def test_parse_dates_empty_string(self): result = pd.read_csv(s, parse_dates=["Date"], na_filter=False) self.assertTrue(result['Date'].isnull()[1]) + def test_usecols(self): + raise nose.SkipTest("Usecols is not supported in C High Memory engine.") + class TestCParserLowMemory(ParserTests, unittest.TestCase): @@ -2406,51 +2456,6 @@ def test_pass_dtype(self): self.assert_(result['one'].dtype == 'u1') self.assert_(result['two'].dtype == 'S1') - def test_usecols(self): - data = """\ -a,b,c -1,2,3 -4,5,6 -7,8,9 -10,11,12""" - - result = self.read_csv(StringIO(data), usecols=(1, 2)) - result2 = self.read_csv(StringIO(data), usecols=('b', 'c')) - exp = self.read_csv(StringIO(data)) - - self.assertEquals(len(result.columns), 2) - self.assertTrue((result['b'] == exp['b']).all()) - self.assertTrue((result['c'] == exp['c']).all()) - - tm.assert_frame_equal(result, result2) - - result = self.read_csv(StringIO(data), usecols=[1, 2], header=0, - names=['foo', 'bar']) - expected = self.read_csv(StringIO(data), usecols=[1, 2]) - expected.columns = ['foo', 'bar'] - tm.assert_frame_equal(result, expected) - - data = """\ -1,2,3 -4,5,6 -7,8,9 -10,11,12""" - result = self.read_csv(StringIO(data), names=['a', 'b', 'c'], - header=None, usecols=[1, 2]) - - expected = self.read_csv(StringIO(data), names=['a', 'b', 'c'], - header=None) - expected = expected[['b', 'c']] - tm.assert_frame_equal(result, expected) - - result2 = self.read_csv(StringIO(data), names=['a', 'b', 'c'], - header=None, usecols=['b', 'c']) - tm.assert_frame_equal(result2, result) - - # length conflict, passed names and usecols disagree - self.assertRaises(ValueError, self.read_csv, StringIO(data), - names=['a', 'b'], usecols=[1], header=None) - def test_usecols_dtypes(self): data = """\ 1,2,3 @@ -2496,12 +2501,11 @@ def test_usecols_regex_sep(self): # #2733 data = 'a b c\n4 apple bat 5.7\n8 orange cow 10' - self.assertRaises(Exception, self.read_csv, StringIO(data), - sep='\s+', usecols=('a', 'b')) + df = self.read_csv(StringIO(data), sep='\s+', usecols=('a', 'b')) - # expected = DataFrame({'a': ['apple', 'orange'], - # 'b': ['bat', 'cow']}, 
index=[4, 8]) - # tm.assert_frame_equal(result, expected) + expected = DataFrame({'a': ['apple', 'orange'], + 'b': ['bat', 'cow']}, index=[4, 8]) + tm.assert_frame_equal(df, expected) def test_pure_python_failover(self): data = "a,b,c\n1,2,3#ignore this!\n4,5,6#ignorethistoo"
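
A minimal sketch of the behavior this patch enables, mirroring the new
``test_usecols`` above (the inline ``data`` string is illustrative only;
``engine='python'`` is passed explicitly to exercise the pure-Python parser):

    import pandas.util.testing as tm
    from pandas.compat import StringIO
    from pandas.io.parsers import read_csv

    data = 'a,b,c\n1,2,3\n4,5,6\n7,8,9'

    # Before this change, both calls raised
    # "usecols not supported with engine='python' ... (yet)."
    by_position = read_csv(StringIO(data), usecols=[1, 2], engine='python')
    by_name = read_csv(StringIO(data), usecols=['b', 'c'], engine='python')

    # Positional and name-based selection pick the same two columns.
    tm.assert_frame_equal(by_position, by_name)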