BUG: Respect 'usecols' parameter even when CSV rows are uneven

gfyoung · jreback · commit e55875e57535 · 2016-03-20T10:58:53.000-04:00
Closes #12203 by overriding the row alignment checks for both engines when the `usecols` parameter is passed into `read_csv`. Author: gfyoung <gfyoung17@gmail.com> Closes #12551 from gfyoung/usecol_long_lines and squashes the following commits: d3824dc [gfyoung] BUG: Respect 'usecols' parameter even when CSV rows are uneven
diff --git a/doc/source/whatsnew/v0.18.1.txt b/doc/source/whatsnew/v0.18.1.txt
@@ -55,6 +55,12 @@ API changes
 
 
 
+- ``CParserError`` is now a ``ValueError`` instead of just an ``Exception`` (:issue:`12551`)
+
+
+
+
+
 
 
 
@@ -95,6 +101,7 @@ Performance Improvements
 
 Bug Fixes
 ~~~~~~~~~
+- ``usecols`` parameter in ``pd.read_csv`` is now respected even when the lines of a CSV file have differing numbers of fields (:issue:`12203`)
 
 - Bug in ``Period`` and ``PeriodIndex`` creation raises ``KeyError`` if ``freq="Minute"`` is specified. Note that "Minute" freq is deprecated in v0.17.0, and recommended to use ``freq="T"`` instead (:issue:`11854`)
 - Bug in printing data which contains ``Period`` with different ``freq`` raises ``ValueError`` (:issue:`12615`)
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
@@ -1984,7 +1984,9 @@ def _rows_to_cols(self, content):
             raise ValueError('skip footer cannot be negative')
 
         # Loop through rows to verify lengths are correct.
-        if col_len != zip_len and self.index_col is not False:
+        if (col_len != zip_len and
+                self.index_col is not False and
+                self.usecols is None):
             i = 0
             for (i, l) in enumerate(content):
                 if len(l) != col_len:
diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py
@@ -2664,6 +2664,37 @@ def test_empty_header_read(count):
         for count in range(1, 101):
             test_empty_header_read(count)
 
+    def test_uneven_lines_with_usecols(self):
+        # See gh-12203: 'usecols' is honored even if a row has extra fields
+        csv = r"""a,b,c
+        0,1,2
+        3,4,5,6,7
+        8,9,10
+        """
+
+        # without 'usecols', the row with extra fields must still raise;
+        # the pattern is a raw string so that '\d' reaches the regex engine
+        msg = r"Expected \d+ fields in line \d+, saw \d+"
+        with tm.assertRaisesRegexp(ValueError, msg):
+            self.read_csv(StringIO(csv))
+
+        expected = DataFrame({
+            'a': [0, 3, 8],
+            'b': [1, 4, 9]
+        })
+
+        usecols = [0, 1]
+        df = self.read_csv(StringIO(csv), usecols=usecols)
+        tm.assert_frame_equal(df, expected)
+
+        usecols = ['a', 1]
+        df = self.read_csv(StringIO(csv), usecols=usecols)
+        tm.assert_frame_equal(df, expected)
+
+        usecols = ['a', 'b']
+        df = self.read_csv(StringIO(csv), usecols=usecols)
+        tm.assert_frame_equal(df, expected)
+
 
 class TestPythonParser(ParserTests, tm.TestCase):
 
diff --git a/pandas/parser.pyx b/pandas/parser.pyx
@@ -143,6 +143,8 @@ cdef extern from "parser/tokenizer.h":
         int allow_embedded_newline
         int strict                 # raise exception on bad CSV */
 
+        int usecols
+
         int expected_fields
         int error_bad_lines
         int warn_bad_lines
@@ -350,6 +352,8 @@ cdef class TextReader:
         self.compression = compression
         self.memory_map = memory_map
 
+        self.parser.usecols = (usecols is not None)
+
         self._setup_parser_source(source)
         parser_set_default_options(self.parser)
 
@@ -1208,7 +1212,7 @@ cdef class TextReader:
             else:
                 return None
 
-class CParserError(Exception):
+class CParserError(ValueError):
     pass
 
 
diff --git a/pandas/src/parser/tokenizer.c b/pandas/src/parser/tokenizer.c
@@ -494,7 +494,8 @@ static int end_line(parser_t *self) {
     /* printf("Line: %d, Fields: %d, Ex-fields: %d\n", self->lines, fields, ex_fields); */
 
     if (!(self->lines <= self->header_end + 1)
-        && (self->expected_fields < 0 && fields > ex_fields)) {
+        && (self->expected_fields < 0 && fields > ex_fields)
+        && !(self->usecols)) {
         // increment file line count
         self->file_lines++;
 
diff --git a/pandas/src/parser/tokenizer.h b/pandas/src/parser/tokenizer.h
@@ -184,6 +184,8 @@ typedef struct parser_t {
     int allow_embedded_newline;
     int strict;                 /* raise exception on bad CSV */
 
+    int usecols; // Boolean: 1: usecols provided, 0: none provided
+
     int expected_fields;
     int error_bad_lines;
     int warn_bad_lines;