Merge pull request #5268 from guyrt/issue-5156-segfault

jreback · jreback · commit 6f6b0dfb01cf · 2013-10-19T14:13:49.000-07:00
BUG: Fixed issue #5156: segfault on read_csv
diff --git a/doc/source/release.rst b/doc/source/release.rst
@@ -183,6 +183,7 @@ Improvements to existing features
   - ``Series`` now supports a ``to_frame`` method to convert it to a single-column DataFrame (:issue:`5164`)
   - DatetimeIndex (and date_range) can now be constructed in a left- or
     right-open fashion using the ``closed`` parameter (:issue:`4579`)
+  - Python csv parser now supports ``usecols`` (:issue:`4335`)
 
 API Changes
 ~~~~~~~~~~~
@@ -625,6 +626,8 @@ Bug Fixes
   - Fixed bug in Excel writers where frames with duplicate column names weren't
     written correctly. (:issue:`5235`)
   - Fixed issue with ``drop`` and a non-unique index on Series (:issue:`5248`)
+  - Fixed segfault in the C parser caused by passing more ``names`` than there
+    are columns in the file. (:issue:`5156`)
 
 pandas 0.12.0
 -------------
diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py
@@ -1955,6 +1955,15 @@ def test_integer_overflow_bug(self):
         result = self.read_csv(StringIO(data), header=None, sep='\s+')
         self.assertTrue(result[0].dtype == np.float64)
 
+    def test_catch_too_many_names(self):
+        # Issue 5156: passing more names than the file has columns used to
+        # segfault in the C parser; read_csv must raise instead.
+        # Every line below has three fields, while four names are supplied.
+        data = """\
+1,2,3
+4,,6
+7,8,9
+10,11,12\n"""
+        # NOTE(review): this calls the module-level read_csv rather than
+        # self.read_csv (used by the neighbouring tests), and accepts any
+        # Exception subclass -- confirm both are intentional.
+        tm.assertRaises(Exception, read_csv, StringIO(data), header=0, names=['a', 'b', 'c', 'd'])
+
 
 class TestPythonParser(ParserTests, unittest.TestCase):
     def test_negative_skipfooter_raises(self):
diff --git a/pandas/parser.pyx b/pandas/parser.pyx
@@ -801,7 +801,6 @@ cdef class TextReader:
             raise StopIteration
         self._end_clock('Tokenization')
 
-
         self._start_clock()
         columns = self._convert_column_data(rows=rows,
                                             footer=footer,
@@ -840,11 +839,12 @@ cdef class TextReader:
 
     def _convert_column_data(self, rows=None, upcast_na=False, footer=0):
         cdef:
-            Py_ssize_t i, nused, ncols
+            Py_ssize_t i, nused
             kh_str_t *na_hashset = NULL
             int start, end
             object name, na_flist
             bint na_filter = 0
+            Py_ssize_t num_cols
 
         start = self.parser_start
 
@@ -857,6 +857,22 @@ cdef class TextReader:
         # if footer > 0:
         #     end -= footer
 
+        #print >> sys.stderr, self.table_width
+        #print >> sys.stderr, self.leading_cols
+        #print >> sys.stderr, self.parser.lines
+        #print >> sys.stderr, start
+        #print >> sys.stderr, end
+        #print >> sys.stderr, self.header
+        #print >> sys.stderr, "index"
+        num_cols = -1
+        for i in range(self.parser.lines):
+            num_cols = (num_cols < self.parser.line_fields[i]) * self.parser.line_fields[i] +\
+                (num_cols >= self.parser.line_fields[i]) * num_cols
+
+        if self.table_width - self.leading_cols > num_cols:
+            raise CParserError("Too many columns specified: expected %s and found %s" %
+                (self.table_width - self.leading_cols, num_cols))
+
         results = {}
         nused = 0
         for i in range(self.table_width):
@@ -1446,7 +1462,6 @@ cdef _try_int64(parser_t *parser, int col, int line_start, int line_end,
     if na_filter:
         for i in range(lines):
             word = COLITER_NEXT(it)
-
             k = kh_get_str(na_hashset, word)
             # in the hash table
             if k != na_hashset.n_buckets:
@@ -1828,16 +1843,6 @@ cdef _apply_converter(object f, parser_t *parser, int col,
 
     return lib.maybe_convert_objects(result)
 
-    # if issubclass(values.dtype.type, (np.number, np.bool_)):
-    #     return values
-
-    # # XXX
-    # na_values = set([''])
-    # try:
-    #     return lib.maybe_convert_numeric(values, na_values, False)
-    # except Exception:
-    #     na_count = lib.sanitize_objects(values, na_values, False)
-    #     return result
 
 def _to_structured_array(dict columns, object names):
     cdef:
diff --git a/pandas/src/parser/tokenizer.c b/pandas/src/parser/tokenizer.c
@@ -709,7 +709,6 @@ int tokenize_delimited(parser_t *self, size_t line_limit)
             if (c == '\n') {
                 END_FIELD();
                 END_LINE();
-                /* self->state = START_RECORD; */
             } else if (c == '\r') {
                 END_FIELD();
                 self->state = EAT_CRNL;
diff --git a/pandas/src/parser/tokenizer.h b/pandas/src/parser/tokenizer.h
@@ -161,7 +161,7 @@ typedef struct parser_t {
 
     int *line_start;      // position in words for start of line
     int *line_fields;     // Number of fields in each line
-    int lines;            // Number of (good) lines observedb
+    int lines;            // Number of (good) lines observed
     int file_lines;       // Number of file lines observed (including bad or skipped)
     int lines_cap;        // Vector capacity