ENH: handle ragged CSV files nicely when specifying an explicit list of column names. Closes #2981

wesm · wesm · commit 900a552f92aa · 2013-03-29T11:32:00.000-04:00
diff --git a/RELEASE.rst b/RELEASE.rst
@@ -119,6 +119,9 @@ pandas 0.11.0
   - Improved performance across several core functions by taking memory
     ordering of arrays into account. Courtesy of @stephenwlin (GH3130_)
   - Improved performance of groupby transform method (GH2121_)
+  - Handle "ragged" CSV files whose rows omit trailing delimiters for missing
+    fields when an explicit list of column names is also provided (so the
+    parser knows how many columns to expect in the result) (GH2981_)
 
 **API Changes**
 
@@ -304,6 +307,7 @@ pandas 0.11.0
 .. _GH2932: https://github.com/pydata/pandas/issues/2932
 .. _GH2973: https://github.com/pydata/pandas/issues/2973
 .. _GH2967: https://github.com/pydata/pandas/issues/2967
+.. _GH2981: https://github.com/pydata/pandas/issues/2981
 .. _GH2982: https://github.com/pydata/pandas/issues/2982
 .. _GH2989: https://github.com/pydata/pandas/issues/2989
 .. _GH2993: https://github.com/pydata/pandas/issues/2993
diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py
@@ -1548,6 +1548,26 @@ def test_int64_min_issues(self):
 
         tm.assert_frame_equal(result, expected)
 
+    def test_parse_ragged_csv(self):
+        data = """1,2,3
+1,2,3,4
+1,2,3,4,5
+1,2
+1,2,3,4"""
+
+        nice_data = """1,2,3,,
+1,2,3,4,
+1,2,3,4,5
+1,2,,,
+1,2,3,4,"""
+        result = self.read_csv(StringIO(data), header=None,
+                               names=['a', 'b', 'c', 'd', 'e'])
+
+        expected = self.read_csv(StringIO(nice_data), header=None,
+                                 names=['a', 'b', 'c', 'd', 'e'])
+
+        tm.assert_frame_equal(result, expected)
+
 
 class TestPythonParser(ParserTests, unittest.TestCase):
 
diff --git a/pandas/src/parser.pyx b/pandas/src/parser.pyx
@@ -131,6 +131,7 @@ cdef extern from "parser/tokenizer.h":
         int allow_embedded_newline
         int strict                 # raise exception on bad CSV */
 
+        int expected_fields
         int error_bad_lines
         int warn_bad_lines
 
@@ -583,6 +584,10 @@ cdef class TextReader:
                 header = self.names
 
         elif self.names is not None:
+            # Tell the tokenizer to expect len(names) fields per row, unless usecols is set
+            if not self.has_usecols:
+                self.parser.expected_fields = len(self.names)
+
             # Names passed
             if self.parser.lines < 1:
                 self._tokenize_rows(1)
@@ -605,14 +610,19 @@ cdef class TextReader:
         if self.parser.lines < data_line + 1:
             field_count = len(header)
         else: # not self.has_usecols:
+
             field_count = self.parser.line_fields[data_line]
 
+            # #2981
+            if self.names is not None:
+                field_count = max(field_count, len(self.names))
+
             passed_count = len(header)
 
-            if passed_count > field_count:
-                raise CParserError('Column names have %d fields, '
-                                   'data has %d fields'
-                                   % (passed_count, field_count))
+            # if passed_count > field_count:
+            #     raise CParserError('Column names have %d fields, '
+            #                        'data has %d fields'
+            #                        % (passed_count, field_count))
 
             if self.has_usecols:
                 nuse = len(self.usecols)
@@ -623,8 +633,8 @@ cdef class TextReader:
                 elif passed_count != field_count:
                     raise ValueError('Passed header names '
                                      'mismatches usecols')
-            # oh boy, #2442
-            elif self.allow_leading_cols:
+            # oh boy, #2442, #2981
+            elif self.allow_leading_cols and passed_count < field_count:
                 self.leading_cols = field_count - passed_count
 
         return header, field_count
diff --git a/pandas/src/parser/tokenizer.c b/pandas/src/parser/tokenizer.c
@@ -148,6 +148,7 @@ void parser_set_default_options(parser_t *self) {
     self->allow_embedded_newline = 1;
     self->strict = 0;
 
+    self->expected_fields = -1;
     self->error_bad_lines = 0;
     self->warn_bad_lines = 0;
 
@@ -428,16 +429,19 @@ static void append_warning(parser_t *self, const char *msg) {
 static int end_line(parser_t *self) {
     int fields;
     khiter_t k;  /* for hash set detection */
-    int ex_fields = -1;
+    int ex_fields = self->expected_fields;
     char *msg;
 
     fields = self->line_fields[self->lines];
 
     TRACE(("Line end, nfields: %d\n", fields));
 
-
     if (self->lines > 0) {
-        ex_fields = self->line_fields[self->lines - 1];
+        if (self->expected_fields >= 0) {
+            ex_fields = self->expected_fields;
+        } else {
+            ex_fields = self->line_fields[self->lines - 1];
+        }
     }
 
     if (self->skipset != NULL) {
@@ -457,7 +461,10 @@ static int end_line(parser_t *self) {
         }
     }
 
-    if (!(self->lines <= self->header + 1) && fields > ex_fields) {
+    /* printf("Line: %d, Fields: %d, Ex-fields: %d\n", self->lines, fields, ex_fields); */
+
+    if (!(self->lines <= self->header + 1)
+        && (self->expected_fields < 0 && fields > ex_fields)) {
         // increment file line count
         self->file_lines++;
 
@@ -491,7 +498,7 @@ static int end_line(parser_t *self) {
     }
     else {
         /* missing trailing delimiters */
-        if (self->lines >= self->header + 1 && self->lines > 0) {
+        if (self->lines >= self->header + 1) {
             while (fields < ex_fields){
                 end_field(self);
                 /* printf("Prior word: %s\n", self->words[self->words_len - 2]); */
diff --git a/pandas/src/parser/tokenizer.h b/pandas/src/parser/tokenizer.h
@@ -183,6 +183,7 @@ typedef struct parser_t {
     int allow_embedded_newline;
     int strict;                 /* raise exception on bad CSV */
 
+    int expected_fields;
     int error_bad_lines;
     int warn_bad_lines;