BUG: fix incorrect parser-tokenizer state in trailing-field handling; the line-limit goto skipped refreshing stream/slen after end_line. Closes #2668

wesm · wesm · commit e05a3d064164 · 2013-01-19T20:29:39.000-05:00
diff --git a/RELEASE.rst b/RELEASE.rst
@@ -70,6 +70,7 @@ pandas 0.10.1
     - handle correctly ``Term`` passed types (e.g. ``index<1000``, when index
       is ``Int64``), (closes GH512_)
     - handle Timestamp correctly in data_columns (closes GH2637_)
+
   - Fix DataFrame.info bug with UTF8-encoded columns. (GH2576_)
   - Fix DatetimeIndex handling of FixedOffset tz (GH2604_)
   - More robust detection of being in IPython session for wide DataFrame
@@ -86,6 +87,7 @@ pandas 0.10.1
   - Don't bork Series containing datetime64 values with to_datetime (GH2699_)
   - Fix DataFrame.from_records corner case when passed columns, index column,
     but empty record list (GH2633_)
+  - Fix C parser-tokenizer bug with trailing fields. (GH2668_)
 
 **API Changes**
 
@@ -108,6 +110,7 @@ pandas 0.10.1
 .. _GH2631: https://github.com/pydata/pandas/issues/2631
 .. _GH2633: https://github.com/pydata/pandas/issues/2633
 .. _GH2637: https://github.com/pydata/pandas/issues/2637
+.. _GH2668: https://github.com/pydata/pandas/issues/2668
 .. _GH2690: https://github.com/pydata/pandas/issues/2690
 .. _GH2692: https://github.com/pydata/pandas/issues/2692
 .. _GH2699: https://github.com/pydata/pandas/issues/2699
diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py
@@ -2017,6 +2017,15 @@ def test_raise_on_passed_int_dtype_with_nas(self):
                           skipinitialspace=True,
                           dtype={'DOY': np.int64})
 
+    def test_na_trailing_columns(self):
+        data = """Date,Currenncy,Symbol,Type,Units,UnitPrice,Cost,Tax
+2012-03-14,USD,AAPL,BUY,1000
+2012-05-12,USD,SBUX,SELL,500"""
+
+        result = self.read_csv(StringIO(data))
+        self.assertEquals(result['Date'][1], '2012-05-12')
+        self.assertTrue(result['UnitPrice'].isnull().all())
+
 
 class TestParseSQL(unittest.TestCase):
 
diff --git a/pandas/src/parser/tokenizer.c b/pandas/src/parser/tokenizer.c
@@ -494,6 +494,7 @@ static int end_line(parser_t *self) {
         if (self->lines >= self->header + 1 && self->lines > 0) {
             while (fields < ex_fields){
                 end_field(self);
+                /* printf("Prior word: %s\n", self->words[self->words_len - 2]); */
                 fields++;
             }
         }
@@ -503,6 +504,10 @@ static int end_line(parser_t *self) {
 
         self->lines++;
 
+        /* coliter_t it; */
+        /* coliter_setup(&it, self, 5, self->lines - 1); */
+        /* printf("word at column 5: %s\n", COLITER_NEXT(it)); */
+
         // good line, set new start point
         self->line_start[self->lines] = (self->line_start[self->lines - 1] +
                                          fields);
@@ -601,13 +606,13 @@ static int parser_buffer_bytes(parser_t *self, size_t nbytes) {
     if (end_line(self) < 0) {                                           \
         goto parsingerror;                                              \
     }                                                                   \
+    stream = self->stream + self->stream_len;                           \
+    slen = self->stream_len;                                            \
     self->state = STATE;                                                \
     if (line_limit > 0 && self->lines == start_lines + line_limit) {    \
         goto linelimit;                                                 \
                                                                         \
-    }                                                                   \
-    stream = self->stream + self->stream_len;                           \
-    slen = self->stream_len;
+    }
 
 #define END_LINE_AND_FIELD_STATE(STATE)                                 \
     self->stream_len = slen;                                            \
diff --git a/pandas/src/parser/tokenizer.h b/pandas/src/parser/tokenizer.h
@@ -88,7 +88,7 @@ See LICENSE for the license
 #define ERROR_NO_DATA                  23
 
 
-/* #define VERBOSE */
+// #define VERBOSE
 
 #if defined(VERBOSE)
 #define TRACE(X) printf X;