Skip to content

BUG: memory access bug in read_csv causing segfault #9846

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Apr 12, 2015
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.16.1.txt
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,7 @@ Bug Fixes

- Fixed bug (:issue:`9542`) where labels did not appear properly in legend of ``DataFrame.plot()``. Passing ``label=`` args also now works, and series indices are no longer mutated.
- Bug in json serialization when frame has length zero. (:issue:`9805`)
- Bug in ``read_csv`` where missing trailing delimiters would cause segfault. (:issue:`5664`)


- Bug in ``scatter_matrix`` draws unexpected axis ticklabels (:issue:`5662`)
Expand Down
22 changes: 22 additions & 0 deletions pandas/io/tests/test_cparser.py
Original file line number Diff line number Diff line change
Expand Up @@ -336,6 +336,28 @@ def test_empty_field_eof(self):
2: np.array(['3', ''], dtype=object)}
assert_array_dicts_equal(result, expected)

# GH5664
a = DataFrame([['b'], [nan]], columns=['a'], index=['a', 'c'])
b = DataFrame([[1, 1, 1, 0], [1, 1, 1, 0]],
columns=list('abcd'),
index=[1, 1])
c = DataFrame([[1, 2, 3, 4], [6, nan, nan, nan],
[8, 9, 10, 11], [13, 14, nan, nan]],
columns=list('abcd'),
index=[0, 5, 7, 12])

for _ in range(100):
df = read_csv(StringIO('a,b\nc\n'), skiprows=0,
names=['a'], engine='c')
assert_frame_equal(df, a)

df = read_csv(StringIO('1,1,1,1,0\n'*2 + '\n'*2),
names=list("abcd"), engine='c')
assert_frame_equal(df, b)

df = read_csv(StringIO('0,1,2,3,4\n5,6\n7,8,9,10,11\n12,13,14'),
names=list('abcd'), engine='c')
assert_frame_equal(df, c)

def assert_array_dicts_equal(left, right):
for k, v in compat.iteritems(left):
Expand Down
79 changes: 26 additions & 53 deletions pandas/parser.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -175,7 +175,7 @@ cdef extern from "parser/tokenizer.h":
int col

void coliter_setup(coliter_t *it, parser_t *parser, int i, int start)
char* COLITER_NEXT(coliter_t it)
void COLITER_NEXT(coliter_t, const char *)

parser_t* parser_new()

Expand Down Expand Up @@ -212,7 +212,7 @@ cdef extern from "parser/tokenizer.h":
inline int to_longlong(char *item, long long *p_value)
# inline int to_longlong_thousands(char *item, long long *p_value,
# char tsep)
int to_boolean(char *item, uint8_t *val)
int to_boolean(const char *item, uint8_t *val)


cdef extern from "parser/io.h":
Expand Down Expand Up @@ -1279,7 +1279,7 @@ cdef _string_box_factorize(parser_t *parser, int col,
Py_ssize_t i
size_t lines
coliter_t it
char *word
const char *word = NULL
ndarray[object] result

int ret = 0
Expand All @@ -1296,7 +1296,7 @@ cdef _string_box_factorize(parser_t *parser, int col,
coliter_setup(&it, parser, col, line_start)

for i in range(lines):
word = COLITER_NEXT(it)
COLITER_NEXT(it, word)

if na_filter:
k = kh_get_str(na_hashset, word)
Expand Down Expand Up @@ -1333,7 +1333,7 @@ cdef _string_box_utf8(parser_t *parser, int col,
Py_ssize_t i
size_t lines
coliter_t it
char *word
const char *word = NULL
ndarray[object] result

int ret = 0
Expand All @@ -1350,7 +1350,7 @@ cdef _string_box_utf8(parser_t *parser, int col,
coliter_setup(&it, parser, col, line_start)

for i in range(lines):
word = COLITER_NEXT(it)
COLITER_NEXT(it, word)

if na_filter:
k = kh_get_str(na_hashset, word)
Expand Down Expand Up @@ -1388,7 +1388,7 @@ cdef _string_box_decode(parser_t *parser, int col,
Py_ssize_t i, size
size_t lines
coliter_t it
char *word
const char *word = NULL
ndarray[object] result

int ret = 0
Expand All @@ -1407,7 +1407,7 @@ cdef _string_box_decode(parser_t *parser, int col,
coliter_setup(&it, parser, col, line_start)

for i in range(lines):
word = COLITER_NEXT(it)
COLITER_NEXT(it, word)

if na_filter:
k = kh_get_str(na_hashset, word)
Expand Down Expand Up @@ -1444,7 +1444,7 @@ cdef _to_fw_string(parser_t *parser, int col, int line_start,
int error
Py_ssize_t i, j
coliter_t it
char *word
const char *word = NULL
char *data
ndarray result

Expand All @@ -1454,7 +1454,7 @@ cdef _to_fw_string(parser_t *parser, int col, int line_start,
coliter_setup(&it, parser, col, line_start)

for i in range(line_end - line_start):
word = COLITER_NEXT(it)
COLITER_NEXT(it, word)
strncpy(data, word, width)
data += width

Expand All @@ -1469,7 +1469,7 @@ cdef _try_double(parser_t *parser, int col, int line_start, int line_end,
int error, na_count = 0
size_t i, lines
coliter_t it
char *word
const char *word = NULL
char *p_end
double *data
double NA = na_values[np.float64]
Expand All @@ -1485,7 +1485,7 @@ cdef _try_double(parser_t *parser, int col, int line_start, int line_end,

if na_filter:
for i in range(lines):
word = COLITER_NEXT(it)
COLITER_NEXT(it, word)

k = kh_get_str(na_hashset, word)
# in the hash table
Expand All @@ -1509,7 +1509,7 @@ cdef _try_double(parser_t *parser, int col, int line_start, int line_end,
data += 1
else:
for i in range(lines):
word = COLITER_NEXT(it)
COLITER_NEXT(it, word)
data[0] = parser.converter(word, &p_end, parser.decimal, parser.sci,
parser.thousands, 1)
if errno != 0 or p_end[0] or p_end == word:
Expand All @@ -1530,7 +1530,7 @@ cdef _try_int64(parser_t *parser, int col, int line_start, int line_end,
int error, na_count = 0
size_t i, lines
coliter_t it
char *word
const char *word = NULL
int64_t *data
ndarray result

Expand All @@ -1544,7 +1544,7 @@ cdef _try_int64(parser_t *parser, int col, int line_start, int line_end,

if na_filter:
for i in range(lines):
word = COLITER_NEXT(it)
COLITER_NEXT(it, word)
k = kh_get_str(na_hashset, word)
# in the hash table
if k != na_hashset.n_buckets:
Expand All @@ -1561,7 +1561,7 @@ cdef _try_int64(parser_t *parser, int col, int line_start, int line_end,
return None, None
else:
for i in range(lines):
word = COLITER_NEXT(it)
COLITER_NEXT(it, word)
data[i] = str_to_int64(word, INT64_MIN, INT64_MAX,
&error, parser.thousands)
if error != 0:
Expand All @@ -1578,7 +1578,7 @@ cdef _try_bool(parser_t *parser, int col, int line_start, int line_end,
int error, na_count = 0
size_t i, lines
coliter_t it
char *word
const char *word = NULL
uint8_t *data
ndarray result

Expand All @@ -1592,7 +1592,7 @@ cdef _try_bool(parser_t *parser, int col, int line_start, int line_end,

if na_filter:
for i in range(lines):
word = COLITER_NEXT(it)
COLITER_NEXT(it, word)

k = kh_get_str(na_hashset, word)
# in the hash table
Expand All @@ -1608,7 +1608,7 @@ cdef _try_bool(parser_t *parser, int col, int line_start, int line_end,
data += 1
else:
for i in range(lines):
word = COLITER_NEXT(it)
COLITER_NEXT(it, word)

error = to_boolean(word, data)
if error != 0:
Expand All @@ -1625,7 +1625,7 @@ cdef _try_bool_flex(parser_t *parser, int col, int line_start, int line_end,
int error, na_count = 0
size_t i, lines
coliter_t it
char *word
const char *word = NULL
uint8_t *data
ndarray result

Expand All @@ -1639,7 +1639,7 @@ cdef _try_bool_flex(parser_t *parser, int col, int line_start, int line_end,

if na_filter:
for i in range(lines):
word = COLITER_NEXT(it)
COLITER_NEXT(it, word)

k = kh_get_str(na_hashset, word)
# in the hash table
Expand Down Expand Up @@ -1667,7 +1667,7 @@ cdef _try_bool_flex(parser_t *parser, int col, int line_start, int line_end,
data += 1
else:
for i in range(lines):
word = COLITER_NEXT(it)
COLITER_NEXT(it, word)

k = kh_get_str(true_hashset, word)
if k != true_hashset.n_buckets:
Expand All @@ -1688,33 +1688,6 @@ cdef _try_bool_flex(parser_t *parser, int col, int line_start, int line_end,

return result.view(np.bool_), na_count

cdef _get_na_mask(parser_t *parser, int col, int line_start, int line_end,
kh_str_t *na_hashset):
cdef:
int error
Py_ssize_t i
size_t lines
coliter_t it
char *word
ndarray[uint8_t, cast=True] result
khiter_t k

lines = line_end - line_start
result = np.empty(lines, dtype=np.bool_)

coliter_setup(&it, parser, col, line_start)
for i in range(lines):
word = COLITER_NEXT(it)

k = kh_get_str(na_hashset, word)
# in the hash table
if k != na_hashset.n_buckets:
result[i] = 1
else:
result[i] = 0

return result

cdef kh_str_t* kset_from_list(list values) except NULL:
# caller takes responsibility for freeing the hash table
cdef:
Expand Down Expand Up @@ -1897,7 +1870,7 @@ cdef _apply_converter(object f, parser_t *parser, int col,
Py_ssize_t i
size_t lines
coliter_t it
char *word
const char *word = NULL
char *errors = "strict"
ndarray[object] result
object val
Expand All @@ -1909,17 +1882,17 @@ cdef _apply_converter(object f, parser_t *parser, int col,

if not PY3 and c_encoding == NULL:
for i in range(lines):
word = COLITER_NEXT(it)
COLITER_NEXT(it, word)
val = PyBytes_FromString(word)
result[i] = f(val)
elif ((PY3 and c_encoding == NULL) or c_encoding == b'utf-8'):
for i in range(lines):
word = COLITER_NEXT(it)
COLITER_NEXT(it, word)
val = PyUnicode_FromString(word)
result[i] = f(val)
else:
for i in range(lines):
word = COLITER_NEXT(it)
COLITER_NEXT(it, word)
val = PyUnicode_Decode(word, strlen(word),
c_encoding, errors)
result[i] = f(val)
Expand Down
Loading