Skip to content

BUG: Fix buffer overflows in tokenizer.c that caused python to segfault with certain #9360

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions doc/source/whatsnew/v0.16.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -134,8 +134,6 @@ Bug Fixes
- Bug in the returned ``Series.dt.components`` index was reset to the default index (:issue:`9247`)




- Fixed bug in ``to_sql`` when mapping a Timestamp object column (datetime
column with timezone info) to the according sqlalchemy type (:issue:`9085`).
- Fixed bug in ``to_sql`` ``dtype`` argument not accepting an instantiated
Expand Down Expand Up @@ -210,3 +208,6 @@ Bug Fixes
- Fixes issue with ``index_col=False`` when ``usecols`` is also specified in ``read_csv``. (:issue:`9082`)
- Bug where ``wide_to_long`` would modify the input stubnames list (:issue:`9204`)
- Bug in to_sql not storing float64 values using double precision. (:issue:`9009`)


- Bug in ``read_csv`` causing buffer overflows with certain malformed input files (:issue:`9205`)
26 changes: 26 additions & 0 deletions pandas/io/tests/test_parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -3258,6 +3258,19 @@ def test_fallback_to_python(self):
self.read_table(StringIO(data), engine='c', skip_footer=1)


def test_buffer_overflow(self):
# GH9205
# test certain malformed input files that cause buffer overflows in
# tokenizer.c
malfw = "1\r1\r1\r 1\r 1\r" # buffer overflow in words pointer
malfs = "1\r1\r1\r 1\r 1\r11\r" # buffer overflow in stream pointer
malfl = "1\r1\r1\r 1\r 1\r11\r1\r" # buffer overflow in lines pointer
for malf in (malfw, malfs, malfl):
try:
df = self.read_table(StringIO(malf))
except Exception as cperr:
self.assertIn('Buffer overflow caught - possible malformed input file.', str(cperr))

class TestCParserLowMemory(ParserTests, tm.TestCase):

def read_csv(self, *args, **kwds):
Expand Down Expand Up @@ -3666,6 +3679,19 @@ def test_raise_on_sep_with_delim_whitespace(self):
self.read_table(StringIO(data), sep='\s', delim_whitespace=True)


def test_buffer_overflow(self):
# GH9205
# test certain malformed input files that cause buffer overflows in
# tokenizer.c
malfw = "1\r1\r1\r 1\r 1\r" # buffer overflow in words pointer
malfs = "1\r1\r1\r 1\r 1\r11\r" # buffer overflow in stream pointer
malfl = "1\r1\r1\r 1\r 1\r11\r1\r" # buffer overflow in lines pointer
for malf in (malfw, malfs, malfl):
try:
df = self.read_table(StringIO(malf))
except Exception as cperr:
self.assertIn('Buffer overflow caught - possible malformed input file.', str(cperr))

class TestMiscellaneous(tm.TestCase):

# for tests that don't fit into any of the other classes, e.g. those that
Expand Down
20 changes: 11 additions & 9 deletions pandas/parser.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -137,7 +137,7 @@ cdef extern from "parser/tokenizer.h":
int quoting # style of quoting to write */

# hmm =/
int numeric_field
# int numeric_field

char commentchar
int allow_embedded_newline
Expand Down Expand Up @@ -198,7 +198,7 @@ cdef extern from "parser/tokenizer.h":

int64_t str_to_int64(char *p_item, int64_t int_min,
int64_t int_max, int *error, char tsep)
uint64_t str_to_uint64(char *p_item, uint64_t uint_max, int *error)
# uint64_t str_to_uint64(char *p_item, uint64_t uint_max, int *error)

double xstrtod(const char *p, char **q, char decimal, char sci,
char tsep, int skip_trailing)
Expand All @@ -207,12 +207,12 @@ cdef extern from "parser/tokenizer.h":
double round_trip(const char *p, char **q, char decimal, char sci,
char tsep, int skip_trailing)

inline int to_complex(char *item, double *p_real,
double *p_imag, char sci, char decimal)
# inline int to_complex(char *item, double *p_real,
# double *p_imag, char sci, char decimal)
inline int to_longlong(char *item, long long *p_value)
inline int to_longlong_thousands(char *item, long long *p_value,
char tsep)
inline int to_boolean(char *item, uint8_t *val)
# inline int to_longlong_thousands(char *item, long long *p_value,
# char tsep)
int to_boolean(char *item, uint8_t *val)


cdef extern from "parser/io.h":
Expand Down Expand Up @@ -1055,7 +1055,8 @@ cdef class TextReader:
bint user_dtype,
kh_str_t *na_hashset,
object na_flist):
cdef kh_str_t *true_set, *false_set
cdef kh_str_t *true_set
cdef kh_str_t *false_set

if dtype[1] == 'i' or dtype[1] == 'u':
result, na_count = _try_int64(self.parser, i, start, end,
Expand Down Expand Up @@ -1443,7 +1444,8 @@ cdef _to_fw_string(parser_t *parser, int col, int line_start,
int error
Py_ssize_t i, j
coliter_t it
char *word, *data
char *word
char *data
ndarray result

result = np.empty(line_end - line_start, dtype='|S%d' % width)
Expand Down
Loading