
Commit c69037c

gfyoung authored and jreback committed
BUG: Fixed grow_buffer to grow when capacity is reached
Addresses the issue in #12494 by allowing `grow_buffer` to grow the parser buffer when buffer capacity is reached. Previously the buffer grew only once capacity was exceeded, which was inconsistent with the `end_field` check later on when handling the EOF terminator, where reaching capacity was already treated as a buffer overflow.

Author: gfyoung <[email protected]>

Closes #12504 from gfyoung/read_csv_empty_header and squashes the following commits:

8ba3dd0 [gfyoung] BUG: Fixed grow_buffer to grow when capacity is reached
1 parent 9313089 · commit c69037c
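To make the off-by-one concrete, here is a minimal, self-contained C sketch of the doubling logic this commit fixes. It is not the pandas source: `demo_grow`, its parameter list, and the plain `realloc` are illustrative stand-ins for `grow_buffer`, its arguments, and `safe_realloc`.

#include <stdio.h>
#include <stdlib.h>

/* Doubling-buffer sketch modeled on grow_buffer: grow while the
 * requested space would fill the buffer to capacity (>=), not only
 * when it would exceed it (>). The spare byte kept by ">=" is what
 * the later write of the '\0' field terminator relies on. */
static char *demo_grow(char *buffer, size_t length, size_t *capacity,
                       size_t space) {
    size_t cap = *capacity;
    char *newbuffer = buffer;

    while ((length + space >= cap) && (newbuffer != NULL)) {
        cap = cap ? cap << 1 : 2;
        buffer = newbuffer;
        newbuffer = realloc(newbuffer, cap);
    }

    if (newbuffer == NULL)        /* realloc failed: keep the old block */
        return buffer;

    *capacity = cap;
    return newbuffer;
}

int main(void) {
    size_t cap = 8;
    char *buf = malloc(cap);
    if (buf == NULL)
        return 1;

    /* Request exactly 8 bytes of data for an 8-byte buffer. With the
     * old ">" comparison no growth would occur, and the terminator
     * write at buf[8] below would overflow by one byte. */
    buf = demo_grow(buf, 0, &cap, 8);
    buf[8] = '\0';                /* in bounds only because cap is now 16 */

    printf("capacity after grow: %zu\n", cap);
    free(buf);
    return 0;
}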

3 files changed: 22 additions, 1 deletion

doc/source/whatsnew/v0.18.0.txt (+1)

@@ -1199,3 +1199,4 @@ Bug Fixes
 - Bug in ``DataFrame.apply`` in which reduction was not being prevented for cases in which ``dtype`` was not a numpy dtype (:issue:`12244`)
 - Bug when initializing categorical series with a scalar value. (:issue:`12336`)
 - Bug when specifying a UTC ``DatetimeIndex`` by setting ``utc=True`` in ``.to_datetime`` (:issue:`11934`)
+- Bug when increasing the buffer size of CSV reader in ``read_csv`` (:issue:`12494`)

pandas/io/tests/test_parsers.py (+20)

@@ -2635,6 +2635,26 @@ def test_eof_states(self):
         self.assertRaises(Exception, self.read_csv,
                           StringIO(data), escapechar='\\')
 
+    def test_grow_boundary_at_cap(self):
+        # See gh-12494
+        #
+        # Cause of error was the fact that pandas
+        # was not increasing the buffer size when
+        # the desired space would fill the buffer
+        # to capacity, which later would cause a
+        # buffer overflow error when checking the
+        # EOF terminator of the CSV stream
+        def test_empty_header_read(count):
+            s = StringIO(',' * count)
+            expected = DataFrame(columns=[
+                'Unnamed: {i}'.format(i=i)
+                for i in range(count + 1)])
+            df = read_csv(s)
+            tm.assert_frame_equal(df, expected)
+
+        for count in range(1, 101):
+            test_empty_header_read(count)
+
 
 class TestPythonParser(ParserTests, tm.TestCase):
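A note on the test's design: because `grow_buffer` doubles capacity (`cap << 1`), sweeping the header from 1 to 100 commas walks the stream length across several power-of-two boundaries, so for a small initial buffer the exactly-at-capacity case from gh-12494 is guaranteed to be exercised.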

pandas/src/parser/tokenizer.c (+1, −1)

@@ -111,7 +111,7 @@ static void *grow_buffer(void *buffer, int length, int *capacity,
     void *newbuffer = buffer;
 
     // Can we fit potentially nbytes tokens (+ null terminators) in the stream?
-    while ( (length + space > cap) && (newbuffer != NULL) ){
+    while ( (length + space >= cap) && (newbuffer != NULL) ){
         cap = cap? cap << 1 : 2;
         buffer = newbuffer;
         newbuffer = safe_realloc(newbuffer, elsize * cap);
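The fix itself is a single character. Because the loop doubles capacity (`cap = cap ? cap << 1 : 2`), changing `>` to `>=` only triggers the doubling one byte earlier: an append that would fill the stream exactly to capacity still leaves room for the NUL terminator, keeping this check consistent with the `end_field` overflow check described in the commit message. The `newbuffer != NULL` guard in the loop condition stops the doubling if `safe_realloc` fails.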
