Skip to content

Commit 12aaad0

Browse files
committed
Fix memory growth bug in read_csv
The edge case where the requested allocation size lands exactly on a power of 2 on every resize can cause runaway memory growth. Closes gh-24805. xref gh-23527.
1 parent 7e6ad86 commit 12aaad0

File tree

2 files changed

+19
-1
lines changed

2 files changed

+19
-1
lines changed

pandas/_libs/src/parser/tokenizer.c

+1-1
Original file line numberDiff line numberDiff line change
@@ -300,7 +300,7 @@ static int make_stream_space(parser_t *self, size_t nbytes) {
300300
* just because a recent chunk did not have as many words.
301301
*/
302302
if (self->words_len + nbytes < self->max_words_cap) {
303-
length = self->max_words_cap - nbytes;
303+
length = self->max_words_cap - nbytes - 1;
304304
} else {
305305
length = self->words_len;
306306
}

pandas/tests/io/parser/test_common.py

+18
Original file line numberDiff line numberDiff line change
@@ -1916,6 +1916,24 @@ def test_filename_with_special_chars(all_parsers):
19161916
tm.assert_frame_equal(result, df)
19171917

19181918

1919+
def test_read_csv_memory_growth_chunksize(all_parsers):
1920+
# see gh-24805
1921+
#
1922+
# Let's just make sure that we don't crash
1923+
# as we iteratively process all chunks.
1924+
parser = all_parsers
1925+
1926+
with tm.ensure_clean() as path:
1927+
with open(path, "w") as f:
1928+
for i in range(1000):
1929+
f.write(str(i) + "\n")
1930+
1931+
result = parser.read_csv(path, chunksize=20)
1932+
1933+
for _ in result:
1934+
pass
1935+
1936+
19191937
def test_read_table_deprecated(all_parsers):
19201938
# see gh-21948
19211939
parser = all_parsers

0 commit comments

Comments
 (0)