From e24179675f033f3dd2d1823317955d54ff8086c2 Mon Sep 17 00:00:00 2001
From: gfyoung
Date: Sat, 19 Jan 2019 11:37:29 +0000
Subject: [PATCH 1/2] Fix memory growth bug in read_csv

The edge case where we hit powers of 2
every time during allocation can be painful.

Closes gh-24805.

xref gh-23527.
---
 pandas/_libs/src/parser/tokenizer.c   |  2 +-
 pandas/tests/io/parser/test_common.py | 18 ++++++++++++++++++
 2 files changed, 19 insertions(+), 1 deletion(-)

diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c
index 3a4058f37efc7..a86af7c5416de 100644
--- a/pandas/_libs/src/parser/tokenizer.c
+++ b/pandas/_libs/src/parser/tokenizer.c
@@ -300,7 +300,7 @@ static int make_stream_space(parser_t *self, size_t nbytes) {
      * just because a recent chunk did not have as many words.
      */
     if (self->words_len + nbytes < self->max_words_cap) {
-        length = self->max_words_cap - nbytes;
+        length = self->max_words_cap - nbytes - 1;
     } else {
         length = self->words_len;
     }
diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py
index d87ef7cd15a64..05da171d7dc31 100644
--- a/pandas/tests/io/parser/test_common.py
+++ b/pandas/tests/io/parser/test_common.py
@@ -1916,6 +1916,24 @@ def test_filename_with_special_chars(all_parsers):
     tm.assert_frame_equal(result, df)
 
 
+def test_read_csv_memory_growth_chunksize(all_parsers):
+    # see gh-24805
+    #
+    # Let's just make sure that we don't crash
+    # as we iteratively process all chunks.
+    parser = all_parsers
+
+    with tm.ensure_clean() as path:
+        with open(path, "w") as f:
+            for i in range(1000):
+                f.write(str(i) + "\n")
+
+        result = parser.read_csv(path, chunksize=20)
+
+        for _ in result:
+            pass
+
+
 def test_read_table_deprecated(all_parsers):
     # see gh-21948
     parser = all_parsers

From 0c366a8453827c9e46482dd2b4dddd738981d70b Mon Sep 17 00:00:00 2001
From: gfyoung
Date: Sun, 20 Jan 2019 00:05:17 +0000
Subject: [PATCH 2/2] TST: Add ASV benchmark for issue

---
 asv_bench/benchmarks/io/csv.py | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/asv_bench/benchmarks/io/csv.py b/asv_bench/benchmarks/io/csv.py
index 771f2795334e1..d42a15d61fb0d 100644
--- a/asv_bench/benchmarks/io/csv.py
+++ b/asv_bench/benchmarks/io/csv.py
@@ -214,4 +214,23 @@ def time_baseline(self):
                  names=list(string.digits[:9]))
 
 
+class ReadCSVMemoryGrowth(BaseIO):
+
+    chunksize = 20
+    num_rows = 1000
+    fname = "__test__.csv"
+
+    def setup(self):
+        with open(self.fname, "w") as f:
+            for i in range(self.num_rows):
+                f.write("{i}\n".format(i=i))
+
+    def mem_parser_chunks(self):
+        # see gh-24805.
+        result = read_csv(self.fname, chunksize=self.chunksize)
+
+        for _ in result:
+            pass
+
+
 from ..pandas_vb_common import setup  # noqa: F401