From e24179675f033f3dd2d1823317955d54ff8086c2 Mon Sep 17 00:00:00 2001
From: gfyoung
Date: Sat, 19 Jan 2019 11:37:29 +0000
Subject: [PATCH 1/2] Fix memory growth bug in read_csv

The edge case where we hit powers of 2
every time during allocation can be painful.

Closes gh-24805.

xref gh-23527.
---
 pandas/_libs/src/parser/tokenizer.c   |  2 +-
 pandas/tests/io/parser/test_common.py | 18 ++++++++++++++++++
 2 files changed, 19 insertions(+), 1 deletion(-)

diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c
index 3a4058f37efc7..a86af7c5416de 100644
--- a/pandas/_libs/src/parser/tokenizer.c
+++ b/pandas/_libs/src/parser/tokenizer.c
@@ -300,7 +300,7 @@ static int make_stream_space(parser_t *self, size_t nbytes) {
      * just because a recent chunk did not have as many words.
      */
     if (self->words_len + nbytes < self->max_words_cap) {
-        length = self->max_words_cap - nbytes;
+        length = self->max_words_cap - nbytes - 1;
     } else {
         length = self->words_len;
     }
diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py
index d87ef7cd15a64..05da171d7dc31 100644
--- a/pandas/tests/io/parser/test_common.py
+++ b/pandas/tests/io/parser/test_common.py
@@ -1916,6 +1916,24 @@ def test_filename_with_special_chars(all_parsers):
     tm.assert_frame_equal(result, df)
 
 
+def test_read_csv_memory_growth_chunksize(all_parsers):
+    # see gh-24805
+    #
+    # Let's just make sure that we don't crash
+    # as we iteratively process all chunks.
+    parser = all_parsers
+
+    with tm.ensure_clean() as path:
+        with open(path, "w") as f:
+            for i in range(1000):
+                f.write(str(i) + "\n")
+
+        result = parser.read_csv(path, chunksize=20)
+
+        for _ in result:
+            pass
+
+
 def test_read_table_deprecated(all_parsers):
     # see gh-21948
     parser = all_parsers

From 0c366a8453827c9e46482dd2b4dddd738981d70b Mon Sep 17 00:00:00 2001
From: gfyoung
Date: Sun, 20 Jan 2019 00:05:17 +0000
Subject: [PATCH 2/2] TST: Add ASV benchmark for issue

---
 asv_bench/benchmarks/io/csv.py | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/asv_bench/benchmarks/io/csv.py b/asv_bench/benchmarks/io/csv.py
index 771f2795334e1..d42a15d61fb0d 100644
--- a/asv_bench/benchmarks/io/csv.py
+++ b/asv_bench/benchmarks/io/csv.py
@@ -214,4 +214,23 @@ def time_baseline(self):
                  names=list(string.digits[:9]))
 
 
+class ReadCSVMemoryGrowth(BaseIO):
+
+    chunksize = 20
+    num_rows = 1000
+    fname = "__test__.csv"
+
+    def setup(self):
+        with open(self.fname, "w") as f:
+            for i in range(self.num_rows):
+                f.write("{i}\n".format(i=i))
+
+    def mem_parser_chunks(self):
+        # see gh-24805.
+        result = read_csv(self.fname, chunksize=self.chunksize)
+
+        for _ in result:
+            pass
+
+
 from ..pandas_vb_common import setup  # noqa: F401