Commit 03001be

gfyoung authored and jreback committed
Fix memory growth bug in read_csv (#24837)

* Fix memory growth bug in read_csv

  The edge case where each chunk's allocation lands exactly on a power-of-2 capacity boundary can be painful: the parser doubles its buffer on every chunk, so memory grows without bound while reading. Closes gh-24805. xref gh-23527.

* TST: Add ASV benchmark for issue

1 parent f4458c1 · commit 03001be

File tree: 3 files changed (+38, -1)

asv_bench/benchmarks/io/csv.py (+19)

@@ -214,4 +214,23 @@ def time_baseline(self):
                  names=list(string.digits[:9]))
 
 
+class ReadCSVMemoryGrowth(BaseIO):
+
+    chunksize = 20
+    num_rows = 1000
+    fname = "__test__.csv"
+
+    def setup(self):
+        with open(self.fname, "w") as f:
+            for i in range(self.num_rows):
+                f.write("{i}\n".format(i=i))
+
+    def mem_parser_chunks(self):
+        # see gh-24805.
+        result = read_csv(self.fname, chunksize=self.chunksize)
+
+        for _ in result:
+            pass
+
+
 from ..pandas_vb_common import setup  # noqa: F401
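A usage note: asv discovers benchmarks by method-name prefix, and the mem_ prefix marks this one as a memory benchmark. Assuming a standard pandas asv setup, it should be runnable on its own with something like asv run -b ReadCSVMemoryGrowth, where -b filters benchmarks by a regex on their names.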

pandas/_libs/src/parser/tokenizer.c (+1, -1)

@@ -300,7 +300,7 @@ static int make_stream_space(parser_t *self, size_t nbytes) {
      * just because a recent chunk did not have as many words.
      */
     if (self->words_len + nbytes < self->max_words_cap) {
-        length = self->max_words_cap - nbytes;
+        length = self->max_words_cap - nbytes - 1;
     } else {
        length = self->words_len;
    }
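Why the extra "- 1" matters: grow_buffer doubles the words buffer whenever the requested length plus the incoming space reaches the current capacity. With length = self->max_words_cap - nbytes, the sum length + nbytes landed exactly on max_words_cap after every chunk, so the capacity doubled again and again even though no extra space was needed. The sketch below is a simplified Python model of that interaction, not the parser's actual code; grow_buffer, simulate, the headroom parameter, and the starting capacity of 256 are illustrative stand-ins:

def grow_buffer(cap, length, space):
    # Simplified model of the C helper: keep doubling the capacity
    # while the requested length plus the incoming space reaches it.
    while length + space >= cap:
        cap *= 2
    return cap


def simulate(n_chunks, nbytes, headroom):
    # words_cap is the current buffer capacity; max_words_cap tracks
    # the largest capacity seen so far, as in make_stream_space.
    words_cap = max_words_cap = 256
    for _ in range(n_chunks):
        # headroom=0 models the old code: length + nbytes lands exactly
        # on max_words_cap, so the buffer doubles on every chunk.
        # headroom=1 models the fix: the sum stays one below capacity.
        length = max_words_cap - nbytes - headroom
        words_cap = grow_buffer(words_cap, length, nbytes)
        max_words_cap = max(max_words_cap, words_cap)
    return words_cap


print(simulate(20, nbytes=16, headroom=0))  # 268435456: doubled 20 times
print(simulate(20, nbytes=16, headroom=1))  # 256: capacity is stable

With the one-element headroom, the doubling condition is never triggered spuriously, and the buffer only grows when a chunk genuinely needs more words.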

pandas/tests/io/parser/test_common.py (+18)

@@ -1916,6 +1916,24 @@ def test_filename_with_special_chars(all_parsers):
     tm.assert_frame_equal(result, df)
 
 
+def test_read_csv_memory_growth_chunksize(all_parsers):
+    # see gh-24805
+    #
+    # Let's just make sure that we don't crash
+    # as we iteratively process all chunks.
+    parser = all_parsers
+
+    with tm.ensure_clean() as path:
+        with open(path, "w") as f:
+            for i in range(1000):
+                f.write(str(i) + "\n")
+
+        result = parser.read_csv(path, chunksize=20)
+
+        for _ in result:
+            pass
+
+
 def test_read_table_deprecated(all_parsers):
     # see gh-21948
     parser = all_parsers
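Two fixtures carry the test: all_parsers runs it against each parser engine (both the C and Python implementations), and tm.ensure_clean yields a temporary file path that is deleted when the block exits, so the thousand-line CSV does not outlive the test.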
