Skip to content

Commit 12aaad0

Browse files
committed
Fix memory growth bug in read_csv
The edge case where the requested allocation size lands exactly on a power of 2 on every resize can cause runaway memory growth. Closes gh-24805. xref gh-23527.
1 parent 7e6ad86 commit 12aaad0

File tree

2 files changed

+19
-1
lines changed

2 files changed

+19
-1
lines changed

pandas/_libs/src/parser/tokenizer.c

+1-1
Original file line numberDiff line numberDiff line change
@@ -300,7 +300,7 @@ static int make_stream_space(parser_t *self, size_t nbytes) {
300300
* just because a recent chunk did not have as many words.
301301
*/
302302
if (self->words_len + nbytes < self->max_words_cap) {
303-
length = self->max_words_cap - nbytes;
303+
length = self->max_words_cap - nbytes - 1;
304304
} else {
305305
length = self->words_len;
306306
}

pandas/tests/io/parser/test_common.py

+18
Original file line numberDiff line numberDiff line change
@@ -1916,6 +1916,24 @@ def test_filename_with_special_chars(all_parsers):
19161916
tm.assert_frame_equal(result, df)
19171917

19181918

1919+
def test_read_csv_memory_growth_chunksize(all_parsers):
1920+
# see gh-24805
1921+
#
1922+
# Let's just make sure that we don't crash
1923+
# as we iteratively process all chunks.
1924+
parser = all_parsers
1925+
1926+
with tm.ensure_clean() as path:
1927+
with open(path, "w") as f:
1928+
for i in range(1000):
1929+
f.write(str(i) + "\n")
1930+
1931+
result = parser.read_csv(path, chunksize=20)
1932+
1933+
for _ in result:
1934+
pass
1935+
1936+
19191937
def test_read_table_deprecated(all_parsers):
19201938
# see gh-21948
19211939
parser = all_parsers

0 commit comments

Comments
 (0)