
Commit 011b79f

gfyoung authored and jreback committed
BUG: Don't over-optimize memory with jagged CSV (pandas-dev#23527)
With jagged CSVs, we risk being too quick to dump memory that we need to allocate because previous chunks would have indicated much larger rows than we can anticipate in subsequent chunks. Closes gh-23509.
1 parent 0bc4580 commit 011b79f
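
For context, a minimal sketch of the user-level scenario (this mirrors the regression test added in this commit and assumes the default C engine; it is not part of the change itself):

from io import StringIO

import pandas as pd

# Seven 1-field rows followed by one 10-field row: a "jagged" CSV.
data = "\n".join(["0"] * 7 + [",".join(["0"] * 10)])

# Reading in chunks trims the parser's buffers between chunks; before this
# fix, the later, wider chunk could be under-allocated based on the earlier,
# narrower chunks.
reader = pd.read_csv(StringIO(data), names=range(10), chunksize=4)
result = pd.concat(reader)

print(result.shape)  # (8, 10); columns 1-9 hold NaN except in the final row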

File tree

5 files changed: +50 -2


doc/source/whatsnew/v0.24.0.txt (+1)

@@ -1299,6 +1299,7 @@ Notice how we now instead output ``np.nan`` itself instead of a stringified form
 - :func:`read_excel()` will correctly show the deprecation warning for previously deprecated ``sheetname`` (:issue:`17994`)
 - :func:`read_csv()` and func:`read_table()` will throw ``UnicodeError`` and not coredump on badly encoded strings (:issue:`22748`)
 - :func:`read_csv()` will correctly parse timezone-aware datetimes (:issue:`22256`)
+- Bug in :func:`read_csv()` in which memory management was prematurely optimized for the C engine when the data was being read in chunks (:issue:`23509`)
 - :func:`read_sas()` will parse numbers in sas7bdat-files that have width less than 8 bytes correctly. (:issue:`21616`)
 - :func:`read_sas()` will correctly parse sas7bdat files with many columns (:issue:`22628`)
 - :func:`read_sas()` will correctly parse sas7bdat files with data page types having also bit 7 set (so page type is 128 + 256 = 384) (:issue:`16615`)

pandas/_libs/parsers.pyx (+1)

@@ -132,6 +132,7 @@ cdef extern from "parser/tokenizer.h":
         int64_t *word_starts    # where we are in the stream
         int64_t words_len
         int64_t words_cap
+        int64_t max_words_cap   # maximum word cap encountered

         char *pword_start       # pointer to stream start of current field
         int64_t word_start      # position start of current field

pandas/_libs/src/parser/tokenizer.c (+31 -2)

@@ -197,6 +197,7 @@ int parser_init(parser_t *self) {
     sz = sz ? sz : 1;
     self->words = (char **)malloc(sz * sizeof(char *));
     self->word_starts = (int64_t *)malloc(sz * sizeof(int64_t));
+    self->max_words_cap = sz;
     self->words_cap = sz;
     self->words_len = 0;

@@ -247,7 +248,7 @@ void parser_del(parser_t *self) {
 }

 static int make_stream_space(parser_t *self, size_t nbytes) {
-    int64_t i, cap;
+    int64_t i, cap, length;
     int status;
     void *orig_ptr, *newptr;

@@ -287,8 +288,23 @@ static int make_stream_space(parser_t *self, size_t nbytes) {
     */

     cap = self->words_cap;
+
+    /**
+     * If we are reading in chunks, we need to be aware of the maximum number
+     * of words we have seen in previous chunks (self->max_words_cap), so
+     * that way, we can properly allocate when reading subsequent ones.
+     *
+     * Otherwise, we risk a buffer overflow if we mistakenly under-allocate
+     * just because a recent chunk did not have as many words.
+     */
+    if (self->words_len + nbytes < self->max_words_cap) {
+        length = self->max_words_cap - nbytes;
+    } else {
+        length = self->words_len;
+    }
+
     self->words =
-        (char **)grow_buffer((void *)self->words, self->words_len,
+        (char **)grow_buffer((void *)self->words, length,
                              (int64_t*)&self->words_cap, nbytes,
                              sizeof(char *), &status);
     TRACE(

@@ -1241,6 +1257,19 @@ int parser_trim_buffers(parser_t *self) {

     int64_t i;

+    /**
+     * Before we free up space and trim, we should
+     * save how many words we saw when parsing, if
+     * it exceeds the maximum number we saw before.
+     *
+     * This is important for when we read in chunks,
+     * so that we can inform subsequent chunk parsing
+     * as to how many words we could possibly see.
+     */
+    if (self->words_cap > self->max_words_cap) {
+        self->max_words_cap = self->words_cap;
+    }
+
     /* trim words, word_starts */
     new_cap = _next_pow2(self->words_len) + 1;
     if (new_cap < self->words_cap) {
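
To make the sizing decision above concrete, here is a rough Python model (not part of the commit; the names simply mirror the C fields) of the length now passed to grow_buffer() for self->words: parser_trim_buffers() records the largest word capacity seen so far in max_words_cap, and make_stream_space() then refuses to size the buffer below that high-water mark when a later chunk starts out narrow.

def words_grow_length(words_len, nbytes, max_words_cap):
    # Mirrors the new branch in make_stream_space(): if the largest capacity
    # seen in any previous chunk still exceeds what this chunk needs, size
    # against that maximum rather than the current (possibly small) word count.
    if words_len + nbytes < max_words_cap:
        return max_words_cap - nbytes
    return words_len

# Narrow chunk arriving after a wide chunk was seen: keep the larger footprint.
print(words_grow_length(words_len=4, nbytes=8, max_words_cap=64))   # 56
# No wider chunk on record: fall back to the current word count, as before.
print(words_grow_length(words_len=40, nbytes=8, max_words_cap=32))  # 40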

pandas/_libs/src/parser/tokenizer.h (+1)

@@ -142,6 +142,7 @@ typedef struct parser_t {
     int64_t *word_starts;   // where we are in the stream
     int64_t words_len;
     int64_t words_cap;
+    int64_t max_words_cap;  // maximum word cap encountered

     char *pword_start;      // pointer to stream start of current field
     int64_t word_start;     // position start of current field

pandas/tests/io/parser/common.py (+16)

@@ -458,6 +458,22 @@ def test_read_chunksize_generated_index(self):

         tm.assert_frame_equal(pd.concat(reader), df)

+    def test_read_chunksize_jagged_names(self):
+        # see gh-23509
+        data = "\n".join(["0"] * 7 + [",".join(["0"] * 10)])
+        reader = self.read_csv(StringIO(data), names=range(10), chunksize=4)
+
+        expected = DataFrame()
+
+        for i in range(10):
+            if i == 0:
+                expected[i] = [0] * 8
+            else:
+                expected[i] = [np.nan] * 7 + [0]
+
+        result = pd.concat(reader)
+        tm.assert_frame_equal(result, expected)
+
     def test_read_text_list(self):
         data = """A,B,C\nfoo,1,2,3\nbar,4,5,6"""
         as_list = [['A', 'B', 'C'], ['foo', '1', '2', '3'], ['bar',
