Skip to content

Commit 31f8e4d

Browse files
ivannz authored and jreback committed
BUG: parser_trim_buffers properly initializes word pointers in read_csv
closes pandas-dev#13703 Author: Ivan Nazarov <[email protected]> Closes pandas-dev#13788 from ivannz/parser_trim_fix and squashes the following commits: d59624e [Ivan Nazarov] Moved the test to 'c_parser_only' 9b521f6 [Ivan Nazarov] Improved the clarity and logic of the test 629198d [Ivan Nazarov] Referenced issue in the test, rewrote the bugfix description 834c851 [Ivan Nazarov] Improved readability of bugfix description; minor style fixes of the test e0b4c83 [Ivan Nazarov] flake8 style test correction 020d706 [Ivan Nazarov] Updated WHATSNEW with the bug fix information bdba66f [Ivan Nazarov] Rewritten the 'parser_trim_buffers' test 5ab3636 [Ivan Nazarov] Expanded the explanation of the patch a831dbb [Ivan Nazarov] Moved 'parser_trim_buffers' test to its proper place 07b4647 [Ivan Nazarov] praser_trim_fix: More stressful test 2120719 [Ivan Nazarov] A memory 'stress' test of parser.pyx to cause corruption or segfault 434f1e0 [Ivan Nazarov] FIX: 'parser_trim_buffers' properly initializes word pointers
1 parent 63285a4 commit 31f8e4d

File tree

3 files changed

+101
-14
lines changed

3 files changed

+101
-14
lines changed

doc/source/whatsnew/v0.19.0.txt

+1
Original file line numberDiff line numberDiff line change
@@ -675,6 +675,7 @@ Performance Improvements
675675
Bug Fixes
676676
~~~~~~~~~
677677

678+
- Bug in ``pd.read_csv()``, which may cause a segfault or corruption when iterating in large chunks over a stream/file under rare circumstances (:issue:`13703`)
678679
- Bug in ``io.json.json_normalize()``, where non-ascii keys raised an exception (:issue:`13213`)
679680
- Bug in ``SparseSeries`` with ``MultiIndex`` ``[]`` indexing may raise ``IndexError`` (:issue:`13144`)
680681
- Bug in ``SparseSeries`` with ``MultiIndex`` ``[]`` indexing result may have normal ``Index`` (:issue:`13144`)

pandas/io/tests/parser/c_parser_only.py

+70
Original file line numberDiff line numberDiff line change
@@ -381,3 +381,73 @@ def test_empty_header_read(count):
381381

382382
for count in range(1, 101):
383383
test_empty_header_read(count)
384+
385+
def test_parse_trim_buffers(self):
    # This test is part of a bugfix for issue #13703. It attempts
    # to stress the system memory allocator, to cause it to move the
    # stream buffer and either let the OS reclaim the region, or let
    # other memory requests of parser otherwise modify the contents
    # of memory space, where it was formerly located.
    # This test is designed to cause a `segfault` with unpatched
    # `tokenizer.c`. Sometimes the test fails on `segfault`, other
    # times it fails due to memory corruption, which causes the
    # loaded DataFrame to differ from the expected one.

    # Generate a large mixed-type CSV file on-the-fly (one record is
    # approx 1.5KiB).
    record_ = \
        """9999-9,99:99,,,,ZZ,ZZ,,,ZZZ-ZZZZ,.Z-ZZZZ,-9.99,,,9.99,Z""" \
        """ZZZZ,,-99,9,ZZZ-ZZZZ,ZZ-ZZZZ,,9.99,ZZZ-ZZZZZ,ZZZ-ZZZZZ,""" \
        """ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,9""" \
        """99,ZZZ-ZZZZ,,ZZ-ZZZZ,,,,,ZZZZ,ZZZ-ZZZZZ,ZZZ-ZZZZ,,,9,9,""" \
        """9,9,99,99,999,999,ZZZZZ,ZZZ-ZZZZZ,ZZZ-ZZZZ,9,ZZ-ZZZZ,9.""" \
        """99,ZZ-ZZZZ,ZZ-ZZZZ,,,,ZZZZ,,,ZZ,ZZ,,,,,,,,,,,,,9,,,999.""" \
        """99,999.99,,,ZZZZZ,,,Z9,,,,,,,ZZZ,ZZZ,,,,,,,,,,,ZZZZZ,ZZ""" \
        """ZZZ,ZZZ-ZZZZZZ,ZZZ-ZZZZZZ,ZZ-ZZZZ,ZZ-ZZZZ,ZZ-ZZZZ,ZZ-ZZ""" \
        """ZZ,,,999999,999999,ZZZ,ZZZ,,,ZZZ,ZZZ,999.99,999.99,,,,Z""" \
        """ZZ-ZZZ,ZZZ-ZZZ,-9.99,-9.99,9,9,,99,,9.99,9.99,9,9,9.99,""" \
        """9.99,,,,9.99,9.99,,99,,99,9.99,9.99,,,ZZZ,ZZZ,,999.99,,""" \
        """999.99,ZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,,,ZZZZZ,ZZZZZ,ZZZ,ZZZ,9,9,""" \
        """,,,,,ZZZ-ZZZZ,ZZZ999Z,,,999.99,,999.99,ZZZ-ZZZZ,,,9.999""" \
        """,9.999,9.999,9.999,-9.999,-9.999,-9.999,-9.999,9.999,9.""" \
        """999,9.999,9.999,9.999,9.999,9.999,9.999,99999,ZZZ-ZZZZ,""" \
        """,9.99,ZZZ,,,,,,,,ZZZ,,,,,9,,,,9,,,,,,,,,,ZZZ-ZZZZ,ZZZ-Z""" \
        """ZZZ,,ZZZZZ,ZZZZZ,ZZZZZ,ZZZZZ,,,9.99,,ZZ-ZZZZ,ZZ-ZZZZ,ZZ""" \
        """,999,,,,ZZ-ZZZZ,ZZZ,ZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,,,99.99,99.99""" \
        """,,,9.99,9.99,9.99,9.99,ZZZ-ZZZZ,,,ZZZ-ZZZZZ,,,,,-9.99,-""" \
        """9.99,-9.99,-9.99,,,,,,,,,ZZZ-ZZZZ,,9,9.99,9.99,99ZZ,,-9""" \
        """.99,-9.99,ZZZ-ZZZZ,,,,,,,ZZZ-ZZZZ,9.99,9.99,9999,,,,,,,""" \
        """,,,-9.9,Z/Z-ZZZZ,999.99,9.99,,999.99,ZZ-ZZZZ,ZZ-ZZZZ,9.""" \
        """99,9.99,9.99,9.99,9.99,9.99,,ZZZ-ZZZZZ,ZZZ-ZZZZZ,ZZZ-ZZ""" \
        """ZZZ,ZZZ-ZZZZZ,ZZZ-ZZZZZ,ZZZ,ZZZ,ZZZ,ZZZ,9.99,,,-9.99,ZZ""" \
        """-ZZZZ,-999.99,,-9999,,999.99,,,,999.99,99.99,,,ZZ-ZZZZZ""" \
        """ZZZ,ZZ-ZZZZ-ZZZZZZZ,,,,ZZ-ZZ-ZZZZZZZZ,ZZZZZZZZ,ZZZ-ZZZZ""" \
        """,9999,999.99,ZZZ-ZZZZ,-9.99,-9.99,ZZZ-ZZZZ,99:99:99,,99""" \
        """,99,,9.99,,-99.99,,,,,,9.99,ZZZ-ZZZZ,-9.99,-9.99,9.99,9""" \
        """.99,,ZZZ,,,,,,,ZZZ,ZZZ,,,,,"""

    # Set the number of lines so that a call to `parser_trim_buffers`
    # is triggered: after a couple of full chunks are consumed a
    # relatively small 'residual' chunk would cause reallocation
    # within the parser.
    chunksize, n_lines = 128, 2 * 128 + 15
    csv_data = "\n".join([record_] * n_lines) + "\n"

    # We will use StringIO to load the CSV from this text buffer.
    # pd.read_csv() will iterate over the file in chunks and will
    # finally read a residual chunk of really small size.

    # Generate the expected output: manually create the dataframe
    # by splitting by comma and repeating the `n_lines` times.
    row = tuple(val_ if val_ else float("nan")
                for val_ in record_.split(","))
    expected = pd.DataFrame([row for _ in range(n_lines)],
                            dtype=object, columns=None, index=None)

    # Iterate over the CSV file in chunks of `chunksize` lines
    chunks_ = self.read_csv(StringIO(csv_data), header=None,
                            dtype=object, chunksize=chunksize)
    result = pd.concat(chunks_, axis=0, ignore_index=True)

    # Check for data corruption if there was no segfault
    tm.assert_frame_equal(result, expected)

pandas/src/parser/tokenizer.c

+30-14
Original file line numberDiff line numberDiff line change
@@ -1221,20 +1221,7 @@ int parser_trim_buffers(parser_t *self) {
12211221
size_t new_cap;
12221222
void *newptr;
12231223

1224-
/* trim stream */
1225-
new_cap = _next_pow2(self->stream_len) + 1;
1226-
TRACE(("parser_trim_buffers: new_cap = %zu, stream_cap = %zu, lines_cap = %zu\n",
1227-
new_cap, self->stream_cap, self->lines_cap));
1228-
if (new_cap < self->stream_cap) {
1229-
TRACE(("parser_trim_buffers: new_cap < self->stream_cap, calling safe_realloc\n"));
1230-
newptr = safe_realloc((void*) self->stream, new_cap);
1231-
if (newptr == NULL) {
1232-
return PARSER_OUT_OF_MEMORY;
1233-
} else {
1234-
self->stream = newptr;
1235-
self->stream_cap = new_cap;
1236-
}
1237-
}
1224+
int i;
12381225

12391226
/* trim words, word_starts */
12401227
new_cap = _next_pow2(self->words_len) + 1;
@@ -1255,6 +1242,35 @@ int parser_trim_buffers(parser_t *self) {
12551242
}
12561243
}
12571244

1245+
/* trim stream */
1246+
new_cap = _next_pow2(self->stream_len) + 1;
1247+
TRACE(("parser_trim_buffers: new_cap = %zu, stream_cap = %zu, lines_cap = %zu\n",
1248+
new_cap, self->stream_cap, self->lines_cap));
1249+
if (new_cap < self->stream_cap) {
1250+
TRACE(("parser_trim_buffers: new_cap < self->stream_cap, calling safe_realloc\n"));
1251+
newptr = safe_realloc((void*) self->stream, new_cap);
1252+
if (newptr == NULL) {
1253+
return PARSER_OUT_OF_MEMORY;
1254+
} else {
1255+
// Update the pointers in the self->words array (char **) if `safe_realloc`
1256+
// moved the `self->stream` buffer. This block mirrors a similar block in
1257+
// `make_stream_space`.
1258+
if (self->stream != newptr) {
1259+
/* TRACE(("Moving word pointers\n")) */
1260+
self->pword_start = newptr + self->word_start;
1261+
1262+
for (i = 0; i < self->words_len; ++i)
1263+
{
1264+
self->words[i] = newptr + self->word_starts[i];
1265+
}
1266+
}
1267+
1268+
self->stream = newptr;
1269+
self->stream_cap = new_cap;
1270+
1271+
}
1272+
}
1273+
12581274
/* trim line_start, line_fields */
12591275
new_cap = _next_pow2(self->lines) + 1;
12601276
if (new_cap < self->lines_cap) {

0 commit comments

Comments
 (0)