Commit 95cd98b (parent 148abba)

BUG: Fix segfault in csv tokenizer (#32566)

3 files changed (+22 -2 lines)

doc/source/whatsnew/v1.1.0.rst (+1)

@@ -340,6 +340,7 @@ I/O
 - Bug in :class:`HDFStore` that caused it to set to ``int64`` the dtype of a ``datetime64`` column when reading a DataFrame in Python 3 from fixed format written in Python 2 (:issue:`31750`)
 - Bug in :meth:`read_excel` where a UTF-8 string with a high surrogate would cause a segmentation violation (:issue:`23809`)
 - Bug in :meth:`read_csv` was causing a file descriptor leak on an empty file (:issue:`31488`)
+- Bug in :meth:`read_csv` was causing a segfault when there were blank lines between the header and data rows (:issue:`28071`)


 Plotting

pandas/_libs/src/parser/tokenizer.c (+8 -2)

@@ -1189,8 +1189,14 @@ int parser_consume_rows(parser_t *self, size_t nrows) {

     /* cannot guarantee that nrows + 1 has been observed */
     word_deletions = self->line_start[nrows - 1] + self->line_fields[nrows - 1];
-    char_count = (self->word_starts[word_deletions - 1] +
-                  strlen(self->words[word_deletions - 1]) + 1);
+    if (word_deletions >= 1) {
+        char_count = (self->word_starts[word_deletions - 1] +
+                      strlen(self->words[word_deletions - 1]) + 1);
+    } else {
+        /* if word_deletions == 0 (i.e. this case) then char_count must
+         * be 0 too, as no data needs to be skipped */
+        char_count = 0;
+    }

     TRACE(("parser_consume_rows: Deleting %d words, %d chars\n", word_deletions,
            char_count));
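
The guard matters because parser_consume_rows can be reached (for example via nrows or chunked reads) after tokenizing rows that contain no parsed fields at all, such as blank lines kept by skip_blank_lines=False between the header and the first data row. In that case word_deletions can be 0, and the old code indexed word_starts[word_deletions - 1] and words[word_deletions - 1], reading outside the buffers and potentially segfaulting; with nothing to delete, char_count is simply 0. Below is a minimal user-facing sketch of the trigger, based on the regression test added in this commit (behaviour before the fix depends on the build, but it could crash the interpreter):

from io import StringIO

import pandas as pd

# CSV with blank lines before, between, and after the header row.
csv = "\nheader\n\na,b\n\n\n1,2\n\n3,4"

# header=3 selects the "a,b" line; skip_blank_lines=False keeps the empty
# lines that follow it, and nrows=2 asks the C tokenizer to consume only
# those blank rows, so few or no fields have been parsed yet.
df = pd.read_csv(StringIO(csv), header=3, nrows=2, skip_blank_lines=False)
print(df)  # two all-NaN rows with columns ["a", "b"] after the fix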

pandas/tests/io/parser/test_common.py (+13)

@@ -2093,3 +2093,16 @@ def test():
                 parser.read_csv(path)

         td.check_file_leaks(test)()
+
+
+@pytest.mark.parametrize("nrows", range(1, 6))
+def test_blank_lines_between_header_and_data_rows(all_parsers, nrows):
+    # GH 28071
+    ref = DataFrame(
+        [[np.nan, np.nan], [np.nan, np.nan], [1, 2], [np.nan, np.nan], [3, 4]],
+        columns=list("ab"),
+    )
+    csv = "\nheader\n\na,b\n\n\n1,2\n\n3,4"
+    parser = all_parsers
+    df = parser.read_csv(StringIO(csv), header=3, nrows=nrows, skip_blank_lines=False)
+    tm.assert_frame_equal(df, ref[:nrows])
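
Parametrizing nrows from 1 through 5 makes the consumed rows range from only the blank lines directly after the header (the case that crashed before the fix) up to the full frame; with skip_blank_lines=False those blank lines come back as all-NaN rows, which is what ref encodes and ref[:nrows] slices to match.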
