Skip to content

Commit 1ca88b6

Browse files
Fix segfault in csv tokenizer (issue pandas-dev#28071)
1 parent: 787dc8a · commit: 1ca88b6

File tree

2 files changed

+17
-2
lines changed

2 files changed

+17
-2
lines changed

pandas/_libs/src/parser/tokenizer.c

+7 -2
Original file line number | Diff line number | Diff line change
@@ -1189,8 +1189,13 @@ int parser_consume_rows(parser_t *self, size_t nrows) {
1189 1189

1190 1190
/* cannot guarantee that nrows + 1 has been observed */
1191 1191
word_deletions = self->line_start[nrows - 1] + self->line_fields[nrows - 1];
1192-
char_count = (self->word_starts[word_deletions - 1] +
1193-
strlen(self->words[word_deletions - 1]) + 1);
1192+
if (word_deletions >= 1) {
1193+
char_count = (self->word_starts[word_deletions - 1] +
1194+
strlen(self->words[word_deletions - 1]) + 1);
1195+
}
1196+
else {
1197+
char_count = 1;
1198+
}
1194 1199

1195 1200
TRACE(("parser_consume_rows: Deleting %d words, %d chars\n", word_deletions,
1196 1201
char_count));

pandas/tests/io/parser/test_textreader.py

+10
Original file line number | Diff line number | Diff line change
@@ -341,6 +341,16 @@ def test_empty_csv_input(self):
341 341
df = read_csv(StringIO(), chunksize=20, header=None, names=["a", "b", "c"])
342 342
assert isinstance(df, TextFileReader)
343 343

344+
def test_blank_lines_between_header_and_data_rows(self):
345+
ref = DataFrame(
346+
[[np.nan, np.nan], [np.nan, np.nan], [1, 2], [np.nan, np.nan], [3, 4]],
347+
columns=list("ab"),
348+
)
349+
csv = "\nheader\n\na,b\n\n\n1,2\n\n3,4"
350+
for nrows in range(1, 6):
351+
df = read_csv(StringIO(csv), header=3, nrows=nrows, skip_blank_lines=False)
352+
tm.assert_frame_equal(df, ref[:nrows])
353+
344 354

345 355
def assert_array_dicts_equal(left, right):
346 356
for k, v in left.items():

0 commit comments

Comments
 (0)