Fix segfault in csv tokenizer (issue pandas-dev#28071)

roberthdevries · roberthdevries · commit 1ca88b6c2db5 · 2020-03-09T23:02:23.000+01:00
diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c
@@ -1189,8 +1189,13 @@ int parser_consume_rows(parser_t *self, size_t nrows) {
 
     /* cannot guarantee that nrows + 1 has been observed */
     word_deletions = self->line_start[nrows - 1] + self->line_fields[nrows - 1];
-    char_count = (self->word_starts[word_deletions - 1] +
-                  strlen(self->words[word_deletions - 1]) + 1);
+    if (word_deletions >= 1) {
+	char_count = (self->word_starts[word_deletions - 1] +
+		      strlen(self->words[word_deletions - 1]) + 1);
+    }
+    else {
+	char_count = 1;
+    }
 
     TRACE(("parser_consume_rows: Deleting %d words, %d chars\n", word_deletions,
            char_count));
diff --git a/pandas/tests/io/parser/test_textreader.py b/pandas/tests/io/parser/test_textreader.py
@@ -341,6 +341,16 @@ def test_empty_csv_input(self):
         df = read_csv(StringIO(), chunksize=20, header=None, names=["a", "b", "c"])
         assert isinstance(df, TextFileReader)
 
+    def test_blank_lines_between_header_and_data_rows(self):  # see GH 28071
+        ref = DataFrame(
+            [[np.nan, np.nan], [np.nan, np.nan], [1, 2], [np.nan, np.nan], [3, 4]],
+            columns=list("ab"),
+        )
+        csv = "\nheader\n\na,b\n\n\n1,2\n\n3,4"  # header=3 is the "a,b" line
+        for nrows in range(1, 6):  # every prefix of the 5 expected rows
+            df = read_csv(StringIO(csv), header=3, nrows=nrows, skip_blank_lines=False)
+            tm.assert_frame_equal(df, ref[:nrows])
+
 
 def assert_array_dicts_equal(left, right):
     for k, v in left.items():