Commit 95cd98b (parent 148abba)

BUG: Fix segfault in csv tokenizer (#32566)

3 files changed (+22 -2 lines)

doc/source/whatsnew/v1.1.0.rst (+1)

@@ -340,6 +340,7 @@ I/O
 - Bug in :class:`HDFStore` that caused it to set to ``int64`` the dtype of a ``datetime64`` column when reading a DataFrame in Python 3 from fixed format written in Python 2 (:issue:`31750`)
 - Bug in :meth:`read_excel` where a UTF-8 string with a high surrogate would cause a segmentation violation (:issue:`23809`)
 - Bug in :meth:`read_csv` was causing a file descriptor leak on an empty file (:issue:`31488`)
+- Bug in :meth:`read_csv` was causing a segfault when there were blank lines between the header and data rows (:issue:`28071`)


 Plotting

pandas/_libs/src/parser/tokenizer.c (+8 -2)

@@ -1189,8 +1189,14 @@ int parser_consume_rows(parser_t *self, size_t nrows) {

     /* cannot guarantee that nrows + 1 has been observed */
     word_deletions = self->line_start[nrows - 1] + self->line_fields[nrows - 1];
-    char_count = (self->word_starts[word_deletions - 1] +
-                  strlen(self->words[word_deletions - 1]) + 1);
+    if (word_deletions >= 1) {
+        char_count = (self->word_starts[word_deletions - 1] +
+                      strlen(self->words[word_deletions - 1]) + 1);
+    } else {
+        /* if word_deletions == 0 (i.e. this case) then char_count must
+         * be 0 too, as no data needs to be skipped */
+        char_count = 0;
+    }

     TRACE(("parser_consume_rows: Deleting %d words, %d chars\n", word_deletions,
            char_count));
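
The guard matters because parser_consume_rows can be reached (for example via nrows or chunked reads) after tokenizing rows that contain no parsed fields at all, such as blank lines kept by skip_blank_lines=False between the header and the first data row. In that case word_deletions can be 0, and the old code indexed word_starts[word_deletions - 1] and words[word_deletions - 1], reading outside the buffers and potentially segfaulting; with nothing to delete, char_count is simply 0. Below is a minimal user-facing sketch of the trigger, based on the regression test added in this commit (behaviour before the fix depends on the build, but it could crash the interpreter):

from io import StringIO

import pandas as pd

# CSV with blank lines before, between, and after the header row.
csv = "\nheader\n\na,b\n\n\n1,2\n\n3,4"

# header=3 selects the "a,b" line; skip_blank_lines=False keeps the empty
# lines that follow it, and nrows=2 asks the C tokenizer to consume only
# those blank rows, so few or no fields have been parsed yet.
df = pd.read_csv(StringIO(csv), header=3, nrows=2, skip_blank_lines=False)
print(df)  # two all-NaN rows with columns ["a", "b"] after the fix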

pandas/tests/io/parser/test_common.py (+13)

@@ -2093,3 +2093,16 @@ def test():
                 parser.read_csv(path)

         td.check_file_leaks(test)()
+
+
+@pytest.mark.parametrize("nrows", range(1, 6))
+def test_blank_lines_between_header_and_data_rows(all_parsers, nrows):
+    # GH 28071
+    ref = DataFrame(
+        [[np.nan, np.nan], [np.nan, np.nan], [1, 2], [np.nan, np.nan], [3, 4]],
+        columns=list("ab"),
+    )
+    csv = "\nheader\n\na,b\n\n\n1,2\n\n3,4"
+    parser = all_parsers
+    df = parser.read_csv(StringIO(csv), header=3, nrows=nrows, skip_blank_lines=False)
+    tm.assert_frame_equal(df, ref[:nrows])
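
Parametrizing nrows from 1 through 5 makes the consumed rows range from only the blank lines directly after the header (the case that crashed before the fix) up to the full frame; with skip_blank_lines=False those blank lines come back as all-NaN rows, which is what ref encodes and ref[:nrows] slices to match.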
