From 6da5e2a883b11045fd0249c5434a127298774dff Mon Sep 17 00:00:00 2001
From: Robert de Vries <rhdv@xs4all.nl>
Date: Mon, 9 Mar 2020 23:02:23 +0100
Subject: [PATCH 01/10] Fix segfault in csv tokenizer (issue #28071)

---
 pandas/_libs/src/parser/tokenizer.c       |  9 +++++++--
 pandas/tests/io/parser/test_textreader.py | 10 ++++++++++
 2 files changed, 17 insertions(+), 2 deletions(-)

diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c
index 2188ff6b0d464..3dd935316c00b 100644
--- a/pandas/_libs/src/parser/tokenizer.c
+++ b/pandas/_libs/src/parser/tokenizer.c
@@ -1189,8 +1189,13 @@ int parser_consume_rows(parser_t *self, size_t nrows) {
 
     /* cannot guarantee that nrows + 1 has been observed */
     word_deletions = self->line_start[nrows - 1] + self->line_fields[nrows - 1];
-    char_count = (self->word_starts[word_deletions - 1] +
-                  strlen(self->words[word_deletions - 1]) + 1);
+    if (word_deletions >= 1) {
+	char_count = (self->word_starts[word_deletions - 1] +
+		      strlen(self->words[word_deletions - 1]) + 1);
+    }
+    else {
+	char_count = 1;
+    }
 
     TRACE(("parser_consume_rows: Deleting %d words, %d chars\n", word_deletions,
            char_count));
diff --git a/pandas/tests/io/parser/test_textreader.py b/pandas/tests/io/parser/test_textreader.py
index 8d5af85c20d33..4314aa9fbe678 100644
--- a/pandas/tests/io/parser/test_textreader.py
+++ b/pandas/tests/io/parser/test_textreader.py
@@ -341,6 +341,16 @@ def test_empty_csv_input(self):
         df = read_csv(StringIO(), chunksize=20, header=None, names=["a", "b", "c"])
         assert isinstance(df, TextFileReader)
 
+    def test_blank_lines_between_header_and_data_rows(self):
+        ref = DataFrame(
+            [[np.nan, np.nan], [np.nan, np.nan], [1, 2], [np.nan, np.nan], [3, 4]],
+            columns=list("ab"),
+        )
+        csv = "\nheader\n\na,b\n\n\n1,2\n\n3,4"
+        for nrows in range(1, 6):
+            df = read_csv(StringIO(csv), header=3, nrows=nrows, skip_blank_lines=False)
+            tm.assert_frame_equal(df, ref[:nrows])
+
 
 def assert_array_dicts_equal(left, right):
     for k, v in left.items():

From b1305b68d454639a793f674b9993ba6b1765d092 Mon Sep 17 00:00:00 2001
From: Robert de Vries <rhdv@xs4all.nl>
Date: Mon, 9 Mar 2020 23:06:42 +0100
Subject: [PATCH 02/10] Add whatsnew entry

---
 doc/source/whatsnew/v1.1.0.rst | 1 +
 1 file changed, 1 insertion(+)

diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst
index 21e59805fa143..36cfb4a904139 100644
--- a/doc/source/whatsnew/v1.1.0.rst
+++ b/doc/source/whatsnew/v1.1.0.rst
@@ -340,6 +340,7 @@ I/O
 - Bug in :class:`HDFStore` that caused it to set to ``int64`` the dtype of a ``datetime64`` column when reading a DataFrame in Python 3 from fixed format written in Python 2 (:issue:`31750`)
 - Bug in :meth:`read_excel` where a UTF-8 string with a high surrogate would cause a segmentation violation (:issue:`23809`)
 - Bug in :meth:`read_csv` was causing a file descriptor leak on an empty file (:issue:`31488`)
+- Bug in :meth:`read_csv` was causing a segfault when there were blank lines between the header and data rows (:issue:`28071`)
 
 
 Plotting

From 79cd073f9b27ec0c9f12c9c5d0306c07ac0488bf Mon Sep 17 00:00:00 2001
From: Robert de Vries <rhdv@xs4all.nl>
Date: Tue, 10 Mar 2020 19:23:15 +0100
Subject: [PATCH 03/10] Add ticket number to test

---
 pandas/tests/io/parser/test_textreader.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pandas/tests/io/parser/test_textreader.py b/pandas/tests/io/parser/test_textreader.py
index 4314aa9fbe678..dd0b149f67b96 100644
--- a/pandas/tests/io/parser/test_textreader.py
+++ b/pandas/tests/io/parser/test_textreader.py
@@ -342,6 +342,7 @@ def test_empty_csv_input(self):
         assert isinstance(df, TextFileReader)
 
     def test_blank_lines_between_header_and_data_rows(self):
+        # GH 28071
         ref = DataFrame(
             [[np.nan, np.nan], [np.nan, np.nan], [1, 2], [np.nan, np.nan], [3, 4]],
             columns=list("ab"),

From f1aa15440c4699fa59d990241cf52940a77c4cf1 Mon Sep 17 00:00:00 2001
From: Robert de Vries <rhdv@xs4all.nl>
Date: Tue, 10 Mar 2020 19:24:41 +0100
Subject: [PATCH 04/10] Untabify

---
 pandas/_libs/src/parser/tokenizer.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c
index 3dd935316c00b..df41f96389553 100644
--- a/pandas/_libs/src/parser/tokenizer.c
+++ b/pandas/_libs/src/parser/tokenizer.c
@@ -1190,11 +1190,11 @@ int parser_consume_rows(parser_t *self, size_t nrows) {
     /* cannot guarantee that nrows + 1 has been observed */
     word_deletions = self->line_start[nrows - 1] + self->line_fields[nrows - 1];
     if (word_deletions >= 1) {
-	char_count = (self->word_starts[word_deletions - 1] +
-		      strlen(self->words[word_deletions - 1]) + 1);
+        char_count = (self->word_starts[word_deletions - 1] +
+                      strlen(self->words[word_deletions - 1]) + 1);
     }
     else {
-	char_count = 1;
+        char_count = 1;
     }
 
     TRACE(("parser_consume_rows: Deleting %d words, %d chars\n", word_deletions,

From c0c1ac63dfc1b9013434b43232e0bf638e10ce1e Mon Sep 17 00:00:00 2001
From: Robert de Vries <rhdv@xs4all.nl>
Date: Tue, 10 Mar 2020 19:49:44 +0100
Subject: [PATCH 05/10] Make linter happy

---
 pandas/_libs/src/parser/tokenizer.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c
index df41f96389553..8a19c8ff1c90f 100644
--- a/pandas/_libs/src/parser/tokenizer.c
+++ b/pandas/_libs/src/parser/tokenizer.c
@@ -1192,8 +1192,7 @@ int parser_consume_rows(parser_t *self, size_t nrows) {
     if (word_deletions >= 1) {
         char_count = (self->word_starts[word_deletions - 1] +
                       strlen(self->words[word_deletions - 1]) + 1);
-    }
-    else {
+    } else {
         char_count = 1;
     }
 

From 019b3fab5d71ce9c128c7748b164e5ff463d807d Mon Sep 17 00:00:00 2001
From: Robert de Vries <rhdv@xs4all.nl>
Date: Wed, 11 Mar 2020 16:31:11 +0100
Subject: [PATCH 06/10] nrows is now parametrized

---
 pandas/tests/io/parser/test_textreader.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/pandas/tests/io/parser/test_textreader.py b/pandas/tests/io/parser/test_textreader.py
index dd0b149f67b96..548982b57e9f4 100644
--- a/pandas/tests/io/parser/test_textreader.py
+++ b/pandas/tests/io/parser/test_textreader.py
@@ -341,16 +341,16 @@ def test_empty_csv_input(self):
         df = read_csv(StringIO(), chunksize=20, header=None, names=["a", "b", "c"])
         assert isinstance(df, TextFileReader)
 
-    def test_blank_lines_between_header_and_data_rows(self):
+    @pytest.mark.parametrize("nrows", range(1,6))
+    def test_blank_lines_between_header_and_data_rows(self, nrows):
         # GH 28071
         ref = DataFrame(
             [[np.nan, np.nan], [np.nan, np.nan], [1, 2], [np.nan, np.nan], [3, 4]],
             columns=list("ab"),
         )
         csv = "\nheader\n\na,b\n\n\n1,2\n\n3,4"
-        for nrows in range(1, 6):
-            df = read_csv(StringIO(csv), header=3, nrows=nrows, skip_blank_lines=False)
-            tm.assert_frame_equal(df, ref[:nrows])
+        df = read_csv(StringIO(csv), header=3, nrows=nrows, skip_blank_lines=False)
+        tm.assert_frame_equal(df, ref[:nrows])
 
 
 def assert_array_dicts_equal(left, right):

From 4c50e0a5584b938da7cc93217b834094154b6b9c Mon Sep 17 00:00:00 2001
From: Robert de Vries <rhdv@xs4all.nl>
Date: Wed, 11 Mar 2020 16:38:35 +0100
Subject: [PATCH 07/10] Test moved from test_textreader to test_common

---
 pandas/tests/io/parser/test_common.py     | 13 +++++++++++++
 pandas/tests/io/parser/test_textreader.py | 11 -----------
 2 files changed, 13 insertions(+), 11 deletions(-)

diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py
index 33460262a4430..0f3a5be76ae60 100644
--- a/pandas/tests/io/parser/test_common.py
+++ b/pandas/tests/io/parser/test_common.py
@@ -2093,3 +2093,16 @@ def test():
                 parser.read_csv(path)
 
         td.check_file_leaks(test)()
+
+
+@pytest.mark.parametrize("nrows", range(1, 6))
+def test_blank_lines_between_header_and_data_rows(all_parsers, nrows):
+    # GH 28071
+    ref = DataFrame(
+        [[np.nan, np.nan], [np.nan, np.nan], [1, 2], [np.nan, np.nan], [3, 4]],
+        columns=list("ab"),
+    )
+    csv = "\nheader\n\na,b\n\n\n1,2\n\n3,4"
+    parser = all_parsers
+    df = parser.read_csv(StringIO(csv), header=3, nrows=nrows, skip_blank_lines=False)
+    tm.assert_frame_equal(df, ref[:nrows])
diff --git a/pandas/tests/io/parser/test_textreader.py b/pandas/tests/io/parser/test_textreader.py
index 548982b57e9f4..8d5af85c20d33 100644
--- a/pandas/tests/io/parser/test_textreader.py
+++ b/pandas/tests/io/parser/test_textreader.py
@@ -341,17 +341,6 @@ def test_empty_csv_input(self):
         df = read_csv(StringIO(), chunksize=20, header=None, names=["a", "b", "c"])
         assert isinstance(df, TextFileReader)
 
-    @pytest.mark.parametrize("nrows", range(1,6))
-    def test_blank_lines_between_header_and_data_rows(self, nrows):
-        # GH 28071
-        ref = DataFrame(
-            [[np.nan, np.nan], [np.nan, np.nan], [1, 2], [np.nan, np.nan], [3, 4]],
-            columns=list("ab"),
-        )
-        csv = "\nheader\n\na,b\n\n\n1,2\n\n3,4"
-        df = read_csv(StringIO(csv), header=3, nrows=nrows, skip_blank_lines=False)
-        tm.assert_frame_equal(df, ref[:nrows])
-
 
 def assert_array_dicts_equal(left, right):
     for k, v in left.items():

From 0f793ef8e560e534b20cdea95a95604923d8d29a Mon Sep 17 00:00:00 2001
From: Robert de Vries <rhdv@xs4all.nl>
Date: Wed, 11 Mar 2020 18:15:33 +0100
Subject: [PATCH 08/10] Skip the remainder of stream if there are no
 word_deletions

---
 pandas/_libs/src/parser/tokenizer.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c
index 8a19c8ff1c90f..c5c980d89da53 100644
--- a/pandas/_libs/src/parser/tokenizer.c
+++ b/pandas/_libs/src/parser/tokenizer.c
@@ -1193,7 +1193,7 @@ int parser_consume_rows(parser_t *self, size_t nrows) {
         char_count = (self->word_starts[word_deletions - 1] +
                       strlen(self->words[word_deletions - 1]) + 1);
     } else {
-        char_count = 1;
+        char_count = self->stream_len;
     }
 
     TRACE(("parser_consume_rows: Deleting %d words, %d chars\n", word_deletions,

From 3278c79f20e26ed4713953d2879f8d6aad62a120 Mon Sep 17 00:00:00 2001
From: Robert de Vries <rhdv@xs4all.nl>
Date: Sat, 14 Mar 2020 14:08:44 +0100
Subject: [PATCH 09/10] Set char_count to 0 when word_deletions is 0

This looks like the most sensible value as the number of words that is deleted
is 0 therefore the char_count of characters to be skipped is also 0.
---
 pandas/_libs/src/parser/tokenizer.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c
index c5c980d89da53..c77346b2176ce 100644
--- a/pandas/_libs/src/parser/tokenizer.c
+++ b/pandas/_libs/src/parser/tokenizer.c
@@ -1193,7 +1193,7 @@ int parser_consume_rows(parser_t *self, size_t nrows) {
         char_count = (self->word_starts[word_deletions - 1] +
                       strlen(self->words[word_deletions - 1]) + 1);
     } else {
-        char_count = self->stream_len;
+        char_count = 0;
     }
 
     TRACE(("parser_consume_rows: Deleting %d words, %d chars\n", word_deletions,

From c6b64c7a907b0dae2dd5cc93ea810799c046f2b5 Mon Sep 17 00:00:00 2001
From: Robert de Vries <rhdv@xs4all.nl>
Date: Sat, 14 Mar 2020 23:24:10 +0100
Subject: [PATCH 10/10] Add comment to explain the reasoning to set char_count
 to 0

---
 pandas/_libs/src/parser/tokenizer.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c
index c77346b2176ce..7ba1a6cd398c9 100644
--- a/pandas/_libs/src/parser/tokenizer.c
+++ b/pandas/_libs/src/parser/tokenizer.c
@@ -1193,6 +1193,8 @@ int parser_consume_rows(parser_t *self, size_t nrows) {
         char_count = (self->word_starts[word_deletions - 1] +
                       strlen(self->words[word_deletions - 1]) + 1);
     } else {
+        /* if word_deletions == 0 (i.e. this case) then char_count must
+         * be 0 too, as no data needs to be skipped */
         char_count = 0;
     }