From 6da5e2a883b11045fd0249c5434a127298774dff Mon Sep 17 00:00:00 2001 From: Robert de Vries Date: Mon, 9 Mar 2020 23:02:23 +0100 Subject: [PATCH 01/10] Fix segfault in csv tokenizer (issue #28071) --- pandas/_libs/src/parser/tokenizer.c | 9 +++++++-- pandas/tests/io/parser/test_textreader.py | 10 ++++++++++ 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index 2188ff6b0d464..3dd935316c00b 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -1189,8 +1189,13 @@ int parser_consume_rows(parser_t *self, size_t nrows) { /* cannot guarantee that nrows + 1 has been observed */ word_deletions = self->line_start[nrows - 1] + self->line_fields[nrows - 1]; - char_count = (self->word_starts[word_deletions - 1] + - strlen(self->words[word_deletions - 1]) + 1); + if (word_deletions >= 1) { + char_count = (self->word_starts[word_deletions - 1] + + strlen(self->words[word_deletions - 1]) + 1); + } + else { + char_count = 1; + } TRACE(("parser_consume_rows: Deleting %d words, %d chars\n", word_deletions, char_count)); diff --git a/pandas/tests/io/parser/test_textreader.py b/pandas/tests/io/parser/test_textreader.py index 8d5af85c20d33..4314aa9fbe678 100644 --- a/pandas/tests/io/parser/test_textreader.py +++ b/pandas/tests/io/parser/test_textreader.py @@ -341,6 +341,16 @@ def test_empty_csv_input(self): df = read_csv(StringIO(), chunksize=20, header=None, names=["a", "b", "c"]) assert isinstance(df, TextFileReader) + def test_blank_lines_between_header_and_data_rows(self): + ref = DataFrame( + [[np.nan, np.nan], [np.nan, np.nan], [1, 2], [np.nan, np.nan], [3, 4]], + columns=list("ab"), + ) + csv = "\nheader\n\na,b\n\n\n1,2\n\n3,4" + for nrows in range(1, 6): + df = read_csv(StringIO(csv), header=3, nrows=nrows, skip_blank_lines=False) + tm.assert_frame_equal(df, ref[:nrows]) + def assert_array_dicts_equal(left, right): for k, v in left.items(): From b1305b68d454639a793f674b9993ba6b1765d092 Mon Sep 17 00:00:00 2001 From: Robert de Vries Date: Mon, 9 Mar 2020 23:06:42 +0100 Subject: [PATCH 02/10] Add whatsnew entry --- doc/source/whatsnew/v1.1.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 21e59805fa143..36cfb4a904139 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -340,6 +340,7 @@ I/O - Bug in :class:`HDFStore` that caused it to set to ``int64`` the dtype of a ``datetime64`` column when reading a DataFrame in Python 3 from fixed format written in Python 2 (:issue:`31750`) - Bug in :meth:`read_excel` where a UTF-8 string with a high surrogate would cause a segmentation violation (:issue:`23809`) - Bug in :meth:`read_csv` was causing a file descriptor leak on an empty file (:issue:`31488`) +- Bug in :meth:`read_csv` was causing a segfault when there were blank lines between the header and data rows (:issue:`28071`) Plotting From 79cd073f9b27ec0c9f12c9c5d0306c07ac0488bf Mon Sep 17 00:00:00 2001 From: Robert de Vries Date: Tue, 10 Mar 2020 19:23:15 +0100 Subject: [PATCH 03/10] Add ticket number to test --- pandas/tests/io/parser/test_textreader.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/io/parser/test_textreader.py b/pandas/tests/io/parser/test_textreader.py index 4314aa9fbe678..dd0b149f67b96 100644 --- a/pandas/tests/io/parser/test_textreader.py +++ b/pandas/tests/io/parser/test_textreader.py @@ -342,6 +342,7 @@ def test_empty_csv_input(self): assert isinstance(df, TextFileReader) def test_blank_lines_between_header_and_data_rows(self): + # GH 28071 ref = DataFrame( [[np.nan, np.nan], [np.nan, np.nan], [1, 2], [np.nan, np.nan], [3, 4]], columns=list("ab"), From f1aa15440c4699fa59d990241cf52940a77c4cf1 Mon Sep 17 00:00:00 2001 From: Robert de Vries Date: Tue, 10 Mar 2020 19:24:41 +0100 Subject: [PATCH 04/10] Untabify --- pandas/_libs/src/parser/tokenizer.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index 3dd935316c00b..df41f96389553 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -1190,11 +1190,11 @@ int parser_consume_rows(parser_t *self, size_t nrows) { /* cannot guarantee that nrows + 1 has been observed */ word_deletions = self->line_start[nrows - 1] + self->line_fields[nrows - 1]; if (word_deletions >= 1) { - char_count = (self->word_starts[word_deletions - 1] + - strlen(self->words[word_deletions - 1]) + 1); + char_count = (self->word_starts[word_deletions - 1] + + strlen(self->words[word_deletions - 1]) + 1); } else { - char_count = 1; + char_count = 1; } TRACE(("parser_consume_rows: Deleting %d words, %d chars\n", word_deletions, From c0c1ac63dfc1b9013434b43232e0bf638e10ce1e Mon Sep 17 00:00:00 2001 From: Robert de Vries Date: Tue, 10 Mar 2020 19:49:44 +0100 Subject: [PATCH 05/10] Make linter happy --- pandas/_libs/src/parser/tokenizer.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index df41f96389553..8a19c8ff1c90f 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -1192,8 +1192,7 @@ int parser_consume_rows(parser_t *self, size_t nrows) { if (word_deletions >= 1) { char_count = (self->word_starts[word_deletions - 1] + strlen(self->words[word_deletions - 1]) + 1); - } - else { + } else { char_count = 1; } From 019b3fab5d71ce9c128c7748b164e5ff463d807d Mon Sep 17 00:00:00 2001 From: Robert de Vries Date: Wed, 11 Mar 2020 16:31:11 +0100 Subject: [PATCH 06/10] nrows is now parametrized --- pandas/tests/io/parser/test_textreader.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/tests/io/parser/test_textreader.py b/pandas/tests/io/parser/test_textreader.py index dd0b149f67b96..548982b57e9f4 100644 --- a/pandas/tests/io/parser/test_textreader.py +++ b/pandas/tests/io/parser/test_textreader.py @@ -341,16 +341,16 @@ def test_empty_csv_input(self): df = read_csv(StringIO(), chunksize=20, header=None, names=["a", "b", "c"]) assert isinstance(df, TextFileReader) - def test_blank_lines_between_header_and_data_rows(self): + @pytest.mark.parametrize("nrows", range(1,6)) + def test_blank_lines_between_header_and_data_rows(self, nrows): # GH 28071 ref = DataFrame( [[np.nan, np.nan], [np.nan, np.nan], [1, 2], [np.nan, np.nan], [3, 4]], columns=list("ab"), ) csv = "\nheader\n\na,b\n\n\n1,2\n\n3,4" - for nrows in range(1, 6): - df = read_csv(StringIO(csv), header=3, nrows=nrows, skip_blank_lines=False) - tm.assert_frame_equal(df, ref[:nrows]) + df = read_csv(StringIO(csv), header=3, nrows=nrows, skip_blank_lines=False) + tm.assert_frame_equal(df, ref[:nrows]) def assert_array_dicts_equal(left, right): From 4c50e0a5584b938da7cc93217b834094154b6b9c Mon Sep 17 00:00:00 2001 From: Robert de Vries Date: Wed, 11 Mar 2020 16:38:35 +0100 Subject: [PATCH 07/10] Test moved from test_textreader to test_common --- pandas/tests/io/parser/test_common.py | 13 +++++++++++++ pandas/tests/io/parser/test_textreader.py | 11 ----------- 2 files changed, 13 insertions(+), 11 deletions(-) diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py index 33460262a4430..0f3a5be76ae60 100644 --- a/pandas/tests/io/parser/test_common.py +++ b/pandas/tests/io/parser/test_common.py @@ -2093,3 +2093,16 @@ def test(): parser.read_csv(path) td.check_file_leaks(test)() + + +@pytest.mark.parametrize("nrows", range(1, 6)) +def test_blank_lines_between_header_and_data_rows(all_parsers, nrows): + # GH 28071 + ref = DataFrame( + [[np.nan, np.nan], [np.nan, np.nan], [1, 2], [np.nan, np.nan], [3, 4]], + columns=list("ab"), + ) + csv = "\nheader\n\na,b\n\n\n1,2\n\n3,4" + parser = all_parsers + df = parser.read_csv(StringIO(csv), header=3, nrows=nrows, skip_blank_lines=False) + tm.assert_frame_equal(df, ref[:nrows]) diff --git a/pandas/tests/io/parser/test_textreader.py b/pandas/tests/io/parser/test_textreader.py index 548982b57e9f4..8d5af85c20d33 100644 --- a/pandas/tests/io/parser/test_textreader.py +++ b/pandas/tests/io/parser/test_textreader.py @@ -341,17 +341,6 @@ def test_empty_csv_input(self): df = read_csv(StringIO(), chunksize=20, header=None, names=["a", "b", "c"]) assert isinstance(df, TextFileReader) - @pytest.mark.parametrize("nrows", range(1,6)) - def test_blank_lines_between_header_and_data_rows(self, nrows): - # GH 28071 - ref = DataFrame( - [[np.nan, np.nan], [np.nan, np.nan], [1, 2], [np.nan, np.nan], [3, 4]], - columns=list("ab"), - ) - csv = "\nheader\n\na,b\n\n\n1,2\n\n3,4" - df = read_csv(StringIO(csv), header=3, nrows=nrows, skip_blank_lines=False) - tm.assert_frame_equal(df, ref[:nrows]) - def assert_array_dicts_equal(left, right): for k, v in left.items(): From 0f793ef8e560e534b20cdea95a95604923d8d29a Mon Sep 17 00:00:00 2001 From: Robert de Vries Date: Wed, 11 Mar 2020 18:15:33 +0100 Subject: [PATCH 08/10] Skip the remainder of stream if there are no word_deletions --- pandas/_libs/src/parser/tokenizer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index 8a19c8ff1c90f..c5c980d89da53 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -1193,7 +1193,7 @@ int parser_consume_rows(parser_t *self, size_t nrows) { char_count = (self->word_starts[word_deletions - 1] + strlen(self->words[word_deletions - 1]) + 1); } else { - char_count = 1; + char_count = self->stream_len; } TRACE(("parser_consume_rows: Deleting %d words, %d chars\n", word_deletions, From 3278c79f20e26ed4713953d2879f8d6aad62a120 Mon Sep 17 00:00:00 2001 From: Robert de Vries Date: Sat, 14 Mar 2020 14:08:44 +0100 Subject: [PATCH 09/10] Set char_count to 0 when word_deletions is 0 This looks like the most sensible value as the number of words that is deleted is 0 therefore the char_count of characters to be skipped is also 0. --- pandas/_libs/src/parser/tokenizer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index c5c980d89da53..c77346b2176ce 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -1193,7 +1193,7 @@ int parser_consume_rows(parser_t *self, size_t nrows) { char_count = (self->word_starts[word_deletions - 1] + strlen(self->words[word_deletions - 1]) + 1); } else { - char_count = self->stream_len; + char_count = 0; } TRACE(("parser_consume_rows: Deleting %d words, %d chars\n", word_deletions, From c6b64c7a907b0dae2dd5cc93ea810799c046f2b5 Mon Sep 17 00:00:00 2001 From: Robert de Vries Date: Sat, 14 Mar 2020 23:24:10 +0100 Subject: [PATCH 10/10] Add comment to explain the reasoning to set char_count to 0 --- pandas/_libs/src/parser/tokenizer.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index c77346b2176ce..7ba1a6cd398c9 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -1193,6 +1193,8 @@ int parser_consume_rows(parser_t *self, size_t nrows) { char_count = (self->word_starts[word_deletions - 1] + strlen(self->words[word_deletions - 1]) + 1); } else { + /* if word_deletions == 0 (i.e. this case) then char_count must + * be 0 too, as no data needs to be skipped */ char_count = 0; }