diff --git a/doc/source/whatsnew/v0.15.2.txt b/doc/source/whatsnew/v0.15.2.txt index 740263bed7970..e86b53ef745d3 100644 --- a/doc/source/whatsnew/v0.15.2.txt +++ b/doc/source/whatsnew/v0.15.2.txt @@ -165,7 +165,7 @@ Bug Fixes of the level names are numbers (:issue:`8584`). - Bug in ``MultiIndex`` where ``__contains__`` returns wrong result if index is not lexically sorted or unique (:issue:`7724`) -- BUG CSV: fix problem with trailing whitespace in skipped rows, (:issue:`8679`), (:issue:`8661`) +- BUG CSV: fix problem with trailing whitespace in skipped rows, (:issue:`8679`), (:issue:`8661`), (:issue:`8983`) - Regression in ``Timestamp`` does not parse 'Z' zone designator for UTC (:issue:`8771`) diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py index 59647b4c781e5..05a5493d0c70c 100644 --- a/pandas/io/tests/test_parsers.py +++ b/pandas/io/tests/test_parsers.py @@ -3049,17 +3049,17 @@ def test_comment_skiprows(self): tm.assert_almost_equal(df.values, expected) def test_trailing_spaces(self): - data = """skip + data = """A B C random line with trailing spaces skip 1,2,3 1,2.,4. random line with trailing tabs\t\t\t -5.,NaN,10.0 +5.1,NaN,10.0 """ expected = pd.DataFrame([[1., 2., 4.], - [5., np.nan, 10.]]) + [5.1, np.nan, 10.]]) # this should ignore six lines including lines with trailing # whitespace and blank lines. issues 8661, 8679 df = self.read_csv(StringIO(data.replace(',', ' ')), @@ -3070,6 +3070,13 @@ def test_trailing_spaces(self): header=None, delim_whitespace=True, skiprows=[0,1,2,3,5,6], skip_blank_lines=True) tm.assert_frame_equal(df, expected) + # test skipping set of rows after a row with trailing spaces, issue #8983 + expected = pd.DataFrame({"A":[1., 5.1], "B":[2., np.nan], + "C":[4., 10]}) + df = self.read_table(StringIO(data.replace(',', ' ')), + delim_whitespace=True, + skiprows=[1,2,3,5,6], skip_blank_lines=True) + tm.assert_frame_equal(df, expected) def test_comment_header(self): data = """# empty diff --git a/pandas/src/parser/tokenizer.c b/pandas/src/parser/tokenizer.c index fc96cc5429775..a64235c7c9732 100644 --- a/pandas/src/parser/tokenizer.c +++ b/pandas/src/parser/tokenizer.c @@ -1324,6 +1324,7 @@ int tokenize_whitespace(parser_t *self, size_t line_limit) if (c == '\n') { END_LINE(); self->state = START_RECORD; + break; } else if (c == '\r') { self->state = EAT_CRNL; break;