diff --git a/doc/source/whatsnew/v0.16.0.txt b/doc/source/whatsnew/v0.16.0.txt index 8720774b821a2..6176555dea27e 100644 --- a/doc/source/whatsnew/v0.16.0.txt +++ b/doc/source/whatsnew/v0.16.0.txt @@ -51,3 +51,4 @@ Bug Fixes - Fixed compatibility issue in ``DatetimeIndex`` affecting architectures where ``numpy.int_`` defaults to ``numpy.int32`` (:issue:`8943`) - Bug in ``MultiIndex.has_duplicates`` when having many levels causes an indexer overflow (:issue:`9075`) +- Bug in ``read_csv`` when using ``skiprows`` on a file with CR line endings with the c engine (:issue:`9079`) diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py index 2f211ab0381a2..d805727394f33 100644 --- a/pandas/io/tests/test_parsers.py +++ b/pandas/io/tests/test_parsers.py @@ -3048,6 +3048,30 @@ def test_comment_skiprows(self): df = self.read_csv(StringIO(data), comment='#', skiprows=4) tm.assert_almost_equal(df.values, expected) + def test_skiprows_lineterminator(self): + #GH #9079 + data = '\n'.join(['SMOSMANIA ThetaProbe-ML2X ', + '2007/01/01 01:00 0.2140 U M ', + '2007/01/01 02:00 0.2141 M O ', + '2007/01/01 04:00 0.2142 D M ']) + expected = pd.DataFrame([['2007/01/01', '01:00', 0.2140, 'U', 'M'], + ['2007/01/01', '02:00', 0.2141, 'M', 'O'], + ['2007/01/01', '04:00', 0.2142, 'D', 'M']], + columns=['date', 'time', 'var', 'flag', + 'oflag']) + # test with the three default lineterminators LF, CR and CRLF + df = self.read_csv(StringIO(data), skiprows=1, delim_whitespace=True, + names=['date', 'time', 'var', 'flag', 'oflag']) + tm.assert_frame_equal(df, expected) + df = self.read_csv(StringIO(data.replace('\n', '\r')), + skiprows=1, delim_whitespace=True, + names=['date', 'time', 'var', 'flag', 'oflag']) + tm.assert_frame_equal(df, expected) + df = self.read_csv(StringIO(data.replace('\n', '\r\n')), + skiprows=1, delim_whitespace=True, + names=['date', 'time', 'var', 'flag', 'oflag']) + tm.assert_frame_equal(df, expected) + def test_trailing_spaces(self): data = "A B C \nrandom 
line with trailing spaces \nskip\n1,2,3\n1,2.,4.\nrandom line with trailing tabs\t\t\t\n \n5.1,NaN,10.0\n" expected = pd.DataFrame([[1., 2., 4.], diff --git a/pandas/src/parser/tokenizer.c b/pandas/src/parser/tokenizer.c index a64235c7c9732..f56945db87326 100644 --- a/pandas/src/parser/tokenizer.c +++ b/pandas/src/parser/tokenizer.c @@ -707,6 +707,9 @@ int tokenize_delimited(parser_t *self, size_t line_limit) // TRACE(("tokenize_delimited SKIP_LINE %c, state %d\n", c, self->state)); if (c == '\n') { END_LINE(); + } else if (c == '\r') { + self->file_lines++; + self->state = EAT_CRNL_NOP; } break; @@ -1304,6 +1307,9 @@ int tokenize_whitespace(parser_t *self, size_t line_limit) // TRACE(("tokenize_whitespace SKIP_LINE %c, state %d\n", c, self->state)); if (c == '\n') { END_LINE(); + } else if (c == '\r') { + self->file_lines++; + self->state = EAT_CRNL_NOP; } break;