diff --git a/doc/source/whatsnew/v0.18.1.txt b/doc/source/whatsnew/v0.18.1.txt index 928fefd6ce17e..9251c7144fdf1 100644 --- a/doc/source/whatsnew/v0.18.1.txt +++ b/doc/source/whatsnew/v0.18.1.txt @@ -295,6 +295,7 @@ Bug Fixes +- Bug in ``read_csv`` with the C engine when specifying ``skiprows`` with newlines in quoted items (:issue:`10911`, :issue:`12775`) diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py index 1fab316d80ae6..4b705ae54385b 100755 --- a/pandas/io/tests/test_parsers.py +++ b/pandas/io/tests/test_parsers.py @@ -2867,6 +2867,89 @@ def test_read_only_header_no_rows(self): df = self.read_csv(StringIO('a,b,c'), index_col=False) tm.assert_frame_equal(df, expected) + def test_skiprow_with_newline(self): + # see gh-12775 and gh-10911 + data = """id,text,num_lines +1,"line 11 +line 12",2 +2,"line 21 +line 22",2 +3,"line 31",1""" + expected = [[2, 'line 21\nline 22', 2], + [3, 'line 31', 1]] + expected = DataFrame(expected, columns=[ + 'id', 'text', 'num_lines']) + df = self.read_csv(StringIO(data), skiprows=[1]) + tm.assert_frame_equal(df, expected) + + data = ('a,b,c\n~a\n b~,~e\n d~,' + '~f\n f~\n1,2,~12\n 13\n 14~') + expected = [['a\n b', 'e\n d', 'f\n f']] + expected = DataFrame(expected, columns=[ + 'a', 'b', 'c']) + df = self.read_csv(StringIO(data), + quotechar="~", + skiprows=[2]) + tm.assert_frame_equal(df, expected) + + data = ('Text,url\n~example\n ' + 'sentence\n one~,url1\n~' + 'example\n sentence\n two~,url2\n~' + 'example\n sentence\n three~,url3') + expected = [['example\n sentence\n two', 'url2']] + expected = DataFrame(expected, columns=[ + 'Text', 'url']) + df = self.read_csv(StringIO(data), + quotechar="~", + skiprows=[1, 3]) + tm.assert_frame_equal(df, expected) + + def test_skiprow_with_quote(self): + data = """id,text,num_lines +1,"line '11' line 12",2 +2,"line '21' line 22",2 +3,"line '31' line 32",1""" + expected = [[2, "line '21' line 22", 2], + [3, "line '31' line 32", 1]] + expected = DataFrame(expected, 
columns=[ + 'id', 'text', 'num_lines']) + df = self.read_csv(StringIO(data), skiprows=[1]) + tm.assert_frame_equal(df, expected) + + def test_skiprow_with_newline_and_quote(self): + data = """id,text,num_lines +1,"line \n'11' line 12",2 +2,"line \n'21' line 22",2 +3,"line \n'31' line 32",1""" + expected = [[2, "line \n'21' line 22", 2], + [3, "line \n'31' line 32", 1]] + expected = DataFrame(expected, columns=[ + 'id', 'text', 'num_lines']) + df = self.read_csv(StringIO(data), skiprows=[1]) + tm.assert_frame_equal(df, expected) + + data = """id,text,num_lines +1,"line '11\n' line 12",2 +2,"line '21\n' line 22",2 +3,"line '31\n' line 32",1""" + expected = [[2, "line '21\n' line 22", 2], + [3, "line '31\n' line 32", 1]] + expected = DataFrame(expected, columns=[ + 'id', 'text', 'num_lines']) + df = self.read_csv(StringIO(data), skiprows=[1]) + tm.assert_frame_equal(df, expected) + + data = """id,text,num_lines +1,"line '11\n' \r\tline 12",2 +2,"line '21\n' \r\tline 22",2 +3,"line '31\n' \r\tline 32",1""" + expected = [[2, "line '21\n' \r\tline 22", 2], + [3, "line '31\n' \r\tline 32", 1]] + expected = DataFrame(expected, columns=[ + 'id', 'text', 'num_lines']) + df = self.read_csv(StringIO(data), skiprows=[1]) + tm.assert_frame_equal(df, expected) + class CompressionTests(object): def test_zip(self): diff --git a/pandas/src/parser/tokenizer.c b/pandas/src/parser/tokenizer.c index 013c47cd09a9b..6091c79e2b4fc 100644 --- a/pandas/src/parser/tokenizer.c +++ b/pandas/src/parser/tokenizer.c @@ -478,7 +478,10 @@ static int end_line(parser_t *self) { } } - if (self->state == SKIP_LINE) { + if (self->state == SKIP_LINE || \ + self->state == QUOTE_IN_SKIP_LINE || \ + self->state == QUOTE_IN_QUOTE_IN_SKIP_LINE + ) { TRACE(("end_line: Skipping row %d\n", self->file_lines)); // increment file line count self->file_lines++; @@ -491,8 +494,6 @@ static int end_line(parser_t *self) { return 0; } - /* printf("Line: %d, Fields: %d, Ex-fields: %d\n", self->lines, fields, ex_fields); */ 
- if (!(self->lines <= self->header_end + 1) && (self->expected_fields < 0 && fields > ex_fields) && !(self->usecols)) { @@ -505,8 +506,7 @@ static int end_line(parser_t *self) { // reset field count self->line_fields[self->lines] = 0; - // file_lines is now the _actual_ file line number (starting at 1) - + // file_lines is now the actual file line number (starting at 1) if (self->error_bad_lines) { self->error_msg = (char*) malloc(100); sprintf(self->error_msg, "Expected %d fields in line %d, saw %d\n", @@ -526,12 +526,11 @@ static int end_line(parser_t *self) { free(msg); } } - } - else { - /* missing trailing delimiters */ + } else { + // missing trailing delimiters if ((self->lines >= self->header_end + 1) && fields < ex_fields) { - /* Might overrun the buffer when closing fields */ + // might overrun the buffer when closing fields if (make_stream_space(self, ex_fields - fields) < 0) { self->error_msg = "out of memory"; return -1; @@ -539,20 +538,14 @@ static int end_line(parser_t *self) { while (fields < ex_fields){ end_field(self); - /* printf("Prior word: %s\n", self->words[self->words_len - 2]); */ fields++; } } // increment both line counts self->file_lines++; - self->lines++; - /* coliter_t it; */ - /* coliter_setup(&it, self, 5, self->lines - 1); */ - /* printf("word at column 5: %s\n", COLITER_NEXT(it)); */ - // good line, set new start point if (self->lines >= self->lines_cap) { TRACE(("end_line: ERROR!!! 
self->lines(%zu) >= self->lines_cap(%zu)\n", self->lines, self->lines_cap)) \ @@ -574,8 +567,6 @@ static int end_line(parser_t *self) { return 0; } - - int parser_add_skiprow(parser_t *self, int64_t row) { khiter_t k; kh_int64_t *set; @@ -763,6 +754,31 @@ int tokenize_bytes(parser_t *self, size_t line_limit) } else if (IS_CARRIAGE(c)) { self->file_lines++; self->state = EAT_CRNL_NOP; + } else if (IS_QUOTE(c)) { + self->state = QUOTE_IN_SKIP_LINE; + } + break; + + case QUOTE_IN_SKIP_LINE: + if (IS_QUOTE(c)) { + if (self->doublequote) { + self->state = QUOTE_IN_QUOTE_IN_SKIP_LINE; + } else { + self->state = SKIP_LINE; + } + } + break; + + case QUOTE_IN_QUOTE_IN_SKIP_LINE: + if (IS_QUOTE(c)) { + self->state = QUOTE_IN_SKIP_LINE; + } else if (IS_TERMINATOR(c)) { + END_LINE(); + } else if (IS_CARRIAGE(c)) { + self->file_lines++; + self->state = EAT_CRNL_NOP; + } else { + self->state = SKIP_LINE; } break; @@ -815,9 +831,14 @@ int tokenize_bytes(parser_t *self, size_t line_limit) case START_RECORD: // start of record if (skip_this_line(self, self->file_lines)) { - self->state = SKIP_LINE; - if (IS_TERMINATOR(c)) { - END_LINE(); + if (IS_QUOTE(c)) { + self->state = QUOTE_IN_SKIP_LINE; + } else { + self->state = SKIP_LINE; + + if (IS_TERMINATOR(c)) { + END_LINE(); + } } break; } else if (IS_TERMINATOR(c)) { diff --git a/pandas/src/parser/tokenizer.h b/pandas/src/parser/tokenizer.h index 2d1b7fae58714..8f7ae436bb7b7 100644 --- a/pandas/src/parser/tokenizer.h +++ b/pandas/src/parser/tokenizer.h @@ -124,6 +124,8 @@ typedef enum { EAT_LINE_COMMENT, WHITESPACE_LINE, SKIP_LINE, + QUOTE_IN_SKIP_LINE, + QUOTE_IN_QUOTE_IN_SKIP_LINE, FINISHED } ParserState;