Allow parsing in skipped row for C engine

gfyoung · jreback · commit 5688d2771a00 · 2016-04-22T11:18:01.000-04:00
Changes the behaviour of the C engine parser so that skipped rows are still parsed, which allows rows containing quoted newlines to be skipped correctly. Closes #10911. Closes #12775. Author: gfyoung <gfyoung17@gmail.com>. Closes #12900 from gfyoung/skiprows-newlines-patch and squashes the following commits: 858c673 [gfyoung] Patch handling of quotes in skipped rows
diff --git a/doc/source/whatsnew/v0.18.1.txt b/doc/source/whatsnew/v0.18.1.txt
@@ -296,6 +296,7 @@ Bug Fixes
 
 
 
+- Bug in ``read_csv`` with the C engine when specifying ``skiprows`` with newlines in quoted items (:issue:`10911`, :issue:`12775`)
 
 
 
diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py
@@ -2867,6 +2867,91 @@ def test_read_only_header_no_rows(self):
         df = self.read_csv(StringIO('a,b,c'), index_col=False)
         tm.assert_frame_equal(df, expected)
 
+    def test_skiprow_with_newline(self):
+        # see gh-12775 and gh-10911
+        data = """id,text,num_lines
+1,"line 11
+line 12",2
+2,"line 21
+line 22",2
+3,"line 31",1"""
+        expected = [[2, 'line 21\nline 22', 2],
+                    [3, 'line 31', 1]]
+        expected = DataFrame(expected, columns=[
+            'id', 'text', 'num_lines'])
+        df = self.read_csv(StringIO(data), skiprows=[1])
+        tm.assert_frame_equal(df, expected)
+
+        data = ('a,b,c\n~a\n b~,~e\n d~,'
+                '~f\n f~\n1,2,~12\n 13\n 14~')
+        expected = [['a\n b', 'e\n d', 'f\n f']]
+        expected = DataFrame(expected, columns=[
+            'a', 'b', 'c'])
+        df = self.read_csv(StringIO(data),
+                           quotechar="~",
+                           skiprows=[2])
+        tm.assert_frame_equal(df, expected)
+
+        data = ('Text,url\n~example\n '
+                'sentence\n one~,url1\n~'
+                'example\n sentence\n two~,url2\n~'
+                'example\n sentence\n three~,url3')
+        expected = [['example\n sentence\n two', 'url2']]
+        expected = DataFrame(expected, columns=[
+            'Text', 'url'])
+        df = self.read_csv(StringIO(data),
+                           quotechar="~",
+                           skiprows=[1, 3])
+        tm.assert_frame_equal(df, expected)
+
+    def test_skiprow_with_quote(self):
+        # see gh-12775 and gh-10911
+        data = """id,text,num_lines
+1,"line '11' line 12",2
+2,"line '21' line 22",2
+3,"line '31' line 32",1"""
+        expected = [[2, "line '21' line 22", 2],
+                    [3, "line '31' line 32", 1]]
+        expected = DataFrame(expected, columns=[
+            'id', 'text', 'num_lines'])
+        df = self.read_csv(StringIO(data), skiprows=[1])
+        tm.assert_frame_equal(df, expected)
+
+    def test_skiprow_with_newline_and_quote(self):
+        # see gh-12775 and gh-10911
+        data = """id,text,num_lines
+1,"line \n'11' line 12",2
+2,"line \n'21' line 22",2
+3,"line \n'31' line 32",1"""
+        expected = [[2, "line \n'21' line 22", 2],
+                    [3, "line \n'31' line 32", 1]]
+        expected = DataFrame(expected, columns=[
+            'id', 'text', 'num_lines'])
+        df = self.read_csv(StringIO(data), skiprows=[1])
+        tm.assert_frame_equal(df, expected)
+
+        data = """id,text,num_lines
+1,"line '11\n' line 12",2
+2,"line '21\n' line 22",2
+3,"line '31\n' line 32",1"""
+        expected = [[2, "line '21\n' line 22", 2],
+                    [3, "line '31\n' line 32", 1]]
+        expected = DataFrame(expected, columns=[
+            'id', 'text', 'num_lines'])
+        df = self.read_csv(StringIO(data), skiprows=[1])
+        tm.assert_frame_equal(df, expected)
+
+        data = """id,text,num_lines
+1,"line '11\n' \r\tline 12",2
+2,"line '21\n' \r\tline 22",2
+3,"line '31\n' \r\tline 32",1"""
+        expected = [[2, "line '21\n' \r\tline 22", 2],
+                    [3, "line '31\n' \r\tline 32", 1]]
+        expected = DataFrame(expected, columns=[
+            'id', 'text', 'num_lines'])
+        df = self.read_csv(StringIO(data), skiprows=[1])
+        tm.assert_frame_equal(df, expected)
+
 
 class CompressionTests(object):
     def test_zip(self):
diff --git a/pandas/src/parser/tokenizer.c b/pandas/src/parser/tokenizer.c
@@ -478,7 +478,10 @@ static int end_line(parser_t *self) {
         }
     }
 
-    if (self->state == SKIP_LINE) {
+    if (self->state == SKIP_LINE || \
+        self->state == QUOTE_IN_SKIP_LINE || \
+        self->state == QUOTE_IN_QUOTE_IN_SKIP_LINE
+    ) {
         TRACE(("end_line: Skipping row %d\n", self->file_lines));
         // increment file line count
         self->file_lines++;
@@ -491,8 +494,6 @@ static int end_line(parser_t *self) {
         return 0;
     }
 
-    /* printf("Line: %d, Fields: %d, Ex-fields: %d\n", self->lines, fields, ex_fields); */
-
     if (!(self->lines <= self->header_end + 1)
         && (self->expected_fields < 0 && fields > ex_fields)
         && !(self->usecols)) {
@@ -505,8 +506,7 @@ static int end_line(parser_t *self) {
         // reset field count
         self->line_fields[self->lines] = 0;
 
-        // file_lines is now the _actual_ file line number (starting at 1)
-
+        // file_lines is now the actual file line number (starting at 1)
         if (self->error_bad_lines) {
             self->error_msg = (char*) malloc(100);
             sprintf(self->error_msg, "Expected %d fields in line %d, saw %d\n",
@@ -526,33 +526,26 @@ static int end_line(parser_t *self) {
                 free(msg);
             }
         }
-    }
-    else {
-        /* missing trailing delimiters */
+    } else {
+        // missing trailing delimiters
         if ((self->lines >= self->header_end + 1) && fields < ex_fields) {
 
-            /* Might overrun the buffer when closing fields */
+            // might overrun the buffer when closing fields
             if (make_stream_space(self, ex_fields - fields) < 0) {
                 self->error_msg = "out of memory";
                 return -1;
             }
 
             while (fields < ex_fields){
                 end_field(self);
-                /* printf("Prior word: %s\n", self->words[self->words_len - 2]); */
                 fields++;
             }
         }
 
         // increment both line counts
         self->file_lines++;
-
         self->lines++;
 
-        /* coliter_t it; */
-        /* coliter_setup(&it, self, 5, self->lines - 1); */
-        /* printf("word at column 5: %s\n", COLITER_NEXT(it)); */
-
         // good line, set new start point
         if (self->lines >= self->lines_cap) {
             TRACE(("end_line: ERROR!!! self->lines(%zu) >= self->lines_cap(%zu)\n", self->lines, self->lines_cap))  \
@@ -574,8 +567,6 @@ static int end_line(parser_t *self) {
     return 0;
 }
 
-
-
 int parser_add_skiprow(parser_t *self, int64_t row) {
     khiter_t k;
     kh_int64_t *set;
@@ -763,6 +754,31 @@ int tokenize_bytes(parser_t *self, size_t line_limit)
             } else if (IS_CARRIAGE(c)) {
                 self->file_lines++;
                 self->state = EAT_CRNL_NOP;
+            } else if (IS_QUOTE(c)) {
+                self->state = QUOTE_IN_SKIP_LINE;
+            }
+            break;
+
+        case QUOTE_IN_SKIP_LINE:
+            if (IS_QUOTE(c)) {
+                if (self->doublequote) {
+                    self->state = QUOTE_IN_QUOTE_IN_SKIP_LINE;
+                } else {
+                    self->state = SKIP_LINE;
+                }
+            }
+            break;
+
+        case QUOTE_IN_QUOTE_IN_SKIP_LINE:
+            if (IS_QUOTE(c)) {
+                self->state = QUOTE_IN_SKIP_LINE;
+            } else if (IS_TERMINATOR(c)) {
+                END_LINE();
+            } else if (IS_CARRIAGE(c)) {
+                self->file_lines++;
+                self->state = EAT_CRNL_NOP;
+            } else {
+                self->state = SKIP_LINE;
             }
             break;
 
@@ -815,9 +831,14 @@ int tokenize_bytes(parser_t *self, size_t line_limit)
         case START_RECORD:
             // start of record
             if (skip_this_line(self, self->file_lines)) {
-                self->state = SKIP_LINE;
-                if (IS_TERMINATOR(c)) {
-                    END_LINE();
+                if (IS_QUOTE(c)) {
+                    self->state = QUOTE_IN_SKIP_LINE;
+                } else {
+                    self->state = SKIP_LINE;
+
+                    if (IS_TERMINATOR(c)) {
+                        END_LINE();
+                    }
                 }
                 break;
             } else if (IS_TERMINATOR(c)) {
diff --git a/pandas/src/parser/tokenizer.h b/pandas/src/parser/tokenizer.h
@@ -124,6 +124,8 @@ typedef enum {
     EAT_LINE_COMMENT,
     WHITESPACE_LINE,
     SKIP_LINE,
+    QUOTE_IN_SKIP_LINE,
+    QUOTE_IN_QUOTE_IN_SKIP_LINE,
     FINISHED
 } ParserState;
 

Original file line number	Diff line number	Diff line change
`@@ -296,6 +296,7 @@ Bug Fixes`
`296`	`296`
`297`	`297`
`298`	`298`
	`299`	+- Bug in ``read_csv`` with the C engine when specifying ``skiprows`` with newlines in quoted items (:issue:`10911`, :issue:`12775`)
`299`	`300`
`300`	`301`
`301`	`302`