Skip to content

Commit 5688d27

Browse files
gfyoung authored and jreback committed
Allow parsing in skipped row for C engine
Changes the behaviour of the C engine parser so that skipped rows are still parsed, which ensures that quoted fields containing newlines or quote characters are skipped in their entirety. Closes #10911. Closes #12775. Author: gfyoung <[email protected]> Closes #12900 from gfyoung/skiprows-newlines-patch and squashes the following commits: 858c673 [gfyoung] Patch handling of quotes in skipped rows
1 parent 33683cc commit 5688d27

File tree

4 files changed

+129
-20
lines changed

4 files changed

+129
-20
lines changed

doc/source/whatsnew/v0.18.1.txt

+1
Original file line numberDiff line numberDiff line change
@@ -296,6 +296,7 @@ Bug Fixes
296296

297297

298298

299+
- Bug in ``read_csv`` with the C engine when specifying ``skiprows`` with newlines in quoted items (:issue:`10911`, :issue:`12775`)
299300

300301

301302

pandas/io/tests/test_parsers.py

+85
Original file line numberDiff line numberDiff line change
@@ -2867,6 +2867,91 @@ def test_read_only_header_no_rows(self):
28672867
df = self.read_csv(StringIO('a,b,c'), index_col=False)
28682868
tm.assert_frame_equal(df, expected)
28692869

2870+
def test_skiprow_with_newline(self):
2871+
# see gh-12775 and gh-10911
2872+
data = """id,text,num_lines
2873+
1,"line 11
2874+
line 12",2
2875+
2,"line 21
2876+
line 22",2
2877+
3,"line 31",1"""
2878+
expected = [[2, 'line 21\nline 22', 2],
2879+
[3, 'line 31', 1]]
2880+
expected = DataFrame(expected, columns=[
2881+
'id', 'text', 'num_lines'])
2882+
df = self.read_csv(StringIO(data), skiprows=[1])
2883+
tm.assert_frame_equal(df, expected)
2884+
2885+
data = ('a,b,c\n~a\n b~,~e\n d~,'
2886+
'~f\n f~\n1,2,~12\n 13\n 14~')
2887+
expected = [['a\n b', 'e\n d', 'f\n f']]
2888+
expected = DataFrame(expected, columns=[
2889+
'a', 'b', 'c'])
2890+
df = self.read_csv(StringIO(data),
2891+
quotechar="~",
2892+
skiprows=[2])
2893+
tm.assert_frame_equal(df, expected)
2894+
2895+
data = ('Text,url\n~example\n '
2896+
'sentence\n one~,url1\n~'
2897+
'example\n sentence\n two~,url2\n~'
2898+
'example\n sentence\n three~,url3')
2899+
expected = [['example\n sentence\n two', 'url2']]
2900+
expected = DataFrame(expected, columns=[
2901+
'Text', 'url'])
2902+
df = self.read_csv(StringIO(data),
2903+
quotechar="~",
2904+
skiprows=[1, 3])
2905+
tm.assert_frame_equal(df, expected)
2906+
2907+
def test_skiprow_with_quote(self):
2908+
# see gh-12775 and gh-10911
2909+
data = """id,text,num_lines
2910+
1,"line '11' line 12",2
2911+
2,"line '21' line 22",2
2912+
3,"line '31' line 32",1"""
2913+
expected = [[2, "line '21' line 22", 2],
2914+
[3, "line '31' line 32", 1]]
2915+
expected = DataFrame(expected, columns=[
2916+
'id', 'text', 'num_lines'])
2917+
df = self.read_csv(StringIO(data), skiprows=[1])
2918+
tm.assert_frame_equal(df, expected)
2919+
2920+
def test_skiprow_with_newline_and_quote(self):
2921+
# see gh-12775 and gh-10911
2922+
data = """id,text,num_lines
2923+
1,"line \n'11' line 12",2
2924+
2,"line \n'21' line 22",2
2925+
3,"line \n'31' line 32",1"""
2926+
expected = [[2, "line \n'21' line 22", 2],
2927+
[3, "line \n'31' line 32", 1]]
2928+
expected = DataFrame(expected, columns=[
2929+
'id', 'text', 'num_lines'])
2930+
df = self.read_csv(StringIO(data), skiprows=[1])
2931+
tm.assert_frame_equal(df, expected)
2932+
2933+
data = """id,text,num_lines
2934+
1,"line '11\n' line 12",2
2935+
2,"line '21\n' line 22",2
2936+
3,"line '31\n' line 32",1"""
2937+
expected = [[2, "line '21\n' line 22", 2],
2938+
[3, "line '31\n' line 32", 1]]
2939+
expected = DataFrame(expected, columns=[
2940+
'id', 'text', 'num_lines'])
2941+
df = self.read_csv(StringIO(data), skiprows=[1])
2942+
tm.assert_frame_equal(df, expected)
2943+
2944+
data = """id,text,num_lines
2945+
1,"line '11\n' \r\tline 12",2
2946+
2,"line '21\n' \r\tline 22",2
2947+
3,"line '31\n' \r\tline 32",1"""
2948+
expected = [[2, "line '21\n' \r\tline 22", 2],
2949+
[3, "line '31\n' \r\tline 32", 1]]
2950+
expected = DataFrame(expected, columns=[
2951+
'id', 'text', 'num_lines'])
2952+
df = self.read_csv(StringIO(data), skiprows=[1])
2953+
tm.assert_frame_equal(df, expected)
2954+
28702955

28712956
class CompressionTests(object):
28722957
def test_zip(self):

pandas/src/parser/tokenizer.c

+41-20
Original file line numberDiff line numberDiff line change
@@ -478,7 +478,10 @@ static int end_line(parser_t *self) {
478478
}
479479
}
480480

481-
if (self->state == SKIP_LINE) {
481+
if (self->state == SKIP_LINE || \
482+
self->state == QUOTE_IN_SKIP_LINE || \
483+
self->state == QUOTE_IN_QUOTE_IN_SKIP_LINE
484+
) {
482485
TRACE(("end_line: Skipping row %d\n", self->file_lines));
483486
// increment file line count
484487
self->file_lines++;
@@ -491,8 +494,6 @@ static int end_line(parser_t *self) {
491494
return 0;
492495
}
493496

494-
/* printf("Line: %d, Fields: %d, Ex-fields: %d\n", self->lines, fields, ex_fields); */
495-
496497
if (!(self->lines <= self->header_end + 1)
497498
&& (self->expected_fields < 0 && fields > ex_fields)
498499
&& !(self->usecols)) {
@@ -505,8 +506,7 @@ static int end_line(parser_t *self) {
505506
// reset field count
506507
self->line_fields[self->lines] = 0;
507508

508-
// file_lines is now the _actual_ file line number (starting at 1)
509-
509+
// file_lines is now the actual file line number (starting at 1)
510510
if (self->error_bad_lines) {
511511
self->error_msg = (char*) malloc(100);
512512
sprintf(self->error_msg, "Expected %d fields in line %d, saw %d\n",
@@ -526,33 +526,26 @@ static int end_line(parser_t *self) {
526526
free(msg);
527527
}
528528
}
529-
}
530-
else {
531-
/* missing trailing delimiters */
529+
} else {
530+
// missing trailing delimiters
532531
if ((self->lines >= self->header_end + 1) && fields < ex_fields) {
533532

534-
/* Might overrun the buffer when closing fields */
533+
// might overrun the buffer when closing fields
535534
if (make_stream_space(self, ex_fields - fields) < 0) {
536535
self->error_msg = "out of memory";
537536
return -1;
538537
}
539538

540539
while (fields < ex_fields){
541540
end_field(self);
542-
/* printf("Prior word: %s\n", self->words[self->words_len - 2]); */
543541
fields++;
544542
}
545543
}
546544

547545
// increment both line counts
548546
self->file_lines++;
549-
550547
self->lines++;
551548

552-
/* coliter_t it; */
553-
/* coliter_setup(&it, self, 5, self->lines - 1); */
554-
/* printf("word at column 5: %s\n", COLITER_NEXT(it)); */
555-
556549
// good line, set new start point
557550
if (self->lines >= self->lines_cap) {
558551
TRACE(("end_line: ERROR!!! self->lines(%zu) >= self->lines_cap(%zu)\n", self->lines, self->lines_cap)) \
@@ -574,8 +567,6 @@ static int end_line(parser_t *self) {
574567
return 0;
575568
}
576569

577-
578-
579570
int parser_add_skiprow(parser_t *self, int64_t row) {
580571
khiter_t k;
581572
kh_int64_t *set;
@@ -763,6 +754,31 @@ int tokenize_bytes(parser_t *self, size_t line_limit)
763754
} else if (IS_CARRIAGE(c)) {
764755
self->file_lines++;
765756
self->state = EAT_CRNL_NOP;
757+
} else if (IS_QUOTE(c)) {
758+
self->state = QUOTE_IN_SKIP_LINE;
759+
}
760+
break;
761+
762+
case QUOTE_IN_SKIP_LINE:
763+
if (IS_QUOTE(c)) {
764+
if (self->doublequote) {
765+
self->state = QUOTE_IN_QUOTE_IN_SKIP_LINE;
766+
} else {
767+
self->state = SKIP_LINE;
768+
}
769+
}
770+
break;
771+
772+
case QUOTE_IN_QUOTE_IN_SKIP_LINE:
773+
if (IS_QUOTE(c)) {
774+
self->state = QUOTE_IN_SKIP_LINE;
775+
} else if (IS_TERMINATOR(c)) {
776+
END_LINE();
777+
} else if (IS_CARRIAGE(c)) {
778+
self->file_lines++;
779+
self->state = EAT_CRNL_NOP;
780+
} else {
781+
self->state = SKIP_LINE;
766782
}
767783
break;
768784

@@ -815,9 +831,14 @@ int tokenize_bytes(parser_t *self, size_t line_limit)
815831
case START_RECORD:
816832
// start of record
817833
if (skip_this_line(self, self->file_lines)) {
818-
self->state = SKIP_LINE;
819-
if (IS_TERMINATOR(c)) {
820-
END_LINE();
834+
if (IS_QUOTE(c)) {
835+
self->state = QUOTE_IN_SKIP_LINE;
836+
} else {
837+
self->state = SKIP_LINE;
838+
839+
if (IS_TERMINATOR(c)) {
840+
END_LINE();
841+
}
821842
}
822843
break;
823844
} else if (IS_TERMINATOR(c)) {

pandas/src/parser/tokenizer.h

+2
Original file line numberDiff line numberDiff line change
@@ -124,6 +124,8 @@ typedef enum {
124124
EAT_LINE_COMMENT,
125125
WHITESPACE_LINE,
126126
SKIP_LINE,
127+
QUOTE_IN_SKIP_LINE,
128+
QUOTE_IN_QUOTE_IN_SKIP_LINE,
127129
FINISHED
128130
} ParserState;
129131

0 commit comments

Comments
 (0)