Skip to content

Commit 5a0556e

Browse files
committed
BUG: Don't parse inline quotes in skipped lines
Closes gh-14459.
1 parent 6ac759d commit 5a0556e

File tree

4 files changed

+44
-17
lines changed

4 files changed

+44
-17
lines changed

doc/source/whatsnew/v0.19.1.txt

+1
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@ Bug Fixes
3636
- Compat with Cython 0.25 for building (:issue:`14496`)
3737

3838

39+
- Bug in ``pd.read_csv`` for the C engine in which quotation marks were improperly parsed in skipped rows (:issue:`14459`)
3940
- Bug in ``pd.read_csv`` for Python 2.x in which Unicode quote characters were no longer being respected (:issue:`14477`)
4041
- Bug in localizing an ambiguous timezone when a boolean is passed (:issue:`14402`)
4142
- Bug in ``TimedeltaIndex`` addition with a Datetime-like object where addition overflow in the negative direction was not being caught (:issue:`14068`, :issue:`14453`)

pandas/io/tests/parser/skiprows.py

+8
Original file line numberDiff line numberDiff line change
@@ -190,3 +190,11 @@ def test_skiprows_lineterminator(self):
190190
skiprows=1, delim_whitespace=True,
191191
names=['date', 'time', 'var', 'flag', 'oflag'])
192192
tm.assert_frame_equal(df, expected)
193+
194+
def test_skiprows_infield_quote(self):
195+
# see gh-14459
196+
data = 'a"\nb"\na\n1'
197+
expected = DataFrame({'a': [1]})
198+
199+
df = self.read_csv(StringIO(data), skiprows=2)
200+
tm.assert_frame_equal(df, expected)

pandas/src/parser/tokenizer.c

+31-14
Original file line numberDiff line numberDiff line change
@@ -478,9 +478,10 @@ static int end_line(parser_t *self) {
478478
}
479479
}
480480

481-
if (self->state == SKIP_LINE || \
482-
self->state == QUOTE_IN_SKIP_LINE || \
483-
self->state == QUOTE_IN_QUOTE_IN_SKIP_LINE
481+
if (self->state == START_FIELD_IN_SKIP_LINE || \
482+
self->state == IN_FIELD_IN_SKIP_LINE || \
483+
self->state == IN_QUOTED_FIELD_IN_SKIP_LINE || \
484+
self->state == QUOTE_IN_QUOTED_FIELD_IN_SKIP_LINE
484485
) {
485486
TRACE(("end_line: Skipping row %d\n", self->file_lines));
486487
// increment file line count
@@ -761,38 +762,54 @@ int tokenize_bytes(parser_t *self, size_t line_limit)
761762

762763
switch(self->state) {
763764

764-
case SKIP_LINE:
765-
TRACE(("tokenize_bytes SKIP_LINE 0x%x, state %d\n", c, self->state));
765+
case START_FIELD_IN_SKIP_LINE:
766766
if (IS_TERMINATOR(c)) {
767767
END_LINE();
768768
} else if (IS_CARRIAGE(c)) {
769769
self->file_lines++;
770770
self->state = EAT_CRNL_NOP;
771771
} else if (IS_QUOTE(c)) {
772-
self->state = QUOTE_IN_SKIP_LINE;
772+
self->state = IN_QUOTED_FIELD_IN_SKIP_LINE;
773+
} else if (IS_DELIMITER(c)) {
774+
// Do nothing, we're starting a new field again.
775+
} else {
776+
self->state = IN_FIELD_IN_SKIP_LINE;
777+
}
778+
break;
779+
780+
case IN_FIELD_IN_SKIP_LINE:
781+
if (IS_TERMINATOR(c)) {
782+
END_LINE();
783+
} else if (IS_CARRIAGE(c)) {
784+
self->file_lines++;
785+
self->state = EAT_CRNL_NOP;
786+
} else if (IS_DELIMITER(c)) {
787+
self->state = START_FIELD_IN_SKIP_LINE;
773788
}
774789
break;
775790

776-
case QUOTE_IN_SKIP_LINE:
791+
case IN_QUOTED_FIELD_IN_SKIP_LINE:
777792
if (IS_QUOTE(c)) {
778793
if (self->doublequote) {
779-
self->state = QUOTE_IN_QUOTE_IN_SKIP_LINE;
794+
self->state = QUOTE_IN_QUOTED_FIELD_IN_SKIP_LINE;
780795
} else {
781-
self->state = SKIP_LINE;
796+
self->state = IN_FIELD_IN_SKIP_LINE;
782797
}
783798
}
784799
break;
785800

786-
case QUOTE_IN_QUOTE_IN_SKIP_LINE:
801+
case QUOTE_IN_QUOTED_FIELD_IN_SKIP_LINE:
787802
if (IS_QUOTE(c)) {
788-
self->state = QUOTE_IN_SKIP_LINE;
803+
self->state = IN_QUOTED_FIELD_IN_SKIP_LINE;
789804
} else if (IS_TERMINATOR(c)) {
790805
END_LINE();
791806
} else if (IS_CARRIAGE(c)) {
792807
self->file_lines++;
793808
self->state = EAT_CRNL_NOP;
809+
} else if (IS_DELIMITER(c)) {
810+
self->state = START_FIELD_IN_SKIP_LINE;
794811
} else {
795-
self->state = SKIP_LINE;
812+
self->state = IN_FIELD_IN_SKIP_LINE;
796813
}
797814
break;
798815

@@ -846,9 +863,9 @@ int tokenize_bytes(parser_t *self, size_t line_limit)
846863
// start of record
847864
if (skip_this_line(self, self->file_lines)) {
848865
if (IS_QUOTE(c)) {
849-
self->state = QUOTE_IN_SKIP_LINE;
866+
self->state = IN_QUOTED_FIELD_IN_SKIP_LINE;
850867
} else {
851-
self->state = SKIP_LINE;
868+
self->state = IN_FIELD_IN_SKIP_LINE;
852869

853870
if (IS_TERMINATOR(c)) {
854871
END_LINE();

pandas/src/parser/tokenizer.h

+4-3
Original file line numberDiff line numberDiff line change
@@ -123,9 +123,10 @@ typedef enum {
123123
EAT_COMMENT,
124124
EAT_LINE_COMMENT,
125125
WHITESPACE_LINE,
126-
SKIP_LINE,
127-
QUOTE_IN_SKIP_LINE,
128-
QUOTE_IN_QUOTE_IN_SKIP_LINE,
126+
START_FIELD_IN_SKIP_LINE,
127+
IN_FIELD_IN_SKIP_LINE,
128+
IN_QUOTED_FIELD_IN_SKIP_LINE,
129+
QUOTE_IN_QUOTED_FIELD_IN_SKIP_LINE,
129130
FINISHED
130131
} ParserState;
131132

0 commit comments

Comments
 (0)