Skip to content

Commit c549299

Browse files
committed
BUG: fix tokenizer bug with \r line terminator and quoted fields. closes #3453
1 parent ebcdaa7 commit c549299

File tree

4 files changed

+43
-10
lines changed

4 files changed

+43
-10
lines changed

RELEASE.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -196,6 +196,7 @@ pandas 0.11.1
196196
- ``DataFrame.to_csv`` will succeed with the deprecated option ``nanRep``, @tdsmith
197197
- ``DataFrame.to_html`` and ``DataFrame.to_latex`` now accept a path for
198198
their first argument (GH3702_)
199+
- Fix file tokenization error with \r line terminator and quoted fields (GH3453_)
199200

200201
.. _GH3164: https://github.com/pydata/pandas/issues/3164
201202
.. _GH2786: https://github.com/pydata/pandas/issues/2786
@@ -220,6 +221,7 @@ pandas 0.11.1
220221
.. _GH3553: https://github.com/pydata/pandas/issues/3553
221222
.. _GH3437: https://github.com/pydata/pandas/issues/3437
222223
.. _GH3468: https://github.com/pydata/pandas/issues/3468
224+
.. _GH3453: https://github.com/pydata/pandas/issues/3453
223225
.. _GH3455: https://github.com/pydata/pandas/issues/3455
224226
.. _GH3457: https://github.com/pydata/pandas/issues/3457
225227
.. _GH3477: https://github.com/pydata/pandas/issues/3477

pandas/io/tests/test_parsers.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2238,6 +2238,20 @@ def test_parse_ragged_csv(self):
22382238

22392239
tm.assert_frame_equal(result, expected)
22402240

2241+
def test_tokenize_CR_with_quoting(self):
2242+
# #3453, this doesn't work with Python parser for some reason
2243+
2244+
data = ' a,b,c\r"a,b","e,d","f,f"'
2245+
2246+
result = self.read_csv(StringIO(data), header=None)
2247+
expected = self.read_csv(StringIO(data.replace('\r', '\n')),
2248+
header=None)
2249+
tm.assert_frame_equal(result, expected)
2250+
2251+
result = self.read_csv(StringIO(data))
2252+
expected = self.read_csv(StringIO(data.replace('\r', '\n')))
2253+
tm.assert_frame_equal(result, expected)
2254+
22412255

22422256
class TestParseSQL(unittest.TestCase):
22432257

pandas/src/parser/tokenizer.c

Lines changed: 26 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -687,6 +687,7 @@ int tokenize_delimited(parser_t *self, size_t line_limit)
687687
self->state));
688688

689689
switch(self->state) {
690+
690691
case START_RECORD:
691692
// start of record
692693

@@ -702,6 +703,7 @@ int tokenize_delimited(parser_t *self, size_t line_limit)
702703
/* normal character - handle as START_FIELD */
703704
self->state = START_FIELD;
704705
/* fallthru */
706+
705707
case START_FIELD:
706708
/* expecting field */
707709
if (c == '\n') {
@@ -846,6 +848,14 @@ int tokenize_delimited(parser_t *self, size_t line_limit)
846848
}
847849
break;
848850

851+
case EAT_COMMENT:
852+
if (c == '\n') {
853+
END_LINE();
854+
} else if (c == '\r') {
855+
self->state = EAT_CRNL;
856+
}
857+
break;
858+
849859
case EAT_CRNL:
850860
if (c == '\n') {
851861
END_LINE();
@@ -854,16 +864,23 @@ int tokenize_delimited(parser_t *self, size_t line_limit)
854864
// Handle \r-delimited files
855865
END_LINE_AND_FIELD_STATE(START_FIELD);
856866
} else {
857-
PUSH_CHAR(c);
858-
END_LINE_STATE(IN_FIELD);
859-
}
860-
break;
867+
/* \r line terminator */
868+
869+
/* UGH. we don't actually want to consume the token. fix this later */
870+
self->stream_len = slen;
871+
if (end_line(self) < 0) {
872+
goto parsingerror;
873+
}
874+
stream = self->stream + self->stream_len;
875+
slen = self->stream_len;
876+
self->state = START_RECORD;
877+
878+
/* HACK, let's try this one again */
879+
--i; buf--;
880+
if (line_limit > 0 && self->lines == start_lines + line_limit) {
881+
goto linelimit;
882+
}
861883

862-
case EAT_COMMENT:
863-
if (c == '\n') {
864-
END_LINE();
865-
} else if (c == '\r') {
866-
self->state = EAT_CRNL;
867884
}
868885
break;
869886

pandas/src/parser/tokenizer.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,7 @@ See LICENSE for the license
8888
#define ERROR_NO_DATA 23
8989

9090

91-
// #define VERBOSE
91+
/* #define VERBOSE */
9292

9393
#if defined(VERBOSE)
9494
#define TRACE(X) printf X;

0 commit comments

Comments
 (0)