Skip to content

Commit 35cc80d

Browse files
committed
Merge pull request #10825 from evanpw/csv_eof
Fix handling of EOF in 'c' csv parser
2 parents 05a8bad + 780396b commit 35cc80d

File tree

4 files changed

+110
-24
lines changed

4 files changed

+110
-24
lines changed

doc/source/whatsnew/v0.17.0.txt

+1
Original file line numberDiff line numberDiff line change
@@ -653,6 +653,7 @@ Bug Fixes
653653
- Bug in ``Categorical`` where it may not be represented properly when its categories contain ``tz`` or ``Period`` values (:issue:`10713`)
654654
- Bug in ``Categorical.__iter__`` where it may not return the correct ``datetime`` and ``Period`` values (:issue:`10713`)
655655

656+
- Bug in ``read_csv`` with ``engine='c'``: EOF preceded by a comment, blank line, etc. was not handled correctly (:issue:`10728`, :issue:`10548`)
656657

657658
- Reading "famafrench" data via ``DataReader`` results in an HTTP 404 error because the website URL has changed (:issue:`10591`).
658659
- Bug in ``read_msgpack`` where DataFrame to decode has duplicate column names (:issue:`9618`)

pandas/io/tests/test_parsers.py

+69
Original file line numberDiff line numberDiff line change
@@ -2433,6 +2433,75 @@ def test_empty_with_nrows_chunksize(self):
24332433
result = pd.DataFrame(result[2], columns=result[1], index=result[0])
24342434
tm.assert_frame_equal(pd.DataFrame.from_records(result), expected)
24352435

2436+
def test_eof_states(self):
    """Check ``read_csv`` when the input ends in various tokenizer states.

    Regression test for GH 10728 and GH 10548.  Each case is labelled
    with the tokenizer state it is meant to be in when EOF is reached.
    """
    columns = ['a', 'b', 'c']
    single_row = [[4, 5, 6]]

    # --- skip_blank_lines left at its default (True) ---
    # Every input here must parse to the single data row.
    expected = pd.DataFrame(single_row, columns=columns)
    skipping_cases = [
        # WHITESPACE_LINE (GH 10728)
        ('a,b,c\n4,5,6\n ', dict()),
        # EAT_LINE_COMMENT (GH 10548)
        ('a,b,c\n4,5,6\n#comment', dict(comment='#')),
        # EAT_CRNL_NOP
        ('a,b,c\n4,5,6\n\r', dict()),
        # EAT_COMMENT
        ('a,b,c\n4,5,6#comment', dict(comment='#')),
        # SKIP_LINE
        ('a,b,c\n4,5,6\nskipme', dict(skiprows=[2])),
    ]
    for data, kwargs in skipping_cases:
        result = self.read_csv(StringIO(data), **kwargs)
        tm.assert_frame_equal(result, expected)

    # --- skip_blank_lines=False ---
    # The trailing fragment may now surface as an extra row.
    keeping_cases = [
        # EAT_LINE_COMMENT
        ('a,b,c\n4,5,6\n#comment', dict(comment='#'),
         [[4, 5, 6]]),
        # IN_FIELD
        ('a,b,c\n4,5,6\n ', dict(),
         [['4', 5, 6], [' ', None, None]]),
        # EAT_CRNL
        ('a,b,c\n4,5,6\n\r', dict(),
         [[4, 5, 6], [None, None, None]]),
    ]
    for data, kwargs, rows in keeping_cases:
        result = self.read_csv(StringIO(data), skip_blank_lines=False,
                               **kwargs)
        expected = pd.DataFrame(rows, columns=columns)
        tm.assert_frame_equal(result, expected)

    # --- inputs that should produce exceptions ---
    failing_inputs = [
        # ESCAPED_CHAR
        "a,b,c\n4,5,6\n\\",
        # ESCAPE_IN_QUOTED_FIELD
        'a,b,c\n4,5,6\n"\\',
    ]
    for data in failing_inputs:
        self.assertRaises(Exception, self.read_csv,
                          StringIO(data), escapechar='\\')

    # IN_QUOTED_FIELD
    # Python 2.6 won't throw an exception for this case
    # (see http://bugs.python.org/issue16013)
    tm._skip_if_python26()
    data = 'a,b,c\n4,5,6\n"'
    self.assertRaises(Exception, self.read_csv,
                      StringIO(data), escapechar='\\')
2504+
24362505

24372506

24382507
class TestPythonParser(ParserTests, tm.TestCase):

pandas/src/parser/tokenizer.c

+34-24
Original file line numberDiff line numberDiff line change
@@ -1413,9 +1413,9 @@ int tokenize_whitespace(parser_t *self, size_t line_limit)
14131413
self->state = EAT_CRNL;
14141414
break;
14151415
} else if (IS_WHITESPACE(c)) {
1416-
/*if (self->skip_empty_lines)
1416+
if (self->skip_empty_lines)
14171417
self->state = WHITESPACE_LINE;
1418-
else*/
1418+
else
14191419
self->state = EAT_WHITESPACE;
14201420
break;
14211421
} else if (c == self->commentchar) {
@@ -1643,34 +1643,44 @@ int tokenize_whitespace(parser_t *self, size_t line_limit)
16431643

16441644
/*
 * Finish tokenizing once the input is exhausted.
 *
 * Returns 0 on success and -1 on failure.  For the quoted-field and
 * escape-character failures an allocated message is stored in
 * self->error_msg; the datalen != 0 failure sets no message here.
 */
static int parser_handle_eof(parser_t *self) {
    TRACE(("handling eof, datalen: %d, pstate: %d\n", self->datalen, self->state))

    /* Only proceed when datalen is zero; anything else is reported as a
       failure without an error message. */
    if (self->datalen != 0) {
        return -1;
    }

    /* States in which there is nothing left to emit: succeed without
       ending a field or a line. */
    if (self->state == START_RECORD ||
        self->state == WHITESPACE_LINE ||
        self->state == EAT_CRNL_NOP ||
        self->state == EAT_LINE_COMMENT) {
        return 0;
    }

    /* EOF reached while still inside a quoted field. */
    if (self->state == ESCAPE_IN_QUOTED_FIELD ||
        self->state == IN_QUOTED_FIELD) {
        self->error_msg = (char*)malloc(100);
        sprintf(self->error_msg, "EOF inside string starting at line %d",
                self->file_lines);
        return -1;
    }

    /* EOF reached directly after an escape character. */
    if (self->state == ESCAPED_CHAR) {
        self->error_msg = (char*)malloc(100);
        sprintf(self->error_msg, "EOF following escape character");
        return -1;
    }

    /* A field is still open in these states: close it before the line. */
    if (self->state == IN_FIELD ||
        self->state == START_FIELD ||
        self->state == QUOTE_IN_QUOTED_FIELD) {
        if (end_field(self) < 0) {
            return -1;
        }
    }

    /* Every remaining state (including the ones handled just above)
       finishes by ending the current line. */
    if (end_line(self) < 0) {
        return -1;
    }
    return 0;
}
16751685

16761686
int parser_consume_rows(parser_t *self, size_t nrows) {

pandas/util/testing.py

+6
Original file line numberDiff line numberDiff line change
@@ -241,6 +241,12 @@ def _skip_if_no_cday():
241241
raise nose.SkipTest("CustomBusinessDay not available.")
242242

243243

244+
def _skip_if_python26():
245+
if sys.version_info[:2] == (2, 6):
246+
import nose
247+
raise nose.SkipTest("skipping on python2.6")
248+
249+
244250
#------------------------------------------------------------------------------
245251
# locale utilities
246252

0 commit comments

Comments
 (0)