From 80b1c979856e3adf5627a2bebfdc5c8aa120b77e Mon Sep 17 00:00:00 2001 From: gfyoung Date: Sun, 23 Sep 2018 17:21:07 -0700 Subject: [PATCH] ERR: Clarify location of EOF on unbalanced quotes Closes gh-22789. --- pandas/_libs/src/parser/tokenizer.c | 2 +- pandas/io/parsers.py | 3 --- pandas/tests/io/parser/common.py | 14 -------------- pandas/tests/io/parser/quoting.py | 17 +++++++++++++++++ 4 files changed, 18 insertions(+), 18 deletions(-) diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index da0a9f7498aa8..2fce241027d56 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -1150,7 +1150,7 @@ static int parser_handle_eof(parser_t *self) { case IN_QUOTED_FIELD: self->error_msg = (char *)malloc(bufsize); snprintf(self->error_msg, bufsize, - "EOF inside string starting at line %lld", + "EOF inside string starting at row %lld", (long long)self->file_lines); return -1; diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 8d37bf4c84d5d..a4f1155117b12 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -2727,9 +2727,6 @@ def _next_iter_line(self, row_num): 'cannot be processed in Python\'s ' 'native csv library at the moment, ' 'so please pass in engine=\'c\' instead') - elif 'newline inside string' in msg: - msg = ('EOF inside string starting with ' - 'line ' + str(row_num)) if self.skipfooter > 0: reason = ('Error could possibly be due to ' diff --git a/pandas/tests/io/parser/common.py b/pandas/tests/io/parser/common.py index 9e871d27f0ce8..36060490a837d 100644 --- a/pandas/tests/io/parser/common.py +++ b/pandas/tests/io/parser/common.py @@ -197,20 +197,6 @@ def test_malformed(self): header=1, comment='#', skipfooter=1) - def test_quoting(self): - bad_line_small = """printer\tresult\tvariant_name -Klosterdruckerei\tKlosterdruckerei (1611-1804)\tMuller, Jacob -Klosterdruckerei\tKlosterdruckerei (1611-1804)\tMuller, Jakob -Klosterdruckerei\tKlosterdruckerei (1609-1805)\t"Furststiftische Hofdruckerei, (1609-1805)\tGaller, Alois -Klosterdruckerei\tKlosterdruckerei (1609-1805)\tHochfurstliche Buchhandlung """ # noqa - pytest.raises(Exception, self.read_table, StringIO(bad_line_small), - sep='\t') - - good_line_small = bad_line_small + '"' - df = self.read_table(StringIO(good_line_small), sep='\t') - assert len(df) == 3 - def test_unnamed_columns(self): data = """A,B,C,, 1,2,3,4,5 diff --git a/pandas/tests/io/parser/quoting.py b/pandas/tests/io/parser/quoting.py index 15427aaf9825c..013e635f80d21 100644 --- a/pandas/tests/io/parser/quoting.py +++ b/pandas/tests/io/parser/quoting.py @@ -9,6 +9,7 @@ import pandas.util.testing as tm from pandas import DataFrame +from pandas.errors import ParserError from pandas.compat import PY3, StringIO, u @@ -151,3 +152,19 @@ def test_quotechar_unicode(self): if PY3: result = self.read_csv(StringIO(data), quotechar=u('\u0001')) tm.assert_frame_equal(result, expected) + + def test_unbalanced_quoting(self): + # see gh-22789. + data = "a,b,c\n1,2,\"3" + + if self.engine == "c": + regex = "EOF inside string starting at row 1" + else: + regex = "unexpected end of data" + + with tm.assert_raises_regex(ParserError, regex): + self.read_csv(StringIO(data)) + + expected = DataFrame([[1, 2, 3]], columns=["a", "b", "c"]) + data = self.read_csv(StringIO(data + '"')) + tm.assert_frame_equal(data, expected)