Skip to content

Commit e159369

Browse files
gfyoung authored and victor committed
ERR: Clarify location of EOF on unbalanced quotes (pandas-dev#22814)
Closes gh-22789 (pandas-dev).
1 parent 7bbe736 commit e159369

File tree

4 files changed

+18
-18
lines changed

4 files changed

+18
-18
lines changed

pandas/_libs/src/parser/tokenizer.c

+1-1
Original file line numberDiff line numberDiff line change
@@ -1150,7 +1150,7 @@ static int parser_handle_eof(parser_t *self) {
11501150
case IN_QUOTED_FIELD:
11511151
self->error_msg = (char *)malloc(bufsize);
11521152
snprintf(self->error_msg, bufsize,
1153-
"EOF inside string starting at line %lld",
1153+
"EOF inside string starting at row %lld",
11541154
(long long)self->file_lines);
11551155
return -1;
11561156

pandas/io/parsers.py

-3
Original file line numberDiff line numberDiff line change
@@ -2727,9 +2727,6 @@ def _next_iter_line(self, row_num):
27272727
'cannot be processed in Python\'s '
27282728
'native csv library at the moment, '
27292729
'so please pass in engine=\'c\' instead')
2730-
elif 'newline inside string' in msg:
2731-
msg = ('EOF inside string starting with '
2732-
'line ' + str(row_num))
27332730

27342731
if self.skipfooter > 0:
27352732
reason = ('Error could possibly be due to '

pandas/tests/io/parser/common.py

-14
Original file line numberDiff line numberDiff line change
@@ -198,20 +198,6 @@ def test_malformed(self):
198198
header=1, comment='#',
199199
skipfooter=1)
200200

201-
def test_quoting(self):
202-
bad_line_small = """printer\tresult\tvariant_name
203-
Klosterdruckerei\tKlosterdruckerei <Salem> (1611-1804)\tMuller, Jacob
204-
Klosterdruckerei\tKlosterdruckerei <Salem> (1611-1804)\tMuller, Jakob
205-
Klosterdruckerei\tKlosterdruckerei <Kempten> (1609-1805)\t"Furststiftische Hofdruckerei, <Kempten""
206-
Klosterdruckerei\tKlosterdruckerei <Kempten> (1609-1805)\tGaller, Alois
207-
Klosterdruckerei\tKlosterdruckerei <Kempten> (1609-1805)\tHochfurstliche Buchhandlung <Kempten>""" # noqa
208-
pytest.raises(Exception, self.read_table, StringIO(bad_line_small),
209-
sep='\t')
210-
211-
good_line_small = bad_line_small + '"'
212-
df = self.read_table(StringIO(good_line_small), sep='\t')
213-
assert len(df) == 3
214-
215201
def test_unnamed_columns(self):
216202
data = """A,B,C,,
217203
1,2,3,4,5

pandas/tests/io/parser/quoting.py

+17
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
import pandas.util.testing as tm
1010

1111
from pandas import DataFrame
12+
from pandas.errors import ParserError
1213
from pandas.compat import PY3, StringIO, u
1314

1415

@@ -151,3 +152,19 @@ def test_quotechar_unicode(self):
151152
if PY3:
152153
result = self.read_csv(StringIO(data), quotechar=u('\u0001'))
153154
tm.assert_frame_equal(result, expected)
155+
156+
def test_unbalanced_quoting(self):
157+
# see gh-22789.
158+
data = "a,b,c\n1,2,\"3"
159+
160+
if self.engine == "c":
161+
regex = "EOF inside string starting at row 1"
162+
else:
163+
regex = "unexpected end of data"
164+
165+
with tm.assert_raises_regex(ParserError, regex):
166+
self.read_csv(StringIO(data))
167+
168+
expected = DataFrame([[1, 2, 3]], columns=["a", "b", "c"])
169+
data = self.read_csv(StringIO(data + '"'))
170+
tm.assert_frame_equal(data, expected)

0 commit comments

Comments
 (0)