Skip to content

Commit 2a3b1fe

Browse files
committed
BUG: Standardize malformed row handling in Python engine
Closes pandas-dev gh-15910.
1 parent ba30e3a commit 2a3b1fe

File tree

3 files changed: +65 -38 lines changed

pandas/io/parsers.py

+46 -30
Original file line number | Diff line number | Diff line change
@@ -2469,26 +2469,7 @@ def _next_line(self):
24692469
next(self.data)
24702470

24712471
while True:
2472-
try:
2473-
orig_line = next(self.data)
2474-
except csv.Error as e:
2475-
msg = str(e)
2476-
2477-
if 'NULL byte' in str(e):
2478-
msg = ('NULL byte detected. This byte '
2479-
'cannot be processed in Python\'s '
2480-
'native csv library at the moment, '
2481-
'so please pass in engine=\'c\' instead')
2482-
2483-
if self.skipfooter > 0:
2484-
reason = ('Error could possibly be due to '
2485-
'parsing errors in the skipped footer rows '
2486-
'(the skipfooter keyword is only applied '
2487-
'after Python\'s csv library has parsed '
2488-
'all rows).')
2489-
msg += '. ' + reason
2490-
2491-
raise csv.Error(msg)
2472+
orig_line = self._next_iter_line(self.data)
24922473
line = self._check_comments([orig_line])[0]
24932474
self.pos += 1
24942475
if (not self.skip_blank_lines and
@@ -2510,6 +2491,45 @@ def _next_line(self):
25102491
self.buf.append(line)
25112492
return line
25122493

2494+
def _next_iter_line(self, source, **kwargs):
2495+
"""
2496+
Wrapper around iterating through a CSV source.
2497+
2498+
When a CSV error is raised, we check for specific
2499+
error messages that allow us to customize the
2500+
error message displayed to the user.
2501+
2502+
Parameters
2503+
----------
2504+
source : The CSV source through which to iterate.
2505+
kwargs : Keyword arguments used to customize the error message.
2506+
"""
2507+
2508+
try:
2509+
return next(source)
2510+
except csv.Error as e:
2511+
msg = str(e)
2512+
2513+
if 'NULL byte' in msg:
2514+
msg = ('NULL byte detected. This byte '
2515+
'cannot be processed in Python\'s '
2516+
'native csv library at the moment, '
2517+
'so please pass in engine=\'c\' instead')
2518+
elif 'newline inside string' in msg:
2519+
msg = ('EOF inside string starting with '
2520+
'line ' + str(kwargs['row_num']))
2521+
raise Exception(msg)
2522+
2523+
if self.skipfooter > 0:
2524+
reason = ('Error could possibly be due to '
2525+
'parsing errors in the skipped footer rows '
2526+
'(the skipfooter keyword is only applied '
2527+
'after Python\'s csv library has parsed '
2528+
'all rows).')
2529+
msg += '. ' + reason
2530+
2531+
raise csv.Error(msg)
2532+
25132533
def _check_comments(self, lines):
25142534
if self.comment is None:
25152535
return lines
@@ -2730,17 +2750,13 @@ def _get_lines(self, rows=None):
27302750
lines.extend(new_rows)
27312751
else:
27322752
rows = 0
2753+
27332754
while True:
2734-
try:
2735-
new_rows.append(next(source))
2736-
rows += 1
2737-
except csv.Error as inst:
2738-
if 'newline inside string' in str(inst):
2739-
row_num = str(self.pos + rows)
2740-
msg = ('EOF inside string starting with '
2741-
'line ' + row_num)
2742-
raise Exception(msg)
2743-
raise
2755+
new_row = self._next_iter_line(
2756+
source, row_num=self.pos + rows)
2757+
new_rows.append(new_row)
2758+
rows += 1
2759+
27442760
except StopIteration:
27452761
if self.skiprows:
27462762
new_rows = [row for i, row in enumerate(new_rows)

pandas/tests/io/parser/c_parser_only.py

+9
Original file line number | Diff line number | Diff line change
@@ -408,3 +408,12 @@ def test_large_difference_in_columns(self):
408408
expected = DataFrame([row.split(',')[0] for row in rows])
409409

410410
tm.assert_frame_equal(result, expected)
411+
412+
def test_data_after_quote(self):
413+
# see gh-15910
414+
415+
data = 'a\n1\n"b"a'
416+
result = self.read_csv(StringIO(data))
417+
expected = DataFrame({'a': ['1', 'ba']})
418+
419+
tm.assert_frame_equal(result, expected)

pandas/tests/io/parser/python_parser_only.py

+10 -8
Original file line number | Diff line number | Diff line change
@@ -225,15 +225,17 @@ def test_multi_char_sep_quotes(self):
225225

226226
def test_skipfooter_bad_row(self):
227227
# see gh-13879
228+
# see gh-15910
228229

229-
data = 'a,b,c\ncat,foo,bar\ndog,foo,"baz'
230230
msg = 'parsing errors in the skipped footer rows'
231231

232-
with tm.assertRaisesRegexp(csv.Error, msg):
233-
self.read_csv(StringIO(data), skipfooter=1)
234-
235-
# We expect no match, so there should be an assertion
236-
# error out of the inner context manager.
237-
with tm.assertRaises(AssertionError):
232+
for data in ('a\n1\n"b"a',
233+
'a,b,c\ncat,foo,bar\ndog,foo,"baz'):
238234
with tm.assertRaisesRegexp(csv.Error, msg):
239-
self.read_csv(StringIO(data))
235+
self.read_csv(StringIO(data), skipfooter=1)
236+
237+
# We expect no match, so there should be an assertion
238+
# error out of the inner context manager.
239+
with tm.assertRaises(AssertionError):
240+
with tm.assertRaisesRegexp(csv.Error, msg):
241+
self.read_csv(StringIO(data))

0 commit comments

Comments
 (0)