Skip to content

Commit e0157d6

Browse files
committed
BUG: Standardize malformed row handling in Python engine
Closes gh-15910.
1 parent ba30e3a commit e0157d6

File tree

3 files changed

+70
-45
lines changed

pandas/io/parsers.py

+51-37
Original file line numberDiff line numberDiff line change
@@ -2469,26 +2469,7 @@ def _next_line(self):
24692469
next(self.data)
24702470

24712471
while True:
2472-
try:
2473-
orig_line = next(self.data)
2474-
except csv.Error as e:
2475-
msg = str(e)
2476-
2477-
if 'NULL byte' in str(e):
2478-
msg = ('NULL byte detected. This byte '
2479-
'cannot be processed in Python\'s '
2480-
'native csv library at the moment, '
2481-
'so please pass in engine=\'c\' instead')
2482-
2483-
if self.skipfooter > 0:
2484-
reason = ('Error could possibly be due to '
2485-
'parsing errors in the skipped footer rows '
2486-
'(the skipfooter keyword is only applied '
2487-
'after Python\'s csv library has parsed '
2488-
'all rows).')
2489-
msg += '. ' + reason
2490-
2491-
raise csv.Error(msg)
2472+
orig_line = self._next_iter_line()
24922473
line = self._check_comments([orig_line])[0]
24932474
self.pos += 1
24942475
if (not self.skip_blank_lines and
@@ -2510,6 +2491,44 @@ def _next_line(self):
25102491
self.buf.append(line)
25112492
return line
25122493

2494+
def _next_iter_line(self, **kwargs):
    """
    Wrapper around iterating through `self.data` (CSV source).

    When a CSV error is raised, we check for specific
    error messages that allow us to customize the
    error message displayed to the user.

    Parameters
    ----------
    kwargs : Keyword arguments used to customize the error message.
        The only key consulted is ``row_num``, the row number to
        report when the source ends inside a quoted string. It is
        optional: when it is not supplied the message simply omits
        the line number.

    Returns
    -------
    The next item produced by `self.data`.

    Raises
    ------
    Exception
        When the csv library reports a newline inside a string.
    csv.Error
        For every other csv.Error raised by `self.data`, re-raised
        with a customized message.

    Anything else raised by ``next(self.data)`` (StopIteration
    included) is not caught here and propagates to the caller.
    """

    try:
        return next(self.data)
    except csv.Error as e:
        msg = str(e)

        if 'NULL byte' in msg:
            msg = ('NULL byte detected. This byte '
                   'cannot be processed in Python\'s '
                   'native csv library at the moment, '
                   'so please pass in engine=\'c\' instead')
        elif 'newline inside string' in msg:
            msg = 'EOF inside string'

            # Not every caller knows the row number, so look it up
            # defensively: indexing kwargs directly would raise a
            # KeyError that hides the real parsing problem.
            row_num = kwargs.get('row_num')

            if row_num is not None:
                msg += ' starting with line ' + str(row_num)

            # NOTE(review): this branch raises a plain Exception and
            # skips the skipfooter hint below; kept as-is so existing
            # callers see the same exception type.
            raise Exception(msg)

        if self.skipfooter > 0:
            reason = ('Error could possibly be due to '
                      'parsing errors in the skipped footer rows '
                      '(the skipfooter keyword is only applied '
                      'after Python\'s csv library has parsed '
                      'all rows).')
            msg += '. ' + reason

        raise csv.Error(msg)
2531+
25132532
def _check_comments(self, lines):
25142533
if self.comment is None:
25152534
return lines
@@ -2688,7 +2707,6 @@ def _rows_to_cols(self, content):
26882707
return zipped_content
26892708

26902709
def _get_lines(self, rows=None):
2691-
source = self.data
26922710
lines = self.buf
26932711
new_rows = None
26942712

@@ -2703,14 +2721,14 @@ def _get_lines(self, rows=None):
27032721
rows -= len(self.buf)
27042722

27052723
if new_rows is None:
2706-
if isinstance(source, list):
2707-
if self.pos > len(source):
2724+
if isinstance(self.data, list):
2725+
if self.pos > len(self.data):
27082726
raise StopIteration
27092727
if rows is None:
2710-
new_rows = source[self.pos:]
2711-
new_pos = len(source)
2728+
new_rows = self.data[self.pos:]
2729+
new_pos = len(self.data)
27122730
else:
2713-
new_rows = source[self.pos:self.pos + rows]
2731+
new_rows = self.data[self.pos:self.pos + rows]
27142732
new_pos = self.pos + rows
27152733

27162734
# Check for stop rows. n.b.: self.skiprows is a set.
@@ -2726,21 +2744,17 @@ def _get_lines(self, rows=None):
27262744
try:
27272745
if rows is not None:
27282746
for _ in range(rows):
2729-
new_rows.append(next(source))
2747+
new_rows.append(next(self.data))
27302748
lines.extend(new_rows)
27312749
else:
27322750
rows = 0
2751+
27332752
while True:
2734-
try:
2735-
new_rows.append(next(source))
2736-
rows += 1
2737-
except csv.Error as inst:
2738-
if 'newline inside string' in str(inst):
2739-
row_num = str(self.pos + rows)
2740-
msg = ('EOF inside string starting with '
2741-
'line ' + row_num)
2742-
raise Exception(msg)
2743-
raise
2753+
new_row = self._next_iter_line(
2754+
row_num=self.pos + rows)
2755+
new_rows.append(new_row)
2756+
rows += 1
2757+
27442758
except StopIteration:
27452759
if self.skiprows:
27462760
new_rows = [row for i, row in enumerate(new_rows)

pandas/tests/io/parser/c_parser_only.py

+9
Original file line numberDiff line numberDiff line change
@@ -408,3 +408,12 @@ def test_large_difference_in_columns(self):
408408
expected = DataFrame([row.split(',')[0] for row in rows])
409409

410410
tm.assert_frame_equal(result, expected)
411+
412+
def test_data_after_quote(self):
    """Characters trailing a closing quote are kept as part of the field."""
    # see gh-15910
    csv_text = 'a\n1\n"b"a'
    wanted = DataFrame({'a': ['1', 'ba']})

    parsed = self.read_csv(StringIO(csv_text))
    tm.assert_frame_equal(parsed, wanted)

pandas/tests/io/parser/python_parser_only.py

+10-8
Original file line numberDiff line numberDiff line change
@@ -225,15 +225,17 @@ def test_multi_char_sep_quotes(self):
225225

226226
def test_skipfooter_bad_row(self):
227227
# see gh-13879
228+
# see gh-15910
228229

229-
data = 'a,b,c\ncat,foo,bar\ndog,foo,"baz'
230230
msg = 'parsing errors in the skipped footer rows'
231231

232-
with tm.assertRaisesRegexp(csv.Error, msg):
233-
self.read_csv(StringIO(data), skipfooter=1)
234-
235-
# We expect no match, so there should be an assertion
236-
# error out of the inner context manager.
237-
with tm.assertRaises(AssertionError):
232+
for data in ('a\n1\n"b"a',
233+
'a,b,c\ncat,foo,bar\ndog,foo,"baz'):
238234
with tm.assertRaisesRegexp(csv.Error, msg):
239-
self.read_csv(StringIO(data))
235+
self.read_csv(StringIO(data), skipfooter=1)
236+
237+
# We expect no match, so there should be an assertion
238+
# error out of the inner context manager.
239+
with tm.assertRaises(AssertionError):
240+
with tm.assertRaisesRegexp(csv.Error, msg):
241+
self.read_csv(StringIO(data))

0 commit comments

Comments
 (0)