Skip to content

Commit 5d17a94

Browse files
gfyoung authored and jreback committed
ENH: Support malformed row handling in Python engine (#15925)
1 parent c25fbde commit 5d17a94

File tree

5 files changed

+155
-71
lines changed

5 files changed

+155
-71
lines changed

doc/source/io.rst

+2-2
Original file line numberDiff line numberDiff line change
@@ -342,11 +342,11 @@ error_bad_lines : boolean, default ``True``
342342
Lines with too many fields (e.g. a csv line with too many commas) will by
343343
default cause an exception to be raised, and no DataFrame will be returned. If
344344
``False``, then these "bad lines" will dropped from the DataFrame that is
345-
returned (only valid with C parser). See :ref:`bad lines <io.bad_lines>`
345+
returned. See :ref:`bad lines <io.bad_lines>`
346346
below.
347347
warn_bad_lines : boolean, default ``True``
348348
If error_bad_lines is ``False``, and warn_bad_lines is ``True``, a warning for
349-
each "bad line" will be output (only valid with C parser).
349+
each "bad line" will be output.
350350

351351
.. _io.dtypes:
352352

doc/source/whatsnew/v0.20.0.txt

+2-1
Original file line numberDiff line numberDiff line change
@@ -368,9 +368,10 @@ Other Enhancements
368368
- ``pandas.io.json.json_normalize()`` gained the option ``errors='ignore'|'raise'``; the default is ``errors='raise'`` which is backward compatible. (:issue:`14583`)
369369
- ``pandas.io.json.json_normalize()`` with an empty ``list`` will return an empty ``DataFrame`` (:issue:`15534`)
370370
- ``pandas.io.json.json_normalize()`` has gained a ``sep`` option that accepts ``str`` to separate joined fields; the default is ".", which is backward compatible. (:issue:`14883`)
371-
- ``pd.read_csv()`` will now raise a ``csv.Error`` error whenever an end-of-file character is encountered in the middle of a data row (:issue:`15913`)
372371
- A new function has been added to a ``MultiIndex`` to facilitate :ref:`Removing Unused Levels <advanced.shown_levels>`. (:issue:`15694`)
373372
- :func:`MultiIndex.remove_unused_levels` has been added to facilitate :ref:`removing unused levels <advanced.shown_levels>`. (:issue:`15694`)
373+
- ``pd.read_csv()`` will now raise a ``ParserError`` error whenever any parsing error occurs (:issue:`15913`, :issue:`15925`)
374+
- ``pd.read_csv()`` now supports the ``error_bad_lines`` and ``warn_bad_lines`` arguments for the Python parser (:issue:`15925`)
374375

375376

376377
.. _ISO 8601 duration: https://en.wikipedia.org/wiki/ISO_8601#Durations

pandas/io/parsers.py

+106-62
Original file line numberDiff line numberDiff line change
@@ -263,10 +263,10 @@
263263
Lines with too many fields (e.g. a csv line with too many commas) will by
264264
default cause an exception to be raised, and no DataFrame will be returned.
265265
If False, then these "bad lines" will dropped from the DataFrame that is
266-
returned. (Only valid with C parser)
266+
returned.
267267
warn_bad_lines : boolean, default True
268268
If error_bad_lines is False, and warn_bad_lines is True, a warning for each
269-
"bad line" will be output. (Only valid with C parser).
269+
"bad line" will be output.
270270
low_memory : boolean, default True
271271
Internally process the file in chunks, resulting in lower memory use
272272
while parsing, but possibly mixed type inference. To ensure no mixed
@@ -485,8 +485,6 @@ def _read(filepath_or_buffer, kwds):
485485
_python_unsupported = set([
486486
'low_memory',
487487
'buffer_lines',
488-
'error_bad_lines',
489-
'warn_bad_lines',
490488
'float_precision',
491489
])
492490
_deprecated_args = set([
@@ -1897,6 +1895,9 @@ def __init__(self, f, **kwds):
18971895
self.usecols, _ = _validate_usecols_arg(kwds['usecols'])
18981896
self.skip_blank_lines = kwds['skip_blank_lines']
18991897

1898+
self.warn_bad_lines = kwds['warn_bad_lines']
1899+
self.error_bad_lines = kwds['error_bad_lines']
1900+
19001901
self.names_passed = kwds['names'] or None
19011902

19021903
self.na_filter = kwds['na_filter']
@@ -2469,16 +2470,19 @@ def _next_line(self):
24692470
next(self.data)
24702471

24712472
while True:
2472-
orig_line = self._next_iter_line()
2473-
line = self._check_comments([orig_line])[0]
2473+
orig_line = self._next_iter_line(row_num=self.pos + 1)
24742474
self.pos += 1
2475-
if (not self.skip_blank_lines and
2476-
(self._empty(orig_line) or line)):
2477-
break
2478-
elif self.skip_blank_lines:
2479-
ret = self._check_empty([line])
2480-
if ret:
2481-
line = ret[0]
2475+
2476+
if orig_line is not None:
2477+
line = self._check_comments([orig_line])[0]
2478+
2479+
if self.skip_blank_lines:
2480+
ret = self._check_empty([line])
2481+
2482+
if ret:
2483+
line = ret[0]
2484+
break
2485+
elif self._empty(orig_line) or line:
24822486
break
24832487

24842488
# This was the first line of the file,
@@ -2491,7 +2495,28 @@ def _next_line(self):
24912495
self.buf.append(line)
24922496
return line
24932497

2494-
def _next_iter_line(self, **kwargs):
2498+
def _alert_malformed(self, msg, row_num):
2499+
"""
2500+
Alert a user about a malformed row.
2501+
2502+
If `self.error_bad_lines` is True, the alert will be `ParserError`.
2503+
If `self.warn_bad_lines` is True, the alert will be printed out.
2504+
2505+
Parameters
2506+
----------
2507+
msg : The error message to display.
2508+
row_num : The row number where the parsing error occurred.
2509+
Because this row number is displayed, we 1-index,
2510+
even though we 0-index internally.
2511+
"""
2512+
2513+
if self.error_bad_lines:
2514+
raise ParserError(msg)
2515+
elif self.warn_bad_lines:
2516+
base = 'Skipping line {row_num}: '.format(row_num=row_num)
2517+
sys.stderr.write(base + msg + '\n')
2518+
2519+
def _next_iter_line(self, row_num):
24952520
"""
24962521
Wrapper around iterating through `self.data` (CSV source).
24972522
@@ -2501,32 +2526,34 @@ def _next_iter_line(self, **kwargs):
25012526
25022527
Parameters
25032528
----------
2504-
kwargs : Keyword arguments used to customize the error message.
2529+
row_num : The row number of the line being parsed.
25052530
"""
25062531

25072532
try:
25082533
return next(self.data)
25092534
except csv.Error as e:
2510-
msg = str(e)
2511-
2512-
if 'NULL byte' in msg:
2513-
msg = ('NULL byte detected. This byte '
2514-
'cannot be processed in Python\'s '
2515-
'native csv library at the moment, '
2516-
'so please pass in engine=\'c\' instead')
2517-
elif 'newline inside string' in msg:
2518-
msg = ('EOF inside string starting with '
2519-
'line ' + str(kwargs['row_num']))
2520-
2521-
if self.skipfooter > 0:
2522-
reason = ('Error could possibly be due to '
2523-
'parsing errors in the skipped footer rows '
2524-
'(the skipfooter keyword is only applied '
2525-
'after Python\'s csv library has parsed '
2526-
'all rows).')
2527-
msg += '. ' + reason
2528-
2529-
raise csv.Error(msg)
2535+
if self.warn_bad_lines or self.error_bad_lines:
2536+
msg = str(e)
2537+
2538+
if 'NULL byte' in msg:
2539+
msg = ('NULL byte detected. This byte '
2540+
'cannot be processed in Python\'s '
2541+
'native csv library at the moment, '
2542+
'so please pass in engine=\'c\' instead')
2543+
elif 'newline inside string' in msg:
2544+
msg = ('EOF inside string starting with '
2545+
'line ' + str(row_num))
2546+
2547+
if self.skipfooter > 0:
2548+
reason = ('Error could possibly be due to '
2549+
'parsing errors in the skipped footer rows '
2550+
'(the skipfooter keyword is only applied '
2551+
'after Python\'s csv library has parsed '
2552+
'all rows).')
2553+
msg += '. ' + reason
2554+
2555+
self._alert_malformed(msg, row_num)
2556+
return None
25302557

25312558
def _check_comments(self, lines):
25322559
if self.comment is None:
@@ -2657,42 +2684,57 @@ def _get_index_name(self, columns):
26572684
return index_name, orig_names, columns
26582685

26592686
def _rows_to_cols(self, content):
2687+
if self.skipfooter < 0:
2688+
raise ValueError('skip footer cannot be negative')
2689+
26602690
col_len = self.num_original_columns
26612691

26622692
if self._implicit_index:
26632693
col_len += len(self.index_col)
26642694

2665-
# see gh-13320
2666-
zipped_content = list(lib.to_object_array(
2667-
content, min_width=col_len).T)
2668-
zip_len = len(zipped_content)
2669-
2670-
if self.skipfooter < 0:
2671-
raise ValueError('skip footer cannot be negative')
2695+
max_len = max([len(row) for row in content])
26722696

2673-
# Loop through rows to verify lengths are correct.
2674-
if (col_len != zip_len and
2697+
# Check that there are no rows with too many
2698+
# elements in their row (rows with too few
2699+
# elements are padded with NaN).
2700+
if (max_len > col_len and
26752701
self.index_col is not False and
26762702
self.usecols is None):
2677-
i = 0
2678-
for (i, l) in enumerate(content):
2679-
if len(l) != col_len:
2680-
break
26812703

2682-
footers = 0
2683-
if self.skipfooter:
2684-
footers = self.skipfooter
2704+
footers = self.skipfooter if self.skipfooter else 0
2705+
bad_lines = []
26852706

2686-
row_num = self.pos - (len(content) - i + footers)
2707+
iter_content = enumerate(content)
2708+
content_len = len(content)
2709+
content = []
26872710

2688-
msg = ('Expected %d fields in line %d, saw %d' %
2689-
(col_len, row_num + 1, zip_len))
2690-
if len(self.delimiter) > 1 and self.quoting != csv.QUOTE_NONE:
2691-
# see gh-13374
2692-
reason = ('Error could possibly be due to quotes being '
2693-
'ignored when a multi-char delimiter is used.')
2694-
msg += '. ' + reason
2695-
raise ValueError(msg)
2711+
for (i, l) in iter_content:
2712+
actual_len = len(l)
2713+
2714+
if actual_len > col_len:
2715+
if self.error_bad_lines or self.warn_bad_lines:
2716+
row_num = self.pos - (content_len - i + footers)
2717+
bad_lines.append((row_num, actual_len))
2718+
2719+
if self.error_bad_lines:
2720+
break
2721+
else:
2722+
content.append(l)
2723+
2724+
for row_num, actual_len in bad_lines:
2725+
msg = ('Expected %d fields in line %d, saw %d' %
2726+
(col_len, row_num + 1, actual_len))
2727+
if len(self.delimiter) > 1 and self.quoting != csv.QUOTE_NONE:
2728+
# see gh-13374
2729+
reason = ('Error could possibly be due to quotes being '
2730+
'ignored when a multi-char delimiter is used.')
2731+
msg += '. ' + reason
2732+
2733+
self._alert_malformed(msg, row_num + 1)
2734+
2735+
# see gh-13320
2736+
zipped_content = list(lib.to_object_array(
2737+
content, min_width=col_len).T)
26962738

26972739
if self.usecols:
26982740
if self._implicit_index:
@@ -2750,10 +2792,12 @@ def _get_lines(self, rows=None):
27502792

27512793
while True:
27522794
new_row = self._next_iter_line(
2753-
row_num=self.pos + rows)
2754-
new_rows.append(new_row)
2795+
row_num=self.pos + rows + 1)
27552796
rows += 1
27562797

2798+
if new_row is not None:
2799+
new_rows.append(new_row)
2800+
27572801
except StopIteration:
27582802
if self.skiprows:
27592803
new_rows = [row for i, row in enumerate(new_rows)

pandas/tests/io/parser/common.py

+40-2
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919
from pandas import compat
2020
from pandas.compat import (StringIO, BytesIO, PY3,
2121
range, lrange, u)
22-
from pandas.errors import DtypeWarning, EmptyDataError
22+
from pandas.errors import DtypeWarning, EmptyDataError, ParserError
2323
from pandas.io.common import URLError
2424
from pandas.io.parsers import TextFileReader, TextParser
2525

@@ -1569,7 +1569,7 @@ def test_null_byte_char(self):
15691569
tm.assert_frame_equal(out, expected)
15701570
else:
15711571
msg = "NULL byte detected"
1572-
with tm.assertRaisesRegexp(csv.Error, msg):
1572+
with tm.assertRaisesRegexp(ParserError, msg):
15731573
self.read_csv(StringIO(data), names=cols)
15741574

15751575
def test_utf8_bom(self):
@@ -1695,3 +1695,41 @@ class InvalidBuffer(object):
16951695

16961696
with tm.assertRaisesRegexp(ValueError, msg):
16971697
self.read_csv(mock.Mock())
1698+
1699+
def test_skip_bad_lines(self):
1700+
# see gh-15925
1701+
data = 'a\n1\n1,2,3\n4\n5,6,7'
1702+
1703+
with tm.assertRaises(ParserError):
1704+
self.read_csv(StringIO(data))
1705+
1706+
with tm.assertRaises(ParserError):
1707+
self.read_csv(StringIO(data), error_bad_lines=True)
1708+
1709+
stderr = sys.stderr
1710+
expected = DataFrame({'a': [1, 4]})
1711+
1712+
sys.stderr = StringIO()
1713+
try:
1714+
out = self.read_csv(StringIO(data),
1715+
error_bad_lines=False,
1716+
warn_bad_lines=False)
1717+
tm.assert_frame_equal(out, expected)
1718+
1719+
val = sys.stderr.getvalue()
1720+
self.assertEqual(val, '')
1721+
finally:
1722+
sys.stderr = stderr
1723+
1724+
sys.stderr = StringIO()
1725+
try:
1726+
out = self.read_csv(StringIO(data),
1727+
error_bad_lines=False,
1728+
warn_bad_lines=True)
1729+
tm.assert_frame_equal(out, expected)
1730+
1731+
val = sys.stderr.getvalue()
1732+
self.assertTrue('Skipping line 3' in val)
1733+
self.assertTrue('Skipping line 5' in val)
1734+
finally:
1735+
sys.stderr = stderr

pandas/tests/io/parser/python_parser_only.py

+5-4
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
import pandas.util.testing as tm
1515
from pandas import DataFrame, Index
1616
from pandas import compat
17+
from pandas.errors import ParserError
1718
from pandas.compat import StringIO, BytesIO, u
1819

1920

@@ -213,13 +214,13 @@ def test_multi_char_sep_quotes(self):
213214
data = 'a,,b\n1,,a\n2,,"2,,b"'
214215
msg = 'ignored when a multi-char delimiter is used'
215216

216-
with tm.assertRaisesRegexp(ValueError, msg):
217+
with tm.assertRaisesRegexp(ParserError, msg):
217218
self.read_csv(StringIO(data), sep=',,')
218219

219220
# We expect no match, so there should be an assertion
220221
# error out of the inner context manager.
221222
with tm.assertRaises(AssertionError):
222-
with tm.assertRaisesRegexp(ValueError, msg):
223+
with tm.assertRaisesRegexp(ParserError, msg):
223224
self.read_csv(StringIO(data), sep=',,',
224225
quoting=csv.QUOTE_NONE)
225226

@@ -231,11 +232,11 @@ def test_skipfooter_bad_row(self):
231232

232233
for data in ('a\n1\n"b"a',
233234
'a,b,c\ncat,foo,bar\ndog,foo,"baz'):
234-
with tm.assertRaisesRegexp(csv.Error, msg):
235+
with tm.assertRaisesRegexp(ParserError, msg):
235236
self.read_csv(StringIO(data), skipfooter=1)
236237

237238
# We expect no match, so there should be an assertion
238239
# error out of the inner context manager.
239240
with tm.assertRaises(AssertionError):
240-
with tm.assertRaisesRegexp(csv.Error, msg):
241+
with tm.assertRaisesRegexp(ParserError, msg):
241242
self.read_csv(StringIO(data))

0 commit comments

Comments (0)