Skip to content

Commit 5d17a94

Browse files
gfyoung authored and jreback committed
ENH: Support malformed row handling in Python engine (#15925)
1 parent c25fbde commit 5d17a94

File tree

5 files changed

+155
-71
lines changed

5 files changed

+155
-71
lines changed

doc/source/io.rst

+2-2
Original file line numberDiff line numberDiff line change
@@ -342,11 +342,11 @@ error_bad_lines : boolean, default ``True``
342342
Lines with too many fields (e.g. a csv line with too many commas) will by
343343
default cause an exception to be raised, and no DataFrame will be returned. If
344344
``False``, then these "bad lines" will dropped from the DataFrame that is
345-
returned (only valid with C parser). See :ref:`bad lines <io.bad_lines>`
345+
returned. See :ref:`bad lines <io.bad_lines>`
346346
below.
347347
warn_bad_lines : boolean, default ``True``
348348
If error_bad_lines is ``False``, and warn_bad_lines is ``True``, a warning for
349-
each "bad line" will be output (only valid with C parser).
349+
each "bad line" will be output.
350350

351351
.. _io.dtypes:
352352

doc/source/whatsnew/v0.20.0.txt

+2-1
Original file line numberDiff line numberDiff line change
@@ -368,9 +368,10 @@ Other Enhancements
368368
- ``pandas.io.json.json_normalize()`` gained the option ``errors='ignore'|'raise'``; the default is ``errors='raise'`` which is backward compatible. (:issue:`14583`)
369369
- ``pandas.io.json.json_normalize()`` with an empty ``list`` will return an empty ``DataFrame`` (:issue:`15534`)
370370
- ``pandas.io.json.json_normalize()`` has gained a ``sep`` option that accepts ``str`` to separate joined fields; the default is ".", which is backward compatible. (:issue:`14883`)
371-
- ``pd.read_csv()`` will now raise a ``csv.Error`` error whenever an end-of-file character is encountered in the middle of a data row (:issue:`15913`)
372371
- A new function has been added to a ``MultiIndex`` to facilitate :ref:`Removing Unused Levels <advanced.shown_levels>`. (:issue:`15694`)
373372
- :func:`MultiIndex.remove_unused_levels` has been added to facilitate :ref:`removing unused levels <advanced.shown_levels>`. (:issue:`15694`)
373+
- ``pd.read_csv()`` will now raise a ``ParserError`` error whenever any parsing error occurs (:issue:`15913`, :issue:`15925`)
374+
- ``pd.read_csv()`` now supports the ``error_bad_lines`` and ``warn_bad_lines`` arguments for the Python parser (:issue:`15925`)
374375

375376

376377
.. _ISO 8601 duration: https://en.wikipedia.org/wiki/ISO_8601#Durations

pandas/io/parsers.py

+106-62
Original file line numberDiff line numberDiff line change
@@ -263,10 +263,10 @@
263263
Lines with too many fields (e.g. a csv line with too many commas) will by
264264
default cause an exception to be raised, and no DataFrame will be returned.
265265
If False, then these "bad lines" will dropped from the DataFrame that is
266-
returned. (Only valid with C parser)
266+
returned.
267267
warn_bad_lines : boolean, default True
268268
If error_bad_lines is False, and warn_bad_lines is True, a warning for each
269-
"bad line" will be output. (Only valid with C parser).
269+
"bad line" will be output.
270270
low_memory : boolean, default True
271271
Internally process the file in chunks, resulting in lower memory use
272272
while parsing, but possibly mixed type inference. To ensure no mixed
@@ -485,8 +485,6 @@ def _read(filepath_or_buffer, kwds):
485485
_python_unsupported = set([
486486
'low_memory',
487487
'buffer_lines',
488-
'error_bad_lines',
489-
'warn_bad_lines',
490488
'float_precision',
491489
])
492490
_deprecated_args = set([
@@ -1897,6 +1895,9 @@ def __init__(self, f, **kwds):
18971895
self.usecols, _ = _validate_usecols_arg(kwds['usecols'])
18981896
self.skip_blank_lines = kwds['skip_blank_lines']
18991897

1898+
self.warn_bad_lines = kwds['warn_bad_lines']
1899+
self.error_bad_lines = kwds['error_bad_lines']
1900+
19001901
self.names_passed = kwds['names'] or None
19011902

19021903
self.na_filter = kwds['na_filter']
@@ -2469,16 +2470,19 @@ def _next_line(self):
24692470
next(self.data)
24702471

24712472
while True:
2472-
orig_line = self._next_iter_line()
2473-
line = self._check_comments([orig_line])[0]
2473+
orig_line = self._next_iter_line(row_num=self.pos + 1)
24742474
self.pos += 1
2475-
if (not self.skip_blank_lines and
2476-
(self._empty(orig_line) or line)):
2477-
break
2478-
elif self.skip_blank_lines:
2479-
ret = self._check_empty([line])
2480-
if ret:
2481-
line = ret[0]
2475+
2476+
if orig_line is not None:
2477+
line = self._check_comments([orig_line])[0]
2478+
2479+
if self.skip_blank_lines:
2480+
ret = self._check_empty([line])
2481+
2482+
if ret:
2483+
line = ret[0]
2484+
break
2485+
elif self._empty(orig_line) or line:
24822486
break
24832487

24842488
# This was the first line of the file,
@@ -2491,7 +2495,28 @@ def _next_line(self):
24912495
self.buf.append(line)
24922496
return line
24932497

2494-
def _next_iter_line(self, **kwargs):
2498+
def _alert_malformed(self, msg, row_num):
2499+
"""
2500+
Alert a user about a malformed row.
2501+
2502+
If `self.error_bad_lines` is True, the alert will be `ParserError`.
2503+
If `self.warn_bad_lines` is True, the alert will be printed out.
2504+
2505+
Parameters
2506+
----------
2507+
msg : The error message to display.
2508+
row_num : The row number where the parsing error occurred.
2509+
Because this row number is displayed, we 1-index,
2510+
even though we 0-index internally.
2511+
"""
2512+
2513+
if self.error_bad_lines:
2514+
raise ParserError(msg)
2515+
elif self.warn_bad_lines:
2516+
base = 'Skipping line {row_num}: '.format(row_num=row_num)
2517+
sys.stderr.write(base + msg + '\n')
2518+
2519+
def _next_iter_line(self, row_num):
24952520
"""
24962521
Wrapper around iterating through `self.data` (CSV source).
24972522
@@ -2501,32 +2526,34 @@ def _next_iter_line(self, **kwargs):
25012526
25022527
Parameters
25032528
----------
2504-
kwargs : Keyword arguments used to customize the error message.
2529+
row_num : The row number of the line being parsed.
25052530
"""
25062531

25072532
try:
25082533
return next(self.data)
25092534
except csv.Error as e:
2510-
msg = str(e)
2511-
2512-
if 'NULL byte' in msg:
2513-
msg = ('NULL byte detected. This byte '
2514-
'cannot be processed in Python\'s '
2515-
'native csv library at the moment, '
2516-
'so please pass in engine=\'c\' instead')
2517-
elif 'newline inside string' in msg:
2518-
msg = ('EOF inside string starting with '
2519-
'line ' + str(kwargs['row_num']))
2520-
2521-
if self.skipfooter > 0:
2522-
reason = ('Error could possibly be due to '
2523-
'parsing errors in the skipped footer rows '
2524-
'(the skipfooter keyword is only applied '
2525-
'after Python\'s csv library has parsed '
2526-
'all rows).')
2527-
msg += '. ' + reason
2528-
2529-
raise csv.Error(msg)
2535+
if self.warn_bad_lines or self.error_bad_lines:
2536+
msg = str(e)
2537+
2538+
if 'NULL byte' in msg:
2539+
msg = ('NULL byte detected. This byte '
2540+
'cannot be processed in Python\'s '
2541+
'native csv library at the moment, '
2542+
'so please pass in engine=\'c\' instead')
2543+
elif 'newline inside string' in msg:
2544+
msg = ('EOF inside string starting with '
2545+
'line ' + str(row_num))
2546+
2547+
if self.skipfooter > 0:
2548+
reason = ('Error could possibly be due to '
2549+
'parsing errors in the skipped footer rows '
2550+
'(the skipfooter keyword is only applied '
2551+
'after Python\'s csv library has parsed '
2552+
'all rows).')
2553+
msg += '. ' + reason
2554+
2555+
self._alert_malformed(msg, row_num)
2556+
return None
25302557

25312558
def _check_comments(self, lines):
25322559
if self.comment is None:
@@ -2657,42 +2684,57 @@ def _get_index_name(self, columns):
26572684
return index_name, orig_names, columns
26582685

26592686
def _rows_to_cols(self, content):
2687+
if self.skipfooter < 0:
2688+
raise ValueError('skip footer cannot be negative')
2689+
26602690
col_len = self.num_original_columns
26612691

26622692
if self._implicit_index:
26632693
col_len += len(self.index_col)
26642694

2665-
# see gh-13320
2666-
zipped_content = list(lib.to_object_array(
2667-
content, min_width=col_len).T)
2668-
zip_len = len(zipped_content)
2669-
2670-
if self.skipfooter < 0:
2671-
raise ValueError('skip footer cannot be negative')
2695+
max_len = max([len(row) for row in content])
26722696

2673-
# Loop through rows to verify lengths are correct.
2674-
if (col_len != zip_len and
2697+
# Check that there are no rows with too many
2698+
# elements in their row (rows with too few
2699+
# elements are padded with NaN).
2700+
if (max_len > col_len and
26752701
self.index_col is not False and
26762702
self.usecols is None):
2677-
i = 0
2678-
for (i, l) in enumerate(content):
2679-
if len(l) != col_len:
2680-
break
26812703

2682-
footers = 0
2683-
if self.skipfooter:
2684-
footers = self.skipfooter
2704+
footers = self.skipfooter if self.skipfooter else 0
2705+
bad_lines = []
26852706

2686-
row_num = self.pos - (len(content) - i + footers)
2707+
iter_content = enumerate(content)
2708+
content_len = len(content)
2709+
content = []
26872710

2688-
msg = ('Expected %d fields in line %d, saw %d' %
2689-
(col_len, row_num + 1, zip_len))
2690-
if len(self.delimiter) > 1 and self.quoting != csv.QUOTE_NONE:
2691-
# see gh-13374
2692-
reason = ('Error could possibly be due to quotes being '
2693-
'ignored when a multi-char delimiter is used.')
2694-
msg += '. ' + reason
2695-
raise ValueError(msg)
2711+
for (i, l) in iter_content:
2712+
actual_len = len(l)
2713+
2714+
if actual_len > col_len:
2715+
if self.error_bad_lines or self.warn_bad_lines:
2716+
row_num = self.pos - (content_len - i + footers)
2717+
bad_lines.append((row_num, actual_len))
2718+
2719+
if self.error_bad_lines:
2720+
break
2721+
else:
2722+
content.append(l)
2723+
2724+
for row_num, actual_len in bad_lines:
2725+
msg = ('Expected %d fields in line %d, saw %d' %
2726+
(col_len, row_num + 1, actual_len))
2727+
if len(self.delimiter) > 1 and self.quoting != csv.QUOTE_NONE:
2728+
# see gh-13374
2729+
reason = ('Error could possibly be due to quotes being '
2730+
'ignored when a multi-char delimiter is used.')
2731+
msg += '. ' + reason
2732+
2733+
self._alert_malformed(msg, row_num + 1)
2734+
2735+
# see gh-13320
2736+
zipped_content = list(lib.to_object_array(
2737+
content, min_width=col_len).T)
26962738

26972739
if self.usecols:
26982740
if self._implicit_index:
@@ -2750,10 +2792,12 @@ def _get_lines(self, rows=None):
27502792

27512793
while True:
27522794
new_row = self._next_iter_line(
2753-
row_num=self.pos + rows)
2754-
new_rows.append(new_row)
2795+
row_num=self.pos + rows + 1)
27552796
rows += 1
27562797

2798+
if new_row is not None:
2799+
new_rows.append(new_row)
2800+
27572801
except StopIteration:
27582802
if self.skiprows:
27592803
new_rows = [row for i, row in enumerate(new_rows)

pandas/tests/io/parser/common.py

+40-2
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919
from pandas import compat
2020
from pandas.compat import (StringIO, BytesIO, PY3,
2121
range, lrange, u)
22-
from pandas.errors import DtypeWarning, EmptyDataError
22+
from pandas.errors import DtypeWarning, EmptyDataError, ParserError
2323
from pandas.io.common import URLError
2424
from pandas.io.parsers import TextFileReader, TextParser
2525

@@ -1569,7 +1569,7 @@ def test_null_byte_char(self):
15691569
tm.assert_frame_equal(out, expected)
15701570
else:
15711571
msg = "NULL byte detected"
1572-
with tm.assertRaisesRegexp(csv.Error, msg):
1572+
with tm.assertRaisesRegexp(ParserError, msg):
15731573
self.read_csv(StringIO(data), names=cols)
15741574

15751575
def test_utf8_bom(self):
@@ -1695,3 +1695,41 @@ class InvalidBuffer(object):
16951695

16961696
with tm.assertRaisesRegexp(ValueError, msg):
16971697
self.read_csv(mock.Mock())
1698+
1699+
def test_skip_bad_lines(self):
1700+
# see gh-15925
1701+
data = 'a\n1\n1,2,3\n4\n5,6,7'
1702+
1703+
with tm.assertRaises(ParserError):
1704+
self.read_csv(StringIO(data))
1705+
1706+
with tm.assertRaises(ParserError):
1707+
self.read_csv(StringIO(data), error_bad_lines=True)
1708+
1709+
stderr = sys.stderr
1710+
expected = DataFrame({'a': [1, 4]})
1711+
1712+
sys.stderr = StringIO()
1713+
try:
1714+
out = self.read_csv(StringIO(data),
1715+
error_bad_lines=False,
1716+
warn_bad_lines=False)
1717+
tm.assert_frame_equal(out, expected)
1718+
1719+
val = sys.stderr.getvalue()
1720+
self.assertEqual(val, '')
1721+
finally:
1722+
sys.stderr = stderr
1723+
1724+
sys.stderr = StringIO()
1725+
try:
1726+
out = self.read_csv(StringIO(data),
1727+
error_bad_lines=False,
1728+
warn_bad_lines=True)
1729+
tm.assert_frame_equal(out, expected)
1730+
1731+
val = sys.stderr.getvalue()
1732+
self.assertTrue('Skipping line 3' in val)
1733+
self.assertTrue('Skipping line 5' in val)
1734+
finally:
1735+
sys.stderr = stderr

pandas/tests/io/parser/python_parser_only.py

+5-4
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
import pandas.util.testing as tm
1515
from pandas import DataFrame, Index
1616
from pandas import compat
17+
from pandas.errors import ParserError
1718
from pandas.compat import StringIO, BytesIO, u
1819

1920

@@ -213,13 +214,13 @@ def test_multi_char_sep_quotes(self):
213214
data = 'a,,b\n1,,a\n2,,"2,,b"'
214215
msg = 'ignored when a multi-char delimiter is used'
215216

216-
with tm.assertRaisesRegexp(ValueError, msg):
217+
with tm.assertRaisesRegexp(ParserError, msg):
217218
self.read_csv(StringIO(data), sep=',,')
218219

219220
# We expect no match, so there should be an assertion
220221
# error out of the inner context manager.
221222
with tm.assertRaises(AssertionError):
222-
with tm.assertRaisesRegexp(ValueError, msg):
223+
with tm.assertRaisesRegexp(ParserError, msg):
223224
self.read_csv(StringIO(data), sep=',,',
224225
quoting=csv.QUOTE_NONE)
225226

@@ -231,11 +232,11 @@ def test_skipfooter_bad_row(self):
231232

232233
for data in ('a\n1\n"b"a',
233234
'a,b,c\ncat,foo,bar\ndog,foo,"baz'):
234-
with tm.assertRaisesRegexp(csv.Error, msg):
235+
with tm.assertRaisesRegexp(ParserError, msg):
235236
self.read_csv(StringIO(data), skipfooter=1)
236237

237238
# We expect no match, so there should be an assertion
238239
# error out of the inner context manager.
239240
with tm.assertRaises(AssertionError):
240-
with tm.assertRaisesRegexp(csv.Error, msg):
241+
with tm.assertRaisesRegexp(ParserError, msg):
241242
self.read_csv(StringIO(data))

0 commit comments

Comments (0)