From 884387e570a5c342af1110528059ffeadeaaab5c Mon Sep 17 00:00:00 2001
From: gfyoung <gfyoung17@gmail.com>
Date: Wed, 5 Apr 2017 17:46:11 -0400
Subject: [PATCH] BUG: Standardize malformed row handling in Python engine

Closes gh-15910.
---
 doc/source/whatsnew/v0.20.0.txt              |  4 +-
 pandas/io/parsers.py                         | 87 +++++++++++---------
 pandas/tests/io/parser/c_parser_only.py      |  9 ++
 pandas/tests/io/parser/python_parser_only.py | 18 ++--
 4 files changed, 72 insertions(+), 46 deletions(-)

diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
index ad190671cbbdc..462341d3d692d 100644
--- a/doc/source/whatsnew/v0.20.0.txt
+++ b/doc/source/whatsnew/v0.20.0.txt
@@ -365,6 +365,7 @@ Other Enhancements
 - ``pandas.io.json.json_normalize()`` gained the option ``errors='ignore'|'raise'``; the default is ``errors='raise'`` which is backward compatible. (:issue:`14583`)
 - ``pandas.io.json.json_normalize()`` with an empty ``list`` will return an empty ``DataFrame`` (:issue:`15534`)
 - ``pandas.io.json.json_normalize()`` has gained a ``sep`` option that accepts ``str`` to separate joined fields; the default is ".", which is backward compatible. (:issue:`14883`)
+- ``pd.read_csv()`` will now raise a ``csv.Error`` whenever an end-of-file character is encountered in the middle of a data row (:issue:`15913`)
 
 
 .. _ISO 8601 duration: https://en.wikipedia.org/wiki/ISO_8601#Durations
@@ -1034,7 +1035,8 @@ I/O
 - Bug in ``pd.read_csv()`` with ``float_precision='round_trip'`` which caused a segfault when a text entry is parsed (:issue:`15140`)
 - Bug in ``pd.read_csv()`` when an index was specified and no values were specified as null values (:issue:`15835`)
 - Bug in ``pd.read_csv()`` in which certain invalid file objects caused the Python interpreter to crash (:issue:`15337`)
-- Added checks in ``pd.read_csv()`` ensuring that values for ``nrows`` and ``chunksize`` are valid (:issue:`15767`)
+- Bug in ``pd.read_csv()`` in which invalid values for ``nrows`` and ``chunksize`` were allowed (:issue:`15767`)
+- Bug in ``pd.read_csv()`` for the Python engine in which unhelpful error messages were being raised when parsing errors occurred (:issue:`15910`)
 - Bug in ``pd.tools.hashing.hash_pandas_object()`` in which hashing of categoricals depended on the ordering of categories, instead of just their values. (:issue:`15143`)
 - Bug in ``.to_json()`` where ``lines=True`` and contents (keys or values) contain escaped characters (:issue:`15096`)
 - Bug in ``.to_json()`` causing single byte ascii characters to be expanded to four byte unicode (:issue:`15344`)
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
index b624d2cc0c7ad..a85f9cda50879 100755
--- a/pandas/io/parsers.py
+++ b/pandas/io/parsers.py
@@ -2469,26 +2469,7 @@ def _next_line(self):
                 next(self.data)
 
             while True:
-                try:
-                    orig_line = next(self.data)
-                except csv.Error as e:
-                    msg = str(e)
-
-                    if 'NULL byte' in str(e):
-                        msg = ('NULL byte detected. This byte '
-                               'cannot be processed in Python\'s '
-                               'native csv library at the moment, '
-                               'so please pass in engine=\'c\' instead')
-
-                    if self.skipfooter > 0:
-                        reason = ('Error could possibly be due to '
-                                  'parsing errors in the skipped footer rows '
-                                  '(the skipfooter keyword is only applied '
-                                  'after Python\'s csv library has parsed '
-                                  'all rows).')
-                        msg += '. ' + reason
-
-                    raise csv.Error(msg)
+                orig_line = self._next_iter_line()
                 line = self._check_comments([orig_line])[0]
                 self.pos += 1
                 if (not self.skip_blank_lines and
@@ -2510,6 +2491,43 @@ def _next_line(self):
         self.buf.append(line)
         return line
 
+    def _next_iter_line(self, **kwargs):
+        """
+        Return the next item from `self.data` (CSV source). Any `csv.Error`
+        is re-raised with a customized message: NULL byte, EOF inside a
+        string, plus a `skipfooter` hint when `self.skipfooter > 0`.
+
+        Parameters
+        ----------
+        kwargs : `row_num` (optional), the line number reported when
+            EOF is hit inside a string. If omitted, the original csv
+            message is kept instead of failing with a KeyError.
+        """
+
+        try:
+            return next(self.data)
+        except csv.Error as e:
+            msg = str(e)
+
+            if 'NULL byte' in msg:
+                msg = ('NULL byte detected. This byte '
+                       'cannot be processed in Python\'s '
+                       'native csv library at the moment, '
+                       'so please pass in engine=\'c\' instead')
+            elif 'newline inside string' in msg and 'row_num' in kwargs:
+                msg = ('EOF inside string starting with '
+                       'line ' + str(kwargs['row_num']))
+
+            if self.skipfooter > 0:
+                reason = ('Error could possibly be due to '
+                          'parsing errors in the skipped footer rows '
+                          '(the skipfooter keyword is only applied '
+                          'after Python\'s csv library has parsed '
+                          'all rows).')
+                msg += '. ' + reason
+
+            raise csv.Error(msg)
+
     def _check_comments(self, lines):
         if self.comment is None:
             return lines
@@ -2688,7 +2706,6 @@ def _rows_to_cols(self, content):
         return zipped_content
 
     def _get_lines(self, rows=None):
-        source = self.data
         lines = self.buf
         new_rows = None
 
@@ -2703,14 +2720,14 @@ def _get_lines(self, rows=None):
                 rows -= len(self.buf)
 
         if new_rows is None:
-            if isinstance(source, list):
-                if self.pos > len(source):
+            if isinstance(self.data, list):
+                if self.pos > len(self.data):
                     raise StopIteration
                 if rows is None:
-                    new_rows = source[self.pos:]
-                    new_pos = len(source)
+                    new_rows = self.data[self.pos:]
+                    new_pos = len(self.data)
                 else:
-                    new_rows = source[self.pos:self.pos + rows]
+                    new_rows = self.data[self.pos:self.pos + rows]
                     new_pos = self.pos + rows
 
                 # Check for stop rows. n.b.: self.skiprows is a set.
@@ -2726,21 +2743,17 @@ def _get_lines(self, rows=None):
                 try:
                     if rows is not None:
                         for _ in range(rows):
-                            new_rows.append(next(source))
+                            new_rows.append(next(self.data))
                         lines.extend(new_rows)
                     else:
                         rows = 0
+
                         while True:
-                            try:
-                                new_rows.append(next(source))
-                                rows += 1
-                            except csv.Error as inst:
-                                if 'newline inside string' in str(inst):
-                                    row_num = str(self.pos + rows)
-                                    msg = ('EOF inside string starting with '
-                                           'line ' + row_num)
-                                    raise Exception(msg)
-                                raise
+                            new_row = self._next_iter_line(
+                                row_num=self.pos + rows)
+                            new_rows.append(new_row)
+                            rows += 1
+
                 except StopIteration:
                     if self.skiprows:
                         new_rows = [row for i, row in enumerate(new_rows)
diff --git a/pandas/tests/io/parser/c_parser_only.py b/pandas/tests/io/parser/c_parser_only.py
index ffbd904843bfc..837b7a7922d75 100644
--- a/pandas/tests/io/parser/c_parser_only.py
+++ b/pandas/tests/io/parser/c_parser_only.py
@@ -408,3 +408,12 @@ def test_large_difference_in_columns(self):
         expected = DataFrame([row.split(',')[0] for row in rows])
 
         tm.assert_frame_equal(result, expected)
+
+    def test_data_after_quote(self):
+        # see gh-15910: text after a closing quote is appended to the field
+
+        data = 'a\n1\n"b"a'
+        result = self.read_csv(StringIO(data))
+        expected = DataFrame({'a': ['1', 'ba']})
+
+        tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/io/parser/python_parser_only.py b/pandas/tests/io/parser/python_parser_only.py
index bd76070933c47..36356315419c4 100644
--- a/pandas/tests/io/parser/python_parser_only.py
+++ b/pandas/tests/io/parser/python_parser_only.py
@@ -225,15 +225,17 @@ def test_multi_char_sep_quotes(self):
 
     def test_skipfooter_bad_row(self):
         # see gh-13879
+        # see gh-15910
 
-        data = 'a,b,c\ncat,foo,bar\ndog,foo,"baz'
         msg = 'parsing errors in the skipped footer rows'
 
-        with tm.assertRaisesRegexp(csv.Error, msg):
-            self.read_csv(StringIO(data), skipfooter=1)
-
-        # We expect no match, so there should be an assertion
-        # error out of the inner context manager.
-        with tm.assertRaises(AssertionError):
+        for data in ('a\n1\n"b"a',
+                     'a,b,c\ncat,foo,bar\ndog,foo,"baz'):
             with tm.assertRaisesRegexp(csv.Error, msg):
-                self.read_csv(StringIO(data))
+                self.read_csv(StringIO(data), skipfooter=1)
+
+            # Without skipfooter the hint is not expected to match, so the
+            # inner context manager should fail with an AssertionError.
+            with tm.assertRaises(AssertionError):
+                with tm.assertRaisesRegexp(csv.Error, msg):
+                    self.read_csv(StringIO(data))