From 884387e570a5c342af1110528059ffeadeaaab5c Mon Sep 17 00:00:00 2001 From: gfyoung Date: Wed, 5 Apr 2017 17:46:11 -0400 Subject: [PATCH] BUG: Standardize malformed row handling in Python engine Closes gh-15910. --- doc/source/whatsnew/v0.20.0.txt | 4 +- pandas/io/parsers.py | 87 +++++++++++--------- pandas/tests/io/parser/c_parser_only.py | 9 ++ pandas/tests/io/parser/python_parser_only.py | 18 ++-- 4 files changed, 72 insertions(+), 46 deletions(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index ad190671cbbdc..462341d3d692d 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -365,6 +365,7 @@ Other Enhancements - ``pandas.io.json.json_normalize()`` gained the option ``errors='ignore'|'raise'``; the default is ``errors='raise'`` which is backward compatible. (:issue:`14583`) - ``pandas.io.json.json_normalize()`` with an empty ``list`` will return an empty ``DataFrame`` (:issue:`15534`) - ``pandas.io.json.json_normalize()`` has gained a ``sep`` option that accepts ``str`` to separate joined fields; the default is ".", which is backward compatible. (:issue:`14883`) +- ``pd.read_csv()`` will now raise a ``csv.Error`` error whenever an end-of-file character is encountered in the middle of a data row (:issue:`15913`) .. _ISO 8601 duration: https://en.wikipedia.org/wiki/ISO_8601#Durations @@ -1034,7 +1035,8 @@ I/O - Bug in ``pd.read_csv()`` with ``float_precision='round_trip'`` which caused a segfault when a text entry is parsed (:issue:`15140`) - Bug in ``pd.read_csv()`` when an index was specified and no values were specified as null values (:issue:`15835`) - Bug in ``pd.read_csv()`` in which certain invalid file objects caused the Python interpreter to crash (:issue:`15337`) -- Added checks in ``pd.read_csv()`` ensuring that values for ``nrows`` and ``chunksize`` are valid (:issue:`15767`) +- Bug in ``pd.read_csv()`` in which invalid values for ``nrows`` and ``chunksize`` were allowed (:issue:`15767`) +- Bug in ``pd.read_csv()`` for the Python engine in which unhelpful error messages were being raised when parsing errors occurred (:issue:`15910`) - Bug in ``pd.tools.hashing.hash_pandas_object()`` in which hashing of categoricals depended on the ordering of categories, instead of just their values. (:issue:`15143`) - Bug in ``.to_json()`` where ``lines=True`` and contents (keys or values) contain escaped characters (:issue:`15096`) - Bug in ``.to_json()`` causing single byte ascii characters to be expanded to four byte unicode (:issue:`15344`) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index b624d2cc0c7ad..a85f9cda50879 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -2469,26 +2469,7 @@ def _next_line(self): next(self.data) while True: - try: - orig_line = next(self.data) - except csv.Error as e: - msg = str(e) - - if 'NULL byte' in str(e): - msg = ('NULL byte detected. This byte ' - 'cannot be processed in Python\'s ' - 'native csv library at the moment, ' - 'so please pass in engine=\'c\' instead') - - if self.skipfooter > 0: - reason = ('Error could possibly be due to ' - 'parsing errors in the skipped footer rows ' - '(the skipfooter keyword is only applied ' - 'after Python\'s csv library has parsed ' - 'all rows).') - msg += '. ' + reason - - raise csv.Error(msg) + orig_line = self._next_iter_line() line = self._check_comments([orig_line])[0] self.pos += 1 if (not self.skip_blank_lines and @@ -2510,6 +2491,43 @@ def _next_line(self): self.buf.append(line) return line + def _next_iter_line(self, **kwargs): + """ + Wrapper around iterating through `self.data` (CSV source). + + When a CSV error is raised, we check for specific + error messages that allow us to customize the + error message displayed to the user. + + Parameters + ---------- + kwargs : Keyword arguments used to customize the error message. + """ + + try: + return next(self.data) + except csv.Error as e: + msg = str(e) + + if 'NULL byte' in msg: + msg = ('NULL byte detected. This byte ' + 'cannot be processed in Python\'s ' + 'native csv library at the moment, ' + 'so please pass in engine=\'c\' instead') + elif 'newline inside string' in msg: + msg = ('EOF inside string starting with ' + 'line ' + str(kwargs['row_num'])) + + if self.skipfooter > 0: + reason = ('Error could possibly be due to ' + 'parsing errors in the skipped footer rows ' + '(the skipfooter keyword is only applied ' + 'after Python\'s csv library has parsed ' + 'all rows).') + msg += '. ' + reason + + raise csv.Error(msg) + def _check_comments(self, lines): if self.comment is None: return lines @@ -2688,7 +2706,6 @@ def _rows_to_cols(self, content): return zipped_content def _get_lines(self, rows=None): - source = self.data lines = self.buf new_rows = None @@ -2703,14 +2720,14 @@ def _get_lines(self, rows=None): rows -= len(self.buf) if new_rows is None: - if isinstance(source, list): - if self.pos > len(source): + if isinstance(self.data, list): + if self.pos > len(self.data): raise StopIteration if rows is None: - new_rows = source[self.pos:] - new_pos = len(source) + new_rows = self.data[self.pos:] + new_pos = len(self.data) else: - new_rows = source[self.pos:self.pos + rows] + new_rows = self.data[self.pos:self.pos + rows] new_pos = self.pos + rows # Check for stop rows. n.b.: self.skiprows is a set. @@ -2726,21 +2743,17 @@ def _get_lines(self, rows=None): try: if rows is not None: for _ in range(rows): - new_rows.append(next(source)) + new_rows.append(next(self.data)) lines.extend(new_rows) else: rows = 0 + while True: - try: - new_rows.append(next(source)) - rows += 1 - except csv.Error as inst: - if 'newline inside string' in str(inst): - row_num = str(self.pos + rows) - msg = ('EOF inside string starting with ' - 'line ' + row_num) - raise Exception(msg) - raise + new_row = self._next_iter_line( + row_num=self.pos + rows) + new_rows.append(new_row) + rows += 1 + except StopIteration: if self.skiprows: new_rows = [row for i, row in enumerate(new_rows) diff --git a/pandas/tests/io/parser/c_parser_only.py b/pandas/tests/io/parser/c_parser_only.py index ffbd904843bfc..837b7a7922d75 100644 --- a/pandas/tests/io/parser/c_parser_only.py +++ b/pandas/tests/io/parser/c_parser_only.py @@ -408,3 +408,12 @@ def test_large_difference_in_columns(self): expected = DataFrame([row.split(',')[0] for row in rows]) tm.assert_frame_equal(result, expected) + + def test_data_after_quote(self): + # see gh-15910 + + data = 'a\n1\n"b"a' + result = self.read_csv(StringIO(data)) + expected = DataFrame({'a': ['1', 'ba']}) + + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/python_parser_only.py b/pandas/tests/io/parser/python_parser_only.py index bd76070933c47..36356315419c4 100644 --- a/pandas/tests/io/parser/python_parser_only.py +++ b/pandas/tests/io/parser/python_parser_only.py @@ -225,15 +225,17 @@ def test_multi_char_sep_quotes(self): def test_skipfooter_bad_row(self): # see gh-13879 + # see gh-15910 - data = 'a,b,c\ncat,foo,bar\ndog,foo,"baz' msg = 'parsing errors in the skipped footer rows' - with tm.assertRaisesRegexp(csv.Error, msg): - self.read_csv(StringIO(data), skipfooter=1) - - # We expect no match, so there should be an assertion - # error out of the inner context manager. - with tm.assertRaises(AssertionError): + for data in ('a\n1\n"b"a', + 'a,b,c\ncat,foo,bar\ndog,foo,"baz'): with tm.assertRaisesRegexp(csv.Error, msg): - self.read_csv(StringIO(data)) + self.read_csv(StringIO(data), skipfooter=1) + + # We expect no match, so there should be an assertion + # error out of the inner context manager. + with tm.assertRaises(AssertionError): + with tm.assertRaisesRegexp(csv.Error, msg): + self.read_csv(StringIO(data))