BUG: Standardize malformed row handling in Python engine #15913

Merged: 1 commit, Apr 6, 2017
4 changes: 3 additions & 1 deletion doc/source/whatsnew/v0.20.0.txt
@@ -365,6 +365,7 @@ Other Enhancements
- ``pandas.io.json.json_normalize()`` gained the option ``errors='ignore'|'raise'``; the default is ``errors='raise'`` which is backward compatible. (:issue:`14583`)
- ``pandas.io.json.json_normalize()`` with an empty ``list`` will return an empty ``DataFrame`` (:issue:`15534`)
- ``pandas.io.json.json_normalize()`` has gained a ``sep`` option that accepts ``str`` to separate joined fields; the default is ".", which is backward compatible. (:issue:`14883`)
+- ``pd.read_csv()`` will now raise a ``csv.Error`` whenever an end-of-file character is encountered in the middle of a data row (:issue:`15913`)


.. _ISO 8601 duration: https://en.wikipedia.org/wiki/ISO_8601#Durations
@@ -1034,7 +1035,8 @@ I/O
- Bug in ``pd.read_csv()`` with ``float_precision='round_trip'`` which caused a segfault when a text entry is parsed (:issue:`15140`)
- Bug in ``pd.read_csv()`` when an index was specified and no values were specified as null values (:issue:`15835`)
- Bug in ``pd.read_csv()`` in which certain invalid file objects caused the Python interpreter to crash (:issue:`15337`)
-- Added checks in ``pd.read_csv()`` ensuring that values for ``nrows`` and ``chunksize`` are valid (:issue:`15767`)
+- Bug in ``pd.read_csv()`` in which invalid values for ``nrows`` and ``chunksize`` were allowed (:issue:`15767`)
+- Bug in ``pd.read_csv()`` for the Python engine in which unhelpful error messages were being raised when parsing errors occurred (:issue:`15910`)
- Bug in ``pd.tools.hashing.hash_pandas_object()`` in which hashing of categoricals depended on the ordering of categories, instead of just their values. (:issue:`15143`)
- Bug in ``.to_json()`` where ``lines=True`` and contents (keys or values) contain escaped characters (:issue:`15096`)
- Bug in ``.to_json()`` causing single byte ascii characters to be expanded to four byte unicode (:issue:`15344`)
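
For context, the behavior these whatsnew entries describe can be reproduced as follows. This is a minimal sketch, assuming pandas 0.20+ with the Python engine; the message text in the comment is illustrative, not guaranteed verbatim:

```python
import csv
from io import StringIO

import pandas as pd

# A quoted field that is never closed, so the file ends mid-string.
data = 'a,b,c\ncat,foo,bar\ndog,foo,"baz'

try:
    pd.read_csv(StringIO(data), engine='python')
except csv.Error as err:
    # Previously this path could surface a bare Exception; after this
    # change the Python engine consistently raises csv.Error, e.g.
    # "EOF inside string starting with line 3".
    print(err)
```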
87 changes: 50 additions & 37 deletions pandas/io/parsers.py
@@ -2469,26 +2469,7 @@ def _next_line(self):
                next(self.data)

            while True:
-                try:
-                    orig_line = next(self.data)
-                except csv.Error as e:
-                    msg = str(e)
-
-                    if 'NULL byte' in str(e):
-                        msg = ('NULL byte detected. This byte '
-                               'cannot be processed in Python\'s '
-                               'native csv library at the moment, '
-                               'so please pass in engine=\'c\' instead')
-
-                    if self.skipfooter > 0:
-                        reason = ('Error could possibly be due to '
-                                  'parsing errors in the skipped footer rows '
-                                  '(the skipfooter keyword is only applied '
-                                  'after Python\'s csv library has parsed '
-                                  'all rows).')
-                        msg += '. ' + reason
-
-                    raise csv.Error(msg)
+                orig_line = self._next_iter_line()
                line = self._check_comments([orig_line])[0]
                self.pos += 1
                if (not self.skip_blank_lines and
@@ -2510,6 +2491,43 @@ def _next_line(self):
        self.buf.append(line)
        return line

+    def _next_iter_line(self, **kwargs):
+        """
+        Wrapper around iterating through `self.data` (CSV source).
+
+        When a CSV error is raised, we check for specific
+        error messages that allow us to customize the
+        error message displayed to the user.
+
+        Parameters
+        ----------
+        kwargs : Keyword arguments used to customize the error message.
+        """
+
+        try:
+            return next(self.data)
+        except csv.Error as e:
+            msg = str(e)
+
+            if 'NULL byte' in msg:
+                msg = ('NULL byte detected. This byte '
+                       'cannot be processed in Python\'s '
+                       'native csv library at the moment, '
+                       'so please pass in engine=\'c\' instead')
+            elif 'newline inside string' in msg:
+                msg = ('EOF inside string starting with '
+                       'line ' + str(kwargs['row_num']))
+
+            if self.skipfooter > 0:
+                reason = ('Error could possibly be due to '
+                          'parsing errors in the skipped footer rows '
+                          '(the skipfooter keyword is only applied '
+                          'after Python\'s csv library has parsed '
+                          'all rows).')
+                msg += '. ' + reason
+
+            raise csv.Error(msg)
+
    def _check_comments(self, lines):
        if self.comment is None:
            return lines
@@ -2688,7 +2706,6 @@ def _rows_to_cols(self, content):
        return zipped_content

    def _get_lines(self, rows=None):
-        source = self.data
        lines = self.buf
        new_rows = None

@@ -2703,14 +2720,14 @@ def _get_lines(self, rows=None):
            rows -= len(self.buf)

        if new_rows is None:
-            if isinstance(source, list):
-                if self.pos > len(source):
+            if isinstance(self.data, list):
+                if self.pos > len(self.data):
                    raise StopIteration
                if rows is None:
-                    new_rows = source[self.pos:]
-                    new_pos = len(source)
+                    new_rows = self.data[self.pos:]
+                    new_pos = len(self.data)
                else:
-                    new_rows = source[self.pos:self.pos + rows]
+                    new_rows = self.data[self.pos:self.pos + rows]
                    new_pos = self.pos + rows

                # Check for stop rows. n.b.: self.skiprows is a set.
@@ -2726,21 +2743,17 @@ def _get_lines(self, rows=None):
                try:
                    if rows is not None:
                        for _ in range(rows):
-                            new_rows.append(next(source))
+                            new_rows.append(next(self.data))
                        lines.extend(new_rows)
                    else:
                        rows = 0

                        while True:
-                            try:
-                                new_rows.append(next(source))
-                                rows += 1
-                            except csv.Error as inst:
-                                if 'newline inside string' in str(inst):
-                                    row_num = str(self.pos + rows)
-                                    msg = ('EOF inside string starting with '
-                                           'line ' + row_num)
-                                    raise Exception(msg)
-                                raise
+                            new_row = self._next_iter_line(
+                                row_num=self.pos + rows)
+                            new_rows.append(new_row)
+                            rows += 1

                except StopIteration:
                    if self.skiprows:
                        new_rows = [row for i, row in enumerate(new_rows)
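
The refactor above folds two copies of the same ``except csv.Error`` handling into the single ``_next_iter_line`` helper. A standalone sketch of that pattern, with illustrative names (``next_line_with_context`` and its parameters are not pandas internals):

```python
import csv


def next_line_with_context(reader, row_num=None):
    """Advance ``reader`` (e.g. a ``csv.reader``), rewriting the
    stdlib's terse messages before re-raising.
    """
    try:
        return next(reader)
    except csv.Error as e:
        msg = str(e)
        if 'NULL byte' in msg:
            msg = 'NULL byte detected: consider the C engine instead'
        elif 'newline inside string' in msg and row_num is not None:
            msg = 'EOF inside string starting with line %d' % row_num
        # Re-raise as csv.Error so callers keep catching a single type.
        raise csv.Error(msg)
```

Re-raising a ``csv.Error`` (rather than the bare ``Exception`` the old ``_get_lines`` path used) is what lets every malformed-row case below be asserted against one error type.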
9 changes: 9 additions & 0 deletions pandas/tests/io/parser/c_parser_only.py
@@ -408,3 +408,12 @@ def test_large_difference_in_columns(self):
        expected = DataFrame([row.split(',')[0] for row in rows])

        tm.assert_frame_equal(result, expected)
+
+    def test_data_after_quote(self):
+        # see gh-15910
+
+        data = 'a\n1\n"b"a'
+        result = self.read_csv(StringIO(data))
+        expected = DataFrame({'a': ['1', 'ba']})
+
+        tm.assert_frame_equal(result, expected)
18 changes: 10 additions & 8 deletions pandas/tests/io/parser/python_parser_only.py
@@ -225,15 +225,17 @@ def test_multi_char_sep_quotes(self):

    def test_skipfooter_bad_row(self):
        # see gh-13879
+        # see gh-15910

-        data = 'a,b,c\ncat,foo,bar\ndog,foo,"baz'
        msg = 'parsing errors in the skipped footer rows'

-        with tm.assertRaisesRegexp(csv.Error, msg):
-            self.read_csv(StringIO(data), skipfooter=1)
-
-        # We expect no match, so there should be an assertion
-        # error out of the inner context manager.
-        with tm.assertRaises(AssertionError):
-            with tm.assertRaisesRegexp(csv.Error, msg):
-                self.read_csv(StringIO(data))
+        for data in ('a\n1\n"b"a',
+                     'a,b,c\ncat,foo,bar\ndog,foo,"baz'):
+            with tm.assertRaisesRegexp(csv.Error, msg):
+                self.read_csv(StringIO(data), skipfooter=1)
+
+            # We expect no match, so there should be an assertion
+            # error out of the inner context manager.
+            with tm.assertRaises(AssertionError):
+                with tm.assertRaisesRegexp(csv.Error, msg):
+                    self.read_csv(StringIO(data))
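
The nested context managers above assert a *non*-match: the outer ``assertRaises(AssertionError)`` passes precisely because the inner ``assertRaisesRegexp`` cannot find the skipfooter hint. A hedged sketch of the behavior under test (assuming the Python engine on pandas 0.20+):

```python
import csv
from io import StringIO

import pandas as pd

data = 'a,b,c\ncat,foo,bar\ndog,foo,"baz'

# With skipfooter, the message points at the skipped footer rows...
try:
    pd.read_csv(StringIO(data), engine='python', skipfooter=1)
except csv.Error as err:
    assert 'parsing errors in the skipped footer rows' in str(err)

# ...without skipfooter, that hint would be misleading and is omitted.
try:
    pd.read_csv(StringIO(data), engine='python')
except csv.Error as err:
    assert 'skipped footer rows' not in str(err)
```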