diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 425b8daec6081..e3b3759df60c4 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -742,7 +742,7 @@ Bug Fixes ~~~~~~~~~ - Bug in ``groupby().shift()``, which could cause a segfault or corruption in rare circumstances when grouping by columns with missing values (:issue:`13813`) -- Bug in ``pd.read_csv()``, which may cause a segfault or corruption when iterating in large chunks over a stream/file under rare circumstances (:issue:`13703`) +- Bug in ``pd.read_csv()``, which may cause a segfault or corruption when iterating in large chunks over a stream/file under rare circumstances (:issue:`13703`, :issue:`5291`, :issue:`11793`) - Bug in ``io.json.json_normalize()``, where non-ascii keys raised an exception (:issue:`13213`) - Bug in ``SparseSeries`` with ``MultiIndex`` ``[]`` indexing may raise ``IndexError`` (:issue:`13144`) - Bug in ``SparseSeries`` with ``MultiIndex`` ``[]`` indexing result may have normal ``Index`` (:issue:`13144`) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index abbe7bdf18461..8c615741679b5 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -2190,7 +2190,17 @@ def _next_line(self): next(self.data) while True: - orig_line = next(self.data) + try: + orig_line = next(self.data) + except csv.Error as e: + if 'NULL byte' in str(e): + raise csv.Error( + 'NULL byte detected. This byte ' 'cannot be processed in Python\'s ' 'native csv library at the moment, ' 'so please pass in engine=\'c\' instead.') + else: + raise line = self._check_comments([orig_line])[0] self.pos += 1 if (not self.skip_blank_lines and diff --git a/pandas/io/tests/parser/c_parser_only.py b/pandas/io/tests/parser/c_parser_only.py index 103c9fa2b7ce8..d3eb8664d4d3a 100644 --- a/pandas/io/tests/parser/c_parser_only.py +++ b/pandas/io/tests/parser/c_parser_only.py @@ -439,7 +439,7 @@ def test_parse_trim_buffers(self): # Generate the expected output: manually create the dataframe # by splitting by comma and repeating the `n_lines` times. - row = tuple(val_ if val_ else float("nan") + row = tuple(val_ if val_ else np.nan for val_ in record_.split(",")) expected = pd.DataFrame([row for _ in range(n_lines)], dtype=object, columns=None, index=None) @@ -447,7 +447,16 @@ def test_parse_trim_buffers(self): # Iterate over the CSV file in chunks of `chunksize` lines chunks_ = self.read_csv(StringIO(csv_data), header=None, dtype=object, chunksize=chunksize) - result = pd.concat(chunks_, axis=0, ignore_index=True) + result1 = pd.concat(chunks_, axis=0, ignore_index=True) # Check for data corruption if there was no segfault - tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(result1, expected) + + # This extra test was added to replicate the fault in #5291. + # Force 'utf-8' encoding, so that `_string_convert` would take + # a different execution branch.
+ chunks_ = self.read_csv(StringIO(csv_data), header=None, + dtype=object, chunksize=chunksize, + encoding='utf_8') + result2 = pd.concat(chunks_, axis=0, ignore_index=True) + tm.assert_frame_equal(result2, expected) diff --git a/pandas/io/tests/parser/common.py b/pandas/io/tests/parser/common.py index f8fc6c2bf78c3..129e925e38d5b 100644 --- a/pandas/io/tests/parser/common.py +++ b/pandas/io/tests/parser/common.py @@ -1501,3 +1501,19 @@ def test_memory_map(self): out = self.read_csv(mmap_file, memory_map=True) tm.assert_frame_equal(out, expected) + + def test_null_byte_char(self): + # see gh-2741 + data = '\x00,foo' + cols = ['a', 'b'] + + expected = DataFrame([[np.nan, 'foo']], + columns=cols) + + if self.engine == 'c': + out = self.read_csv(StringIO(data), names=cols) + tm.assert_frame_equal(out, expected) + else: + msg = "NULL byte detected" + with tm.assertRaisesRegexp(csv.Error, msg): + self.read_csv(StringIO(data), names=cols)