
TST: A test to cover fault in issue #5291 (#13833)

Status: Closed (wants to merge 4 commits)
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v0.19.0.txt
@@ -742,7 +742,7 @@ Bug Fixes
 ~~~~~~~~~
 
 - Bug in ``groupby().shift()``, which could cause a segfault or corruption in rare circumstances when grouping by columns with missing values (:issue:`13813`)
-- Bug in ``pd.read_csv()``, which may cause a segfault or corruption when iterating in large chunks over a stream/file under rare circumstances (:issue:`13703`)
+- Bug in ``pd.read_csv()``, which may cause a segfault or corruption when iterating in large chunks over a stream/file under rare circumstances (:issue:`13703`, :issue:`5291`, :issue:`11793`)
 - Bug in ``io.json.json_normalize()``, where non-ascii keys raised an exception (:issue:`13213`)
 - Bug in ``SparseSeries`` with ``MultiIndex`` ``[]`` indexing may raise ``IndexError`` (:issue:`13144`)
 - Bug in ``SparseSeries`` with ``MultiIndex`` ``[]`` indexing result may have normal ``Index`` (:issue:`13144`)
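The failure mode covered by this fix arises in the chunked-iteration pattern. As a minimal sketch of that pattern (the data and chunk size here are illustrative; the original fault needed much larger chunks over a stream to surface):

```python
import io

import pandas as pd

# Illustrative CSV data: a header row plus ten data rows.
csv_data = "a,b\n" + "\n".join("%d,%d" % (i, i * 2) for i in range(10)) + "\n"

# Iterate over the stream in fixed-size chunks, then reassemble.
# The bug referenced above could corrupt data (or segfault) during
# this kind of chunked iteration in rare circumstances.
chunks = pd.read_csv(io.StringIO(csv_data), chunksize=3)
result = pd.concat(chunks, ignore_index=True)

assert len(result) == 10
assert list(result.columns) == ["a", "b"]
```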
12 changes: 11 additions & 1 deletion pandas/io/parsers.py
@@ -2190,7 +2190,17 @@ def _next_line(self):
                 next(self.data)
 
             while True:
-                orig_line = next(self.data)
+                try:
+                    orig_line = next(self.data)
+                except csv.Error as e:
+                    if 'NULL byte' in str(e):
+                        raise csv.Error(
+                            'NULL byte detected. This byte '
+                            'cannot be processed in Python\'s '
+                            'native csv library at the moment, '
+                            'so please pass in engine=\'c\' instead.')
+                    else:
+                        raise
                 line = self._check_comments([orig_line])[0]
                 self.pos += 1
                 if (not self.skip_blank_lines and
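For context, the `except` branch above exists because Python's stdlib `csv` reader has historically rejected NUL bytes in its input. A hedged sketch of the behavior the handler intercepts (the exact outcome and message wording vary across Python versions; newer interpreters accept NUL characters outright):

```python
import csv
import io


def read_first_row(text):
    """Return ('parsed', row) or ('error', message) for a one-line CSV."""
    try:
        return ('parsed', next(csv.reader(io.StringIO(text))))
    except csv.Error as exc:
        return ('error', str(exc))


# A NUL byte in the first field: older interpreters raise csv.Error
# mentioning a NUL/NULL byte, which is the message the parser code
# above sniffs for; newer interpreters parse the row, keeping the NUL
# in the field value.
outcome = read_first_row('\x00,foo')
```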
15 changes: 12 additions & 3 deletions pandas/io/tests/parser/c_parser_only.py
@@ -439,15 +439,24 @@ def test_parse_trim_buffers(self):

         # Generate the expected output: manually create the dataframe
         # by splitting by comma and repeating the `n_lines` times.
-        row = tuple(val_ if val_ else float("nan")
+        row = tuple(val_ if val_ else np.nan
                     for val_ in record_.split(","))
         expected = pd.DataFrame([row for _ in range(n_lines)],
                                 dtype=object, columns=None, index=None)
 
         # Iterate over the CSV file in chunks of `chunksize` lines
         chunks_ = self.read_csv(StringIO(csv_data), header=None,
                                 dtype=object, chunksize=chunksize)
-        result = pd.concat(chunks_, axis=0, ignore_index=True)
+        result1 = pd.concat(chunks_, axis=0, ignore_index=True)
 
         # Check for data corruption if there was no segfault
-        tm.assert_frame_equal(result, expected)
+        tm.assert_frame_equal(result1, expected)
+
+        # This extra test was added to replicate the fault in #5291.
+        # Force 'utf-8' encoding, so that `_string_convert` would take
+        # a different execution branch.
+        chunks_ = self.read_csv(StringIO(csv_data), header=None,
+                                dtype=object, chunksize=chunksize,
+                                encoding='utf_8')
+        result2 = pd.concat(chunks_, axis=0, ignore_index=True)
+        tm.assert_frame_equal(result2, expected)
16 changes: 16 additions & 0 deletions pandas/io/tests/parser/common.py
@@ -1501,3 +1501,19 @@ def test_memory_map(self):

         out = self.read_csv(mmap_file, memory_map=True)
         tm.assert_frame_equal(out, expected)
+
+    def test_null_byte_char(self):
+        # see gh-2741
+        data = '\x00,foo'
+        cols = ['a', 'b']
+
+        expected = DataFrame([[np.nan, 'foo']],
+                             columns=cols)
+
+        if self.engine == 'c':
+            out = self.read_csv(StringIO(data), names=cols)
+            tm.assert_frame_equal(out, expected)
+        else:
+            msg = "NULL byte detected"
+            with tm.assertRaisesRegexp(csv.Error, msg):
+                self.read_csv(StringIO(data), names=cols)