@@ -290,11 +290,11 @@ def test_empty_header_read(count):
290
290
test_empty_header_read (count )
291
291
292
292
def test_parse_trim_buffers (self ):
293
- # This test is part of a bugfix for issue #13703. It attmepts to
293
+ # This test is part of a bugfix for issue #13703. It attempts
294
294
# to stress the system memory allocator, to cause it to move the
295
295
# stream buffer and either let the OS reclaim the region, or let
296
296
# other memory requests of parser otherwise modify the contents
297
- # of memory space, where it was formely located.
297
+ # of memory space, where it was formerly located.
298
298
# This test is designed to cause a `segfault` with unpatched
299
299
# `tokenizer.c`. Sometimes the test fails on `segfault`, other
300
300
# times it fails due to memory corruption, which causes the
@@ -346,7 +346,7 @@ def test_parse_trim_buffers(self):
346
346
347
347
# Generate the expected output: manually create the dataframe
348
348
# by splitting by comma and repeating the `n_lines` times.
349
- row = tuple (val_ if val_ else float ( " nan" )
349
+ row = tuple (val_ if val_ else np . nan
350
350
for val_ in record_ .split ("," ))
351
351
expected = pd .DataFrame ([row for _ in range (n_lines )],
352
352
dtype = object , columns = None , index = None )
@@ -359,6 +359,15 @@ def test_parse_trim_buffers(self):
359
359
# Check for data corruption if there was no segfault
360
360
tm .assert_frame_equal (result , expected )
361
361
362
+ # This extra test was added to replicate the fault in gh-5291.
363
+ # Force 'utf-8' encoding, so that `_string_convert` would take
364
+ # a different execution branch.
365
+ chunks_ = self .read_csv (StringIO (csv_data ), header = None ,
366
+ dtype = object , chunksize = chunksize ,
367
+ encoding = 'utf_8' )
368
+ result = pd .concat (chunks_ , axis = 0 , ignore_index = True )
369
+ tm .assert_frame_equal (result , expected )
370
+
362
371
def test_internal_null_byte (self ):
363
372
# see gh-14012
364
373
#
0 commit comments