@@ -290,11 +290,11 @@ def test_empty_header_read(count):
290
290
test_empty_header_read (count )
291
291
292
292
def test_parse_trim_buffers (self ):
293
- # This test is part of a bugfix for issue #13703. It attmepts to
293
+ # This test is part of a bugfix for issue #13703. It attempts
294
294
# to stress the system memory allocator, to cause it to move the
295
295
# stream buffer and either let the OS reclaim the region, or let
296
296
# other memory requests of parser otherwise modify the contents
297
- # of memory space, where it was formely located.
297
+ # of memory space, where it was formerly located.
298
298
# This test is designed to cause a `segfault` with unpatched
299
299
# `tokenizer.c`. Sometimes the test fails on `segfault`, other
300
300
# times it fails due to memory corruption, which causes the
@@ -346,7 +346,7 @@ def test_parse_trim_buffers(self):
346
346
347
347
# Generate the expected output: manually create the dataframe
348
348
# by splitting by comma and repeating the `n_lines` times.
349
- row = tuple (val_ if val_ else float ( " nan" )
349
+ row = tuple (val_ if val_ else np . nan
350
350
for val_ in record_ .split ("," ))
351
351
expected = pd .DataFrame ([row for _ in range (n_lines )],
352
352
dtype = object , columns = None , index = None )
@@ -359,6 +359,15 @@ def test_parse_trim_buffers(self):
359
359
# Check for data corruption if there was no segfault
360
360
tm .assert_frame_equal (result , expected )
361
361
362
+ # This extra test was added to replicate the fault in gh-5291.
363
+ # Force 'utf-8' encoding, so that `_string_convert` would take
364
+ # a different execution branch.
365
+ chunks_ = self .read_csv (StringIO (csv_data ), header = None ,
366
+ dtype = object , chunksize = chunksize ,
367
+ encoding = 'utf_8' )
368
+ result = pd .concat (chunks_ , axis = 0 , ignore_index = True )
369
+ tm .assert_frame_equal (result , expected )
370
+
362
371
def test_internal_null_byte (self ):
363
372
# see gh-14012
364
373
#
0 commit comments