@@ -69,6 +69,51 @@ def setUp(self):
        self.csv2 = os.path.join(self.dirpath, 'test2.csv')
        self.xls1 = os.path.join(self.dirpath, 'test.xls')

+    def construct_dataframe(self, num_rows):
+
+        df = DataFrame(np.random.rand(num_rows, 5), columns=list('abcde'))
+        df['foo'] = 'foo'
+        df['bar'] = 'bar'
+        df['baz'] = 'baz'
+        df['date'] = pd.date_range('20000101 09:00:00',
+                                   periods=num_rows,
+                                   freq='s')
+        df['int'] = np.arange(num_rows, dtype='int64')
+        return df
+
+    def generate_multithread_dataframe(self, path, num_rows, num_tasks):
+
+        def reader(arg):
+            start, nrows = arg
+
+            if not start:
+                return pd.read_csv(path, index_col=0, header=0, nrows=nrows,
+                                   parse_dates=['date'])
+
+            return pd.read_csv(path,
+                               index_col=0,
+                               header=None,
+                               skiprows=int(start) + 1,
+                               nrows=nrows,
+                               parse_dates=[9])
+
+        tasks = [
+            (num_rows * i / num_tasks,
+             num_rows / num_tasks) for i in range(num_tasks)
+        ]
+
+        pool = ThreadPool(processes=num_tasks)
+
+        results = pool.map(reader, tasks)
+
+        header = results[0].columns
+        for r in results[1:]:
+            r.columns = header
+
+        final_dataframe = pd.concat(results)
+
+        return final_dataframe
+
    def test_converters_type_must_be_dict(self):
        with tm.assertRaisesRegexp(TypeError, 'Type converters.+'):
            self.read_csv(StringIO(self.data1), converters=0)
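
A note on the two helpers added in this hunk: `construct_dataframe` builds a deterministic mixed-dtype frame, and `generate_multithread_dataframe` reads a CSV of it back in `num_tasks` contiguous chunks on a thread pool, parsing the header only for the first chunk and copying its column names onto the rest before concatenating. A minimal usage sketch, assuming an illustrative scratch path ('scratch.csv' is a stand-in, not from the diff; the actual tests manage their own temporary file):

    # Illustrative only: round-trip a frame through the threaded reader.
    df = self.construct_dataframe(num_rows=10000)
    df.to_csv('scratch.csv')
    result = self.generate_multithread_dataframe('scratch.csv',
                                                 num_rows=10000,
                                                 num_tasks=4)
    tm.assert_frame_equal(df, result)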
@@ -3361,8 +3406,43 @@ def test_variable_width_unicode(self):
        tm.assert_frame_equal(expected, read_fwf(BytesIO(test.encode('utf8')),
                                                 header=None, encoding='utf8'))

+class CParserTests(ParserTests):
+    """Base class for CParser testing."""
+
+    def test_buffer_overflow(self):
+        # GH9205
+        # test certain malformed input files that cause buffer overflows in
+        # tokenizer.c
+        malfw = "1\r1\r1\r1\r1\r"  # buffer overflow in words pointer
+        malfs = "1\r1\r1\r1\r1\r11\r"  # buffer overflow in stream pointer
+        malfl = "1\r1\r1\r1\r1\r11\r1\r"  # buffer overflow in lines pointer
+        for malf in (malfw, malfs, malfl):
+            try:
+                df = self.read_table(StringIO(malf))
+            except Exception as cperr:
+                self.assertIn(
+                    'Buffer overflow caught - possible malformed input file.', str(cperr))
+
+    def test_buffer_rd_bytes(self):
+        # GH 12098
+        # src->buffer can be freed twice leading to a segfault if a corrupt
+        # gzip file is read with read_csv and the buffer is filled more than
+        # once before gzip throws an exception
+
+        data = '\x1F\x8B\x08\x00\x00\x00\x00\x00\x00\x03\xED\xC3\x41\x09' \
+               '\x00\x00\x08\x00\xB1\xB7\xB6\xBA\xFE\xA5\xCC\x21\x6C\xB0' \
+               '\xA6\x4D' + '\x55' * 267 + \
+               '\x7D\xF7\x00\x91\xE0\x47\x97\x14\x38\x04\x00' \
+               '\x1f\x8b\x08\x00VT\x97V\x00\x03\xed]\xefO'
+        for i in range(100):
+            try:
+                _ = self.read_csv(StringIO(data),
+                                  compression='gzip',
+                                  delim_whitespace=True)
+            except Exception as e:
+                pass

-class TestCParserHighMemory(ParserTests, tm.TestCase):
+class TestCParserHighMemory(CParserTests, tm.TestCase):

    def read_csv(self, *args, **kwds):
        kwds = kwds.copy()
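
The byte literal in `test_buffer_rd_bytes` above is a deliberately corrupt gzip payload: a valid gzip member header, filler bytes, then a second gzip header spliced in where the first stream should continue, so the decompressor fills the parser's read buffer more than once before raising. A sketch of one way to build a comparable payload, on the assumption that any truncated-and-restarted gzip stream exercises the same code path (the exact bytes in the test are not required):

    import gzip
    import io

    buf = io.BytesIO()
    with gzip.GzipFile(fileobj=buf, mode='wb') as f:
        f.write(b'1 2 3\n' * 5000)  # large enough to need several buffer fills
    raw = buf.getvalue()
    # cut the stream mid-body and splice in the start of a second member
    corrupt = raw[:len(raw) // 2] + raw[:20]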
@@ -3653,39 +3733,6 @@ def test_fallback_to_python(self):
        with tm.assertRaisesRegexp(ValueError, 'does not support'):
            self.read_table(StringIO(data), engine='c', skip_footer=1)

-    def test_buffer_overflow(self):
-        # GH9205
-        # test certain malformed input files that cause buffer overflows in
-        # tokenizer.c
-        malfw = "1\r1\r1\r1\r1\r"  # buffer overflow in words pointer
-        malfs = "1\r1\r1\r1\r1\r11\r"  # buffer overflow in stream pointer
-        malfl = "1\r1\r1\r1\r1\r11\r1\r"  # buffer overflow in lines pointer
-        for malf in (malfw, malfs, malfl):
-            try:
-                df = self.read_table(StringIO(malf))
-            except Exception as cperr:
-                self.assertIn(
-                    'Buffer overflow caught - possible malformed input file.', str(cperr))
-
-    def test_buffer_rd_bytes(self):
-        # GH 12098
-        # src->buffer can be freed twice leading to a segfault if a corrupt
-        # gzip file is read with read_csv and the buffer is filled more than
-        # once before gzip throws an exception
-
-        data = '\x1F\x8B\x08\x00\x00\x00\x00\x00\x00\x03\xED\xC3\x41\x09' \
-               '\x00\x00\x08\x00\xB1\xB7\xB6\xBA\xFE\xA5\xCC\x21\x6C\xB0' \
-               '\xA6\x4D' + '\x55' * 267 + \
-               '\x7D\xF7\x00\x91\xE0\x47\x97\x14\x38\x04\x00' \
-               '\x1f\x8b\x08\x00VT\x97V\x00\x03\xed]\xefO'
-        for i in range(100):
-            try:
-                _ = self.read_csv(StringIO(data),
-                                  compression='gzip',
-                                  delim_whitespace=True)
-            except Exception as e:
-                pass
-
    def test_single_char_leading_whitespace(self):
        # GH 9710
        data = """\
@@ -3706,7 +3753,7 @@ def test_single_char_leading_whitespace(self):
        tm.assert_frame_equal(result, expected)


-class TestCParserLowMemory(ParserTests, tm.TestCase):
+class TestCParserLowMemory(CParserTests, tm.TestCase):

    def read_csv(self, *args, **kwds):
        kwds = kwds.copy()
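
The structural pattern in this refactor: the C-parser-only tests now live on `CParserTests`, which extends `ParserTests` without mixing in `tm.TestCase`, so the test runner collects them only through the concrete subclasses, and the verbatim copies removed in the neighbouring hunks become redundant. In outline (a sketch of the structure, not new code):

    class CParserTests(ParserTests):          # no TestCase, so not collected
        def test_buffer_overflow(self): ...
        def test_buffer_rd_bytes(self): ...

    class TestCParserHighMemory(CParserTests, tm.TestCase):  # collected
        ...

    class TestCParserLowMemory(CParserTests, tm.TestCase):   # collected
        ...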
@@ -4213,39 +4260,6 @@ def test_raise_on_sep_with_delim_whitespace(self):
        with tm.assertRaisesRegexp(ValueError, 'you can only specify one'):
            self.read_table(StringIO(data), sep='\s', delim_whitespace=True)

-    def test_buffer_overflow(self):
-        # GH9205
-        # test certain malformed input files that cause buffer overflows in
-        # tokenizer.c
-        malfw = "1\r1\r1\r1\r1\r"  # buffer overflow in words pointer
-        malfs = "1\r1\r1\r1\r1\r11\r"  # buffer overflow in stream pointer
-        malfl = "1\r1\r1\r1\r1\r11\r1\r"  # buffer overflow in lines pointer
-        for malf in (malfw, malfs, malfl):
-            try:
-                df = self.read_table(StringIO(malf))
-            except Exception as cperr:
-                self.assertIn(
-                    'Buffer overflow caught - possible malformed input file.', str(cperr))
-
-    def test_buffer_rd_bytes(self):
-        # GH 12098
-        # src->buffer can be freed twice leading to a segfault if a corrupt
-        # gzip file is read with read_csv and the buffer is filled more than
-        # once before gzip throws an exception
-
-        data = '\x1F\x8B\x08\x00\x00\x00\x00\x00\x00\x03\xED\xC3\x41\x09' \
-               '\x00\x00\x08\x00\xB1\xB7\xB6\xBA\xFE\xA5\xCC\x21\x6C\xB0' \
-               '\xA6\x4D' + '\x55' * 267 + \
-               '\x7D\xF7\x00\x91\xE0\x47\x97\x14\x38\x04\x00' \
-               '\x1f\x8b\x08\x00VT\x97V\x00\x03\xed]\xefO'
-        for i in range(100):
-            try:
-                _ = self.read_csv(StringIO(data),
-                                  compression='gzip',
-                                  delim_whitespace=True)
-            except Exception as e:
-                pass
-
    def test_single_char_leading_whitespace(self):
        # GH 9710
        data = """\
@@ -4300,51 +4314,6 @@ def test_multithread_stringio_read_csv(self):
        for result in results:
            tm.assert_frame_equal(first_result, result)

-    def construct_dataframe(self, num_rows):
-
-        df = DataFrame(np.random.rand(num_rows, 5), columns=list('abcde'))
-        df['foo'] = 'foo'
-        df['bar'] = 'bar'
-        df['baz'] = 'baz'
-        df['date'] = pd.date_range('20000101 09:00:00',
-                                   periods=num_rows,
-                                   freq='s')
-        df['int'] = np.arange(num_rows, dtype='int64')
-        return df
-
-    def generate_multithread_dataframe(self, path, num_rows, num_tasks):
-
-        def reader(arg):
-            start, nrows = arg
-
-            if not start:
-                return pd.read_csv(path, index_col=0, header=0, nrows=nrows,
-                                   parse_dates=['date'])
-
-            return pd.read_csv(path,
-                               index_col=0,
-                               header=None,
-                               skiprows=int(start) + 1,
-                               nrows=nrows,
-                               parse_dates=[9])
-
-        tasks = [
-            (num_rows * i / num_tasks,
-             num_rows / num_tasks) for i in range(num_tasks)
-        ]
-
-        pool = ThreadPool(processes=num_tasks)
-
-        results = pool.map(reader, tasks)
-
-        header = results[0].columns
-        for r in results[1:]:
-            r.columns = header
-
-        final_dataframe = pd.concat(results)
-
-        return final_dataframe
-
    def test_multithread_path_multipart_read_csv(self):
        # GH 11786
        num_tasks = 4