@@ -4241,16 +4241,22 @@ def setUp(self):
4241
4241
4242
4242
@tm.network
def test_parse_public_s3_bucket(self):
    # Read tips.csv from S3 uncompressed, gzip-compressed and
    # bz2-compressed, and compare each result with the local copy.
    # The local fixture does not change between iterations, so parse it
    # once instead of once per comparison.
    expected = pd.read_csv(tm.get_data_path('tips.csv'))
    for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]:
        url = 's3://pandas-test/tips.csv' + ext
        if comp == 'bz2' and compat.PY2:
            # The Python 2 C parser can't read bz2 from S3.
            self.assertRaises(ValueError, pd.read_csv, url,
                              compression=comp)
        else:
            df = pd.read_csv(url, compression=comp)
            self.assertTrue(isinstance(df, pd.DataFrame))
            self.assertFalse(df.empty)
            tm.assert_frame_equal(expected, df)

    # Read public file from bucket with not-public contents
    df = pd.read_csv('s3://cant_get_it/tips.csv')
    self.assertTrue(isinstance(df, pd.DataFrame))
    self.assertFalse(df.empty)
    tm.assert_frame_equal(expected, df)
4255
4261
4256
4262
@tm .network
@@ -4269,6 +4275,81 @@ def test_parse_public_s3a_bucket(self):
4269
4275
self .assertFalse (df .empty )
4270
4276
tm .assert_frame_equal (pd .read_csv (tm .get_data_path ('tips.csv' )).iloc [:10 ], df )
4271
4277
4278
@tm.network
def test_parse_public_s3_bucket_nrows(self):
    # Read only the first 10 rows of tips.csv from S3 for each
    # compression and compare with the first 10 rows of the local copy.
    # The expected frame is loop-invariant, so build it once.
    expected = pd.read_csv(tm.get_data_path('tips.csv')).iloc[:10]
    for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]:
        url = 's3://pandas-test/tips.csv' + ext
        if comp == 'bz2' and compat.PY2:
            # The Python 2 C parser can't read bz2 from S3.
            self.assertRaises(ValueError, pd.read_csv, url,
                              compression=comp)
        else:
            df = pd.read_csv(url, nrows=10, compression=comp)
            self.assertTrue(isinstance(df, pd.DataFrame))
            self.assertFalse(df.empty)
            tm.assert_frame_equal(expected, df)
4291
+
4292
@tm.network
def test_parse_public_s3_bucket_chunked(self):
    # Read with a chunksize
    chunksize = 5
    local_tips = pd.read_csv(tm.get_data_path('tips.csv'))
    for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]:
        url = 's3://pandas-test/tips.csv' + ext
        if comp == 'bz2' and compat.PY2:
            # The Python 2 C parser can't read bz2 from S3.
            self.assertRaises(ValueError, pd.read_csv, url,
                              compression=comp)
        else:
            df_reader = pd.read_csv(url, chunksize=chunksize,
                                    compression=comp)
            self.assertEqual(df_reader.chunksize, chunksize)
            for i_chunk in range(3):
                # Read a couple of chunks and make sure we see them
                # properly.
                df = df_reader.get_chunk()
                self.assertTrue(isinstance(df, pd.DataFrame))
                self.assertFalse(df.empty)
                start = chunksize * i_chunk
                true_df = local_tips.iloc[start:start + chunksize]
                # Chunking doesn't preserve row numbering, so compare
                # against a freshly numbered slice of the local data.
                true_df = true_df.reset_index(drop=True)
                tm.assert_frame_equal(true_df, df)
4315
+
4316
@tm.network
def test_parse_public_s3_bucket_chunked_python(self):
    # Read with a chunksize using the Python parser
    chunksize = 5
    local_tips = pd.read_csv(tm.get_data_path('tips.csv'))
    for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]:
        df_reader = pd.read_csv('s3://pandas-test/tips.csv' + ext,
                                chunksize=chunksize, compression=comp,
                                engine='python')
        self.assertEqual(df_reader.chunksize, chunksize)
        for i_chunk in range(3):
            # Read a couple of chunks and make sure we see them properly.
            df = df_reader.get_chunk()
            self.assertTrue(isinstance(df, pd.DataFrame))
            self.assertFalse(df.empty)
            start = chunksize * i_chunk
            true_df = local_tips.iloc[start:start + chunksize]
            # Chunking doesn't preserve row numbering, so compare
            # against a freshly numbered slice of the local data.
            true_df = true_df.reset_index(drop=True)
            tm.assert_frame_equal(true_df, df)
4334
+
4335
@tm.network
def test_parse_public_s3_bucket_python(self):
    # Read tips.csv from S3 with the Python parser for each compression
    # and compare with the local copy.
    # The local fixture is loop-invariant, so parse it once.
    expected = pd.read_csv(tm.get_data_path('tips.csv'))
    for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]:
        df = pd.read_csv('s3://pandas-test/tips.csv' + ext,
                         engine='python', compression=comp)
        self.assertTrue(isinstance(df, pd.DataFrame))
        self.assertFalse(df.empty)
        tm.assert_frame_equal(expected, df)
4343
+
4344
@tm.network
def test_parse_public_s3_bucket_nrows_python(self):
    # Read only the first 10 rows of tips.csv from S3 with the Python
    # parser for each compression and compare with the first 10 rows of
    # the local copy.
    # The expected frame is loop-invariant, so build it once.
    expected = pd.read_csv(tm.get_data_path('tips.csv')).iloc[:10]
    for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]:
        df = pd.read_csv('s3://pandas-test/tips.csv' + ext,
                         engine='python', nrows=10, compression=comp)
        self.assertTrue(isinstance(df, pd.DataFrame))
        self.assertFalse(df.empty)
        tm.assert_frame_equal(expected, df)
4352
+
4272
4353
@tm .network
4273
4354
def test_s3_fails (self ):
4274
4355
import boto
0 commit comments