4
4
Tests parsers' ability to read and parse non-local files
5
5
and hence require a network connection to be read.
6
6
"""
7
+ import logging
8
+
7
9
import pytest
10
+ import numpy as np
8
11
9
12
import pandas .util .testing as tm
10
13
import pandas .util ._test_decorators as td
11
14
from pandas import DataFrame
12
15
from pandas .io .parsers import read_csv , read_table
13
- from pandas .compat import BytesIO
16
+ from pandas .compat import BytesIO , StringIO
14
17
15
18
16
19
@pytest .mark .network
@@ -45,9 +48,9 @@ def check_compressed_urls(salaries_table, compression, extension, mode,
45
48
tm .assert_frame_equal (url_table , salaries_table )
46
49
47
50
51
+ @pytest .mark .usefixtures ("s3_resource" )
48
52
class TestS3 (object ):
49
53
50
- @tm .network
51
54
def test_parse_public_s3_bucket (self ):
52
55
pytest .importorskip ('s3fs' )
53
56
# more of an integration test due to the not-public contents portion
@@ -66,7 +69,6 @@ def test_parse_public_s3_bucket(self):
66
69
assert not df .empty
67
70
tm .assert_frame_equal (read_csv (tm .get_data_path ('tips.csv' )), df )
68
71
69
- @tm .network
70
72
def test_parse_public_s3n_bucket (self ):
71
73
72
74
# Read from AWS s3 as "s3n" URL
@@ -76,7 +78,6 @@ def test_parse_public_s3n_bucket(self):
76
78
tm .assert_frame_equal (read_csv (
77
79
tm .get_data_path ('tips.csv' )).iloc [:10 ], df )
78
80
79
- @tm .network
80
81
def test_parse_public_s3a_bucket (self ):
81
82
# Read from AWS s3 as "s3a" URL
82
83
df = read_csv ('s3a://pandas-test/tips.csv' , nrows = 10 )
@@ -85,7 +86,6 @@ def test_parse_public_s3a_bucket(self):
85
86
tm .assert_frame_equal (read_csv (
86
87
tm .get_data_path ('tips.csv' )).iloc [:10 ], df )
87
88
88
- @tm .network
89
89
def test_parse_public_s3_bucket_nrows (self ):
90
90
for ext , comp in [('' , None ), ('.gz' , 'gzip' ), ('.bz2' , 'bz2' )]:
91
91
df = read_csv ('s3://pandas-test/tips.csv' +
@@ -95,7 +95,6 @@ def test_parse_public_s3_bucket_nrows(self):
95
95
tm .assert_frame_equal (read_csv (
96
96
tm .get_data_path ('tips.csv' )).iloc [:10 ], df )
97
97
98
- @tm .network
99
98
def test_parse_public_s3_bucket_chunked (self ):
100
99
# Read with a chunksize
101
100
chunksize = 5
@@ -114,7 +113,6 @@ def test_parse_public_s3_bucket_chunked(self):
114
113
chunksize * i_chunk : chunksize * (i_chunk + 1 )]
115
114
tm .assert_frame_equal (true_df , df )
116
115
117
- @tm .network
118
116
def test_parse_public_s3_bucket_chunked_python (self ):
119
117
# Read with a chunksize using the Python parser
120
118
chunksize = 5
@@ -133,7 +131,6 @@ def test_parse_public_s3_bucket_chunked_python(self):
133
131
chunksize * i_chunk : chunksize * (i_chunk + 1 )]
134
132
tm .assert_frame_equal (true_df , df )
135
133
136
- @tm .network
137
134
def test_parse_public_s3_bucket_python (self ):
138
135
for ext , comp in [('' , None ), ('.gz' , 'gzip' ), ('.bz2' , 'bz2' )]:
139
136
df = read_csv ('s3://pandas-test/tips.csv' + ext , engine = 'python' ,
@@ -143,7 +140,6 @@ def test_parse_public_s3_bucket_python(self):
143
140
tm .assert_frame_equal (read_csv (
144
141
tm .get_data_path ('tips.csv' )), df )
145
142
146
- @tm .network
147
143
def test_infer_s3_compression (self ):
148
144
for ext in ['' , '.gz' , '.bz2' ]:
149
145
df = read_csv ('s3://pandas-test/tips.csv' + ext ,
@@ -153,7 +149,6 @@ def test_infer_s3_compression(self):
153
149
tm .assert_frame_equal (read_csv (
154
150
tm .get_data_path ('tips.csv' )), df )
155
151
156
- @tm .network
157
152
def test_parse_public_s3_bucket_nrows_python (self ):
158
153
for ext , comp in [('' , None ), ('.gz' , 'gzip' ), ('.bz2' , 'bz2' )]:
159
154
df = read_csv ('s3://pandas-test/tips.csv' + ext , engine = 'python' ,
@@ -163,7 +158,6 @@ def test_parse_public_s3_bucket_nrows_python(self):
163
158
tm .assert_frame_equal (read_csv (
164
159
tm .get_data_path ('tips.csv' )).iloc [:10 ], df )
165
160
166
- @tm .network
167
161
def test_s3_fails (self ):
168
162
with pytest .raises (IOError ):
169
163
read_csv ('s3://nyqpug/asdf.csv' )
@@ -188,3 +182,22 @@ def test_read_csv_handles_boto_s3_object(self,
188
182
189
183
expected = read_csv (tips_file )
190
184
tm .assert_frame_equal (result , expected )
185
+
186
+ def test_read_csv_chunked_download (self , s3_resource , caplog ):
187
+ # 8 MB, S3FS uses 5MB chunks
188
+ df = DataFrame (np .random .randn (100000 , 4 ), columns = list ('abcd' ))
189
+ buf = BytesIO ()
190
+ str_buf = StringIO ()
191
+
192
+ df .to_csv (str_buf )
193
+
194
+ buf = BytesIO (str_buf .getvalue ().encode ('utf-8' ))
195
+
196
+ s3_resource .Bucket ("pandas-test" ).put_object (
197
+ Key = "large-file.csv" ,
198
+ Body = buf )
199
+
200
+ with caplog .at_level (logging .DEBUG , logger = 's3fs.core' ):
201
+ read_csv ("s3://pandas-test/large-file.csv" , nrows = 5 )
202
+ # log of fetch_range (start, stop)
203
+ assert ((0 , 5505024 ) in set (x .args [- 2 :] for x in caplog .records ))
0 commit comments