 Tests parsers ability to read and parse non-local files
 and hence require a network connection to be read.
 """
-
 import os
+
 import pytest
+import six

 import pandas.util.testing as tm
 from pandas import DataFrame
 from pandas.io.parsers import read_csv, read_table


+@pytest.fixture(scope='module')
+def tips_file():
+    return os.path.join(tm.get_data_path(), 'tips.csv')
+
+
 @pytest.fixture(scope='module')
 def salaries_table():
     path = os.path.join(tm.get_data_path(), 'salaries.csv')
     return read_table(path)


+@pytest.fixture(scope='module')
+def test_s3_resource(request, tips_file):
+    pytest.importorskip('s3fs')
+    moto = pytest.importorskip('moto')
+    moto.mock_s3().start()
+
+    test_s3_files = [
+        ('tips.csv', tips_file),
+        ('tips.csv.gz', tips_file + '.gz'),
+        ('tips.csv.bz2', tips_file + '.bz2'),
+    ]
+
+    def add_tips_files(bucket_name):
+        for s3_key, file_name in test_s3_files:
+            with open(file_name, 'rb') as f:
+                conn.Bucket(bucket_name).put_object(
+                    Key=s3_key,
+                    Body=f)
+
+    boto3 = pytest.importorskip('boto3')
+    # see gh-16135
+    bucket = 'pandas-test'
+
+    conn = boto3.resource("s3", region_name="us-east-1")
+    conn.create_bucket(Bucket=bucket)
+    add_tips_files(bucket)
+
+    conn.create_bucket(Bucket='cant_get_it', ACL='private')
+    add_tips_files('cant_get_it')
+
+    def teardown():
+        moto.mock_s3().stop()
+    request.addfinalizer(teardown)
+
+    return conn
+
+
 @pytest.mark.network
 @pytest.mark.parametrize(
     "compression,extension",
@@ -50,151 +93,142 @@ def check_compressed_urls(salaries_table, compression, extension, mode,
     tm.assert_frame_equal(url_table, salaries_table)


-class TestS3(object):
+@tm.network
+def test_parse_public_s3_bucket():
+    pytest.importorskip('s3fs')
+    # more of an integration test due to the not-public contents portion
+    # can probably mock this though.
+    for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]:
+        df = read_csv('s3://pandas-test/tips.csv' +
+                      ext, compression=comp)
+        assert isinstance(df, DataFrame)
+        assert not df.empty
+        tm.assert_frame_equal(read_csv(
+            tm.get_data_path('tips.csv')), df)
+
+    # Read public file from bucket with not-public contents
+    df = read_csv('s3://cant_get_it/tips.csv')
+    assert isinstance(df, DataFrame)
+    assert not df.empty
+    tm.assert_frame_equal(read_csv(tm.get_data_path('tips.csv')), df)
+
+
+def test_parse_public_s3n_bucket(test_s3_resource):
+
+    # Read from AWS s3 as "s3n" URL
+    df = read_csv('s3n://pandas-test/tips.csv', nrows=10)
+    assert isinstance(df, DataFrame)
+    assert not df.empty
+    tm.assert_frame_equal(read_csv(
+        tm.get_data_path('tips.csv')).iloc[:10], df)

-    def setup_method(self, method):
-        try:
-            import s3fs  # noqa
-        except ImportError:
-            pytest.skip("s3fs not installed")

-    @tm.network
-    def test_parse_public_s3_bucket(self):
-        for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]:
-            df = read_csv('s3://pandas-test/tips.csv' +
-                          ext, compression=comp)
+def test_parse_public_s3a_bucket(test_s3_resource):
+    # Read from AWS s3 as "s3a" URL
+    df = read_csv('s3a://pandas-test/tips.csv', nrows=10)
+    assert isinstance(df, DataFrame)
+    assert not df.empty
+    tm.assert_frame_equal(read_csv(
+        tm.get_data_path('tips.csv')).iloc[:10], df)
+
+
+def test_parse_public_s3_bucket_nrows(test_s3_resource):
+    for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]:
+        df = read_csv('s3://pandas-test/tips.csv' +
+                      ext, nrows=10, compression=comp)
+        assert isinstance(df, DataFrame)
+        assert not df.empty
+        tm.assert_frame_equal(read_csv(
+            tm.get_data_path('tips.csv')).iloc[:10], df)
+
+
+def test_parse_public_s3_bucket_chunked(test_s3_resource):
+    # Read with a chunksize
+    chunksize = 5
+    local_tips = read_csv(tm.get_data_path('tips.csv'))
+    for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]:
+        df_reader = read_csv('s3://pandas-test/tips.csv' + ext,
+                             chunksize=chunksize, compression=comp)
+        assert df_reader.chunksize == chunksize
+        for i_chunk in [0, 1, 2]:
+            # Read a couple of chunks and make sure we see them
+            # properly.
+            df = df_reader.get_chunk()
+            assert isinstance(df, DataFrame)
+            assert not df.empty
+            true_df = local_tips.iloc[
+                chunksize * i_chunk: chunksize * (i_chunk + 1)]
+            tm.assert_frame_equal(true_df, df)
+
+
+def test_parse_public_s3_bucket_chunked_python(test_s3_resource):
+    # Read with a chunksize using the Python parser
+    chunksize = 5
+    local_tips = read_csv(tm.get_data_path('tips.csv'))
+    for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]:
+        df_reader = read_csv('s3://pandas-test/tips.csv' + ext,
+                             chunksize=chunksize, compression=comp,
+                             engine='python')
+        assert df_reader.chunksize == chunksize
+        for i_chunk in [0, 1, 2]:
+            # Read a couple of chunks and make sure we see them properly.
+            df = df_reader.get_chunk()
             assert isinstance(df, DataFrame)
             assert not df.empty
-            tm.assert_frame_equal(read_csv(
-                tm.get_data_path('tips.csv')), df)
+            true_df = local_tips.iloc[
+                chunksize * i_chunk: chunksize * (i_chunk + 1)]
+            tm.assert_frame_equal(true_df, df)
+

-        # Read public file from bucket with not-public contents
-        df = read_csv('s3://cant_get_it/tips.csv')
+def test_parse_public_s3_bucket_python(test_s3_resource):
+    for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]:
+        df = read_csv('s3://pandas-test/tips.csv' + ext, engine='python',
+                      compression=comp)
         assert isinstance(df, DataFrame)
         assert not df.empty
-        tm.assert_frame_equal(read_csv(tm.get_data_path('tips.csv')), df)
+        tm.assert_frame_equal(read_csv(
+            tm.get_data_path('tips.csv')), df)

-    @tm.network
-    def test_parse_public_s3n_bucket(self):
-        # Read from AWS s3 as "s3n" URL
-        df = read_csv('s3n://pandas-test/tips.csv', nrows=10)
+
+def test_infer_s3_compression(test_s3_resource):
+    for ext in ['', '.gz', '.bz2']:
+        df = read_csv('s3://pandas-test/tips.csv' + ext,
+                      engine='python', compression='infer')
         assert isinstance(df, DataFrame)
         assert not df.empty
         tm.assert_frame_equal(read_csv(
-            tm.get_data_path('tips.csv')).iloc[:10], df)
+            tm.get_data_path('tips.csv')), df)
+

-    @tm.network
-    def test_parse_public_s3a_bucket(self):
-        # Read from AWS s3 as "s3a" URL
-        df = read_csv('s3a://pandas-test/tips.csv', nrows=10)
+def test_parse_public_s3_bucket_nrows_python(test_s3_resource):
+    for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]:
+        df = read_csv('s3://pandas-test/tips.csv' + ext, engine='python',
+                      nrows=10, compression=comp)
         assert isinstance(df, DataFrame)
         assert not df.empty
         tm.assert_frame_equal(read_csv(
             tm.get_data_path('tips.csv')).iloc[:10], df)

-    @tm.network
-    def test_parse_public_s3_bucket_nrows(self):
-        for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]:
-            df = read_csv('s3://pandas-test/tips.csv' +
-                          ext, nrows=10, compression=comp)
-            assert isinstance(df, DataFrame)
-            assert not df.empty
-            tm.assert_frame_equal(read_csv(
-                tm.get_data_path('tips.csv')).iloc[:10], df)
-
-    @tm.network
-    def test_parse_public_s3_bucket_chunked(self):
-        # Read with a chunksize
-        chunksize = 5
-        local_tips = read_csv(tm.get_data_path('tips.csv'))
-        for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]:
-            df_reader = read_csv('s3://pandas-test/tips.csv' + ext,
-                                 chunksize=chunksize, compression=comp)
-            assert df_reader.chunksize == chunksize
-            for i_chunk in [0, 1, 2]:
-                # Read a couple of chunks and make sure we see them
-                # properly.
-                df = df_reader.get_chunk()
-                assert isinstance(df, DataFrame)
-                assert not df.empty
-                true_df = local_tips.iloc[
-                    chunksize * i_chunk: chunksize * (i_chunk + 1)]
-                tm.assert_frame_equal(true_df, df)
-
-    @tm.network
-    def test_parse_public_s3_bucket_chunked_python(self):
-        # Read with a chunksize using the Python parser
-        chunksize = 5
-        local_tips = read_csv(tm.get_data_path('tips.csv'))
-        for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]:
-            df_reader = read_csv('s3://pandas-test/tips.csv' + ext,
-                                 chunksize=chunksize, compression=comp,
-                                 engine='python')
-            assert df_reader.chunksize == chunksize
-            for i_chunk in [0, 1, 2]:
-                # Read a couple of chunks and make sure we see them properly.
-                df = df_reader.get_chunk()
-                assert isinstance(df, DataFrame)
-                assert not df.empty
-                true_df = local_tips.iloc[
-                    chunksize * i_chunk: chunksize * (i_chunk + 1)]
-                tm.assert_frame_equal(true_df, df)
-
-    @tm.network
-    def test_parse_public_s3_bucket_python(self):
-        for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]:
-            df = read_csv('s3://pandas-test/tips.csv' + ext, engine='python',
-                          compression=comp)
-            assert isinstance(df, DataFrame)
-            assert not df.empty
-            tm.assert_frame_equal(read_csv(
-                tm.get_data_path('tips.csv')), df)
-
-    @tm.network
-    def test_infer_s3_compression(self):
-        for ext in ['', '.gz', '.bz2']:
-            df = read_csv('s3://pandas-test/tips.csv' + ext,
-                          engine='python', compression='infer')
-            assert isinstance(df, DataFrame)
-            assert not df.empty
-            tm.assert_frame_equal(read_csv(
-                tm.get_data_path('tips.csv')), df)
-
-    @tm.network
-    def test_parse_public_s3_bucket_nrows_python(self):
-        for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]:
-            df = read_csv('s3://pandas-test/tips.csv' + ext, engine='python',
-                          nrows=10, compression=comp)
-            assert isinstance(df, DataFrame)
-            assert not df.empty
-            tm.assert_frame_equal(read_csv(
-                tm.get_data_path('tips.csv')).iloc[:10], df)

-    @tm.network
-    def test_s3_fails(self):
-        with pytest.raises(IOError):
-            read_csv('s3://nyqpug/asdf.csv')
+def test_s3_fails(test_s3_resource):
+    with pytest.raises(IOError):
+        read_csv('s3://nyqpug/asdf.csv')

-        # Receive a permission error when trying to read a private bucket.
-        # It's irrelevant here that this isn't actually a table.
-        with pytest.raises(IOError):
-            read_csv('s3://cant_get_it/')
+    # Receive a permission error when trying to read a private bucket.
+    # It's irrelevant here that this isn't actually a table.
+    with pytest.raises(IOError):
+        read_csv('s3://cant_get_it/')

-    @tm.network
-    def boto3_client_s3(self):
-        # see gh-16135

-        # boto3 is a dependency of s3fs
-        import boto3
-        client = boto3.client("s3")
+def test_read_csv__handles_boto_s3_object(test_s3_resource, tips_file):
+    # see gh-16135

-        key = "/tips.csv"
-        bucket = "pandas-test"
-        s3_object = client.get_object(Bucket=bucket, Key=key)
+    s3_object = test_s3_resource.meta.client.get_object(Bucket='pandas-test',
+                                                        Key='tips.csv')

-        result = read_csv(s3_object["Body"])
-        assert isinstance(result, DataFrame)
-        assert not result.empty
+    result = read_csv(six.BytesIO(s3_object["Body"].read()), encoding='utf8')
+    assert isinstance(result, DataFrame)
+    assert not result.empty

-        expected = read_csv(tm.get_data_path('tips.csv'))
-        tm.assert_frame_equal(result, expected)
+    expected = read_csv(tips_file)
+    tm.assert_frame_equal(result, expected)
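
For context on how the new test_s3_resource fixture avoids real network traffic: moto patches botocore in-process, so boto3 (and s3fs, which pandas uses to resolve "s3://" URLs) talk to a local fake instead of AWS. The standalone sketch below is illustrative only and is not part of this patch; the bucket name, key, and CSV contents are made up, and it assumes moto, boto3, s3fs, and pandas are installed.

import boto3
import moto
import pandas as pd

mock = moto.mock_s3()   # patch botocore so no real AWS requests are made
mock.start()
try:
    # Create a fake bucket and upload a small CSV, mirroring what
    # add_tips_files() does in the fixture (names here are illustrative)
    conn = boto3.resource("s3", region_name="us-east-1")
    conn.create_bucket(Bucket="example-bucket")
    conn.Bucket("example-bucket").put_object(Key="data.csv",
                                             Body=b"a,b\n1,2\n3,4\n")

    # read_csv resolves the "s3://" URL through s3fs, which now hits the mock
    df = pd.read_csv("s3://example-bucket/data.csv")
    assert list(df.columns) == ["a", "b"]
    assert len(df) == 2
finally:
    mock.stop()         # restore normal botocore behaviour

This is the same create_bucket/put_object sequence the fixture performs for tips.csv, tips.csv.gz, and tips.csv.bz2 before handing the mocked resource to the tests.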