"""
Tests for the pandas custom headers in http(s) requests
"""
import gzip
import http.server
from io import BytesIO
import multiprocessing
import os
import socket
import time
import urllib.error

import pytest

import pandas.util._test_decorators as td

import pandas as pd
import pandas._testing as tm

pytestmark = pytest.mark.skipif(
    os.environ.get("PANDAS_CI", "0") == "1",
    reason="This test can hang in our CI min_versions build "
    "and leads to '##[error]The runner has "
    "received a shutdown signal...' in GHA. GH 45651",
)


class BaseUserAgentResponder(http.server.BaseHTTPRequestHandler):
    """
    Base class for a server that can be set up to respond with a particular
    file format and the accompanying content-type headers.

    The interfaces of the different io methods differ enough that each
    format gets its own small responder subclass.
    """

    def start_processing_headers(self):
        """
        shared logic at the start of a GET request
        """
        self.send_response(200)
        self.requested_from_user_agent = self.headers["User-Agent"]
        response_df = pd.DataFrame(
            {
                "header": [self.requested_from_user_agent],
            }
        )
        return response_df
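
    # start_processing_headers echoes the caller's User-Agent back as a
    # one-row frame; an illustrative value (assumption, the exact string
    # varies by pandas version): pd.DataFrame({"header": ["pandas/2.0.0"]})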

    def gzip_bytes(self, response_bytes):
        """
        some web servers will send back gzipped files to save bandwidth
        """
        with BytesIO() as bio:
            with gzip.GzipFile(fileobj=bio, mode="w") as zipper:
                zipper.write(response_bytes)
            response_bytes = bio.getvalue()
        return response_bytes
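
    # A minimal round-trip sketch (editorial illustration, not used by the
    # tests): the stdlib can invert the compression done above, e.g.
    #   raw = b"header\nvalue\n"
    #   assert gzip.decompress(self.gzip_bytes(raw)) == raw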

    def write_back_bytes(self, response_bytes):
        """
        shared logic at the end of a GET request
        """
        self.wfile.write(response_bytes)


class CSVUserAgentResponder(BaseUserAgentResponder):
    def do_GET(self):
        response_df = self.start_processing_headers()
        self.send_header("Content-Type", "text/csv")
        self.end_headers()
        response_bytes = response_df.to_csv(index=False).encode("utf-8")
        self.write_back_bytes(response_bytes)


class GzippedCSVUserAgentResponder(BaseUserAgentResponder):
    def do_GET(self):
        response_df = self.start_processing_headers()
        self.send_header("Content-Type", "text/csv")
        self.send_header("Content-Encoding", "gzip")
        self.end_headers()
        response_bytes = response_df.to_csv(index=False).encode("utf-8")
        response_bytes = self.gzip_bytes(response_bytes)
        self.write_back_bytes(response_bytes)


class JSONUserAgentResponder(BaseUserAgentResponder):
    def do_GET(self):
        response_df = self.start_processing_headers()
        self.send_header("Content-Type", "application/json")
        self.end_headers()
        response_bytes = response_df.to_json().encode("utf-8")
        self.write_back_bytes(response_bytes)


class GzippedJSONUserAgentResponder(BaseUserAgentResponder):
    def do_GET(self):
        response_df = self.start_processing_headers()
        self.send_header("Content-Type", "application/json")
        self.send_header("Content-Encoding", "gzip")
        self.end_headers()
        response_bytes = response_df.to_json().encode("utf-8")
        response_bytes = self.gzip_bytes(response_bytes)
        self.write_back_bytes(response_bytes)


class ParquetPyArrowUserAgentResponder(BaseUserAgentResponder):
    def do_GET(self):
        response_df = self.start_processing_headers()
        self.send_header("Content-Type", "application/octet-stream")
        self.end_headers()
        response_bytes = response_df.to_parquet(index=False, engine="pyarrow")
        self.write_back_bytes(response_bytes)


class ParquetFastParquetUserAgentResponder(BaseUserAgentResponder):
    def do_GET(self):
        response_df = self.start_processing_headers()
        self.send_header("Content-Type", "application/octet-stream")
        self.end_headers()
        # The fastparquet engine doesn't like writing to a buffer. It can do
        # so via a suitable open_with function, but it then calls the close
        # method automatically and wipes the buffer, so write to fsspec's
        # in-memory filesystem instead. Protected by an importorskip in the
        # respective test.
        import fsspec

        response_df.to_parquet(
            "memory://fastparquet_user_agent.parquet",
            index=False,
            engine="fastparquet",
            compression=None,
        )
        with fsspec.open("memory://fastparquet_user_agent.parquet", "rb") as f:
            response_bytes = f.read()
        self.write_back_bytes(response_bytes)
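
# Note (editorial): the "memory://" URL targets fsspec's in-process
# MemoryFileSystem, so the parquet bytes never touch disk and can be read
# straight back for the HTTP response.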


class PickleUserAgentResponder(BaseUserAgentResponder):
    def do_GET(self):
        response_df = self.start_processing_headers()
        self.send_header("Content-Type", "application/octet-stream")
        self.end_headers()
        bio = BytesIO()
        response_df.to_pickle(bio)
        response_bytes = bio.getvalue()
        self.write_back_bytes(response_bytes)


class StataUserAgentResponder(BaseUserAgentResponder):
    def do_GET(self):
        response_df = self.start_processing_headers()
        self.send_header("Content-Type", "application/octet-stream")
        self.end_headers()
        bio = BytesIO()
        response_df.to_stata(bio, write_index=False)
        response_bytes = bio.getvalue()
        self.write_back_bytes(response_bytes)


class AllHeaderCSVResponder(http.server.BaseHTTPRequestHandler):
    """
    Send all request headers back for checking round trip
    """

    def do_GET(self):
        response_df = pd.DataFrame(self.headers.items())
        self.send_response(200)
        self.send_header("Content-Type", "text/csv")
        self.end_headers()
        response_bytes = response_df.to_csv(index=False).encode("utf-8")
        self.wfile.write(response_bytes)
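
# The response is a two-column CSV of (header name, header value) rows with
# integer column labels; an illustrative fragment (values vary per request):
#   0,1
#   Host,localhost:34567
#   User-Agent,Super Cool One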


def wait_until_ready(func):
    def inner(*args, **kwargs):
        while True:
            try:
                return func(*args, **kwargs)
            except urllib.error.URLError:
                # Connection refused while the http server is starting
                time.sleep(0.1)

    return inner
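
# The tests below wrap each pandas reader with wait_until_ready so that the
# first request retries until the server in the child process is accepting
# connections:
#   read_csv_ready = wait_until_ready(pd.read_csv)
#   df = read_csv_ready(f"http://localhost:{port}")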


def process_server(responder, port):
    with http.server.HTTPServer(("localhost", port), responder) as server:
        # handle_request serves exactly one request; the server then closes
        server.handle_request()
        server.server_close()


@pytest.fixture
def responder(request):
    """
    Fixture that starts a local http server in a separate process on localhost
    and returns the port.

    Running in a separate process instead of a thread to allow termination/killing
    of http server upon cleanup.
    """
    # Find an available port
    with socket.socket() as sock:
        sock.bind(("localhost", 0))
        port = sock.getsockname()[1]

    server_process = multiprocessing.Process(
        target=process_server, args=(request.param, port)
    )
    server_process.start()
    yield port
    server_process.join(10)
    server_process.terminate()
    kill_time = 5
    wait_time = 0
    while server_process.is_alive():
        if wait_time > kill_time:
            server_process.kill()
            break
        else:
            wait_time += 0.1
            time.sleep(0.1)
    server_process.close()
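
# Note (editorial): binding to port 0 asks the OS for a currently free port.
# There is a short race between closing the probe socket and the child process
# binding the same port, which is an acceptable trade-off in a test fixture.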


@pytest.mark.parametrize(
    "responder, read_method, parquet_engine",
    [
        (CSVUserAgentResponder, pd.read_csv, None),
        (JSONUserAgentResponder, pd.read_json, None),
        (ParquetPyArrowUserAgentResponder, pd.read_parquet, "pyarrow"),
        pytest.param(
            ParquetFastParquetUserAgentResponder,
            pd.read_parquet,
            "fastparquet",
            # TODO(ArrayManager) fastparquet
            marks=[
                td.skip_array_manager_not_yet_implemented,
            ],
        ),
        (PickleUserAgentResponder, pd.read_pickle, None),
        (StataUserAgentResponder, pd.read_stata, None),
        (GzippedCSVUserAgentResponder, pd.read_csv, None),
        (GzippedJSONUserAgentResponder, pd.read_json, None),
    ],
    indirect=["responder"],
)
def test_server_and_default_headers(responder, read_method, parquet_engine):
    if parquet_engine is not None:
        pytest.importorskip(parquet_engine)
        if parquet_engine == "fastparquet":
            pytest.importorskip("fsspec")
    read_method = wait_until_ready(read_method)
    if parquet_engine is None:
        df_http = read_method(f"http://localhost:{responder}")
    else:
        df_http = read_method(f"http://localhost:{responder}", engine=parquet_engine)
    assert not df_http.empty


@pytest.mark.parametrize(
    "responder, read_method, parquet_engine",
    [
        (CSVUserAgentResponder, pd.read_csv, None),
        (JSONUserAgentResponder, pd.read_json, None),
        (ParquetPyArrowUserAgentResponder, pd.read_parquet, "pyarrow"),
        pytest.param(
            ParquetFastParquetUserAgentResponder,
            pd.read_parquet,
            "fastparquet",
            # TODO(ArrayManager) fastparquet
            marks=[
                td.skip_array_manager_not_yet_implemented,
            ],
        ),
        (PickleUserAgentResponder, pd.read_pickle, None),
        (StataUserAgentResponder, pd.read_stata, None),
        (GzippedCSVUserAgentResponder, pd.read_csv, None),
        (GzippedJSONUserAgentResponder, pd.read_json, None),
    ],
    indirect=["responder"],
)
def test_server_and_custom_headers(responder, read_method, parquet_engine):
    if parquet_engine is not None:
        pytest.importorskip(parquet_engine)
        if parquet_engine == "fastparquet":
            pytest.importorskip("fsspec")
    custom_user_agent = "Super Cool One"
    df_true = pd.DataFrame({"header": [custom_user_agent]})
    read_method = wait_until_ready(read_method)
    if parquet_engine is None:
        df_http = read_method(
            f"http://localhost:{responder}",
            storage_options={"User-Agent": custom_user_agent},
        )
    else:
        df_http = read_method(
            f"http://localhost:{responder}",
            storage_options={"User-Agent": custom_user_agent},
            engine=parquet_engine,
        )
    tm.assert_frame_equal(df_true, df_http)
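
# For plain http(s) URLs pandas forwards storage_options entries verbatim as
# request headers, which is why the server's echoed "header" column matches
# the custom User-Agent above.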


@pytest.mark.parametrize(
    "responder, read_method",
    [
        (AllHeaderCSVResponder, pd.read_csv),
    ],
    indirect=["responder"],
)
def test_server_and_all_custom_headers(responder, read_method):
    custom_user_agent = "Super Cool One"
    custom_auth_token = "Super Secret One"
    storage_options = {
        "User-Agent": custom_user_agent,
        "Auth": custom_auth_token,
    }
    read_method = wait_until_ready(read_method)
    df_http = read_method(
        f"http://localhost:{responder}",
        storage_options=storage_options,
    )
    df_http = df_http[df_http["0"].isin(storage_options.keys())]
    df_http = df_http.sort_values(["0"]).reset_index()
    df_http = df_http[["0", "1"]]

    keys = list(storage_options.keys())
    df_true = pd.DataFrame({"0": keys, "1": [storage_options[k] for k in keys]})
    df_true = df_true.sort_values(["0"])
    df_true = df_true.reset_index().drop(["index"], axis=1)

    tm.assert_frame_equal(df_true, df_http)


@pytest.mark.parametrize(
    "engine",
    [
        "pyarrow",
        "fastparquet",
    ],
)
def test_to_parquet_to_disk_with_storage_options(engine):
    headers = {
        "User-Agent": "custom",
        "Auth": "other_custom",
    }

    pytest.importorskip(engine)

    true_df = pd.DataFrame({"column_name": ["column_value"]})
    msg = (
        "storage_options passed with file object or non-fsspec file path|"
        "storage_options passed with buffer, or non-supported URL"
    )
    with pytest.raises(ValueError, match=msg):
        true_df.to_parquet("/tmp/junk.parquet", storage_options=headers, engine=engine)