forked from pandas-dev/pandas
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtest_gcs.py
206 lines (168 loc) · 6.2 KB
/
test_gcs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
from io import BytesIO
import os
import tarfile
import zipfile
import numpy as np
import pytest
from pandas import (
DataFrame,
date_range,
read_csv,
read_excel,
read_json,
read_parquet,
)
import pandas._testing as tm
from pandas.tests.io.test_compression import _compression_to_extension
from pandas.util import _test_decorators as td
@pytest.fixture
def gcs_buffer(monkeypatch):
    """
    Emulate GCS using a binary buffer.

    ``gcsfs.GCSFileSystem`` is monkeypatched with a mock file system whose
    ``open`` always rewinds and returns one shared in-memory buffer. That
    buffer is returned so tests can inspect what was written to "GCS".
    """
    from fsspec import (
        AbstractFileSystem,
        registry,
    )

    registry.target.clear()  # remove state

    def _close_noop():
        # Replaces ``BytesIO.close`` so that callers closing the handle do
        # not discard the buffered bytes.
        return True

    shared_buffer = BytesIO()
    shared_buffer.close = _close_noop

    class MockGCSFileSystem(AbstractFileSystem):
        def open(*args, **kwargs):  # pylint: disable=no-method-argument
            # Every open hands back the same buffer, positioned at the start.
            shared_buffer.seek(0)
            return shared_buffer

        def ls(self, path, **kwargs):
            # needed for pyarrow
            entry = {"name": path, "type": "file"}
            return [entry]

    monkeypatch.setattr("gcsfs.GCSFileSystem", MockGCSFileSystem)

    return shared_buffer
@td.skip_if_no("gcsfs")
@pytest.mark.parametrize("format", ["csv", "json", "parquet", "excel", "markdown"])
def test_to_read_gcs(gcs_buffer, format):
    """
    Test that many to/read functions support GCS.

    Each parametrized format writes a small frame to a ``gs://`` path and,
    where a reader exists, reads it back and compares with the original.

    GH 33987
    """
    from fsspec import registry

    registry.target.clear()  # remove state

    source = DataFrame(
        {
            "int": [1, 3],
            "float": [2.0, np.nan],
            "str": ["t", "s"],
            "dt": date_range("2018-06-18", periods=2),
        }
    )

    path = f"gs://test/test.{format}"

    if format == "markdown":
        pytest.importorskip("tabulate")
        source.to_markdown(path)
        # No reader is used for markdown here; only the write path is
        # exercised.
        roundtripped = source
    elif format == "parquet":
        pytest.importorskip("pyarrow")
        source.to_parquet(path)
        roundtripped = read_parquet(path)
    elif format == "json":
        source.to_json(path)
        roundtripped = read_json(path, convert_dates=["dt"])
    elif format == "excel":
        # Excel uses its own file extension rather than the format name.
        path = "gs://test/test.xls"
        source.to_excel(path)
        roundtripped = read_excel(path, parse_dates=["dt"], index_col=0)
    elif format == "csv":
        source.to_csv(path, index=True)
        roundtripped = read_csv(path, parse_dates=["dt"], index_col=0)

    tm.assert_frame_equal(source, roundtripped)
def assert_equal_zip_safe(result: bytes, expected: bytes, compression: str):
    """
    Assert that two compressed payloads are equivalent.

    For zip compression, only compare the CRC-32 checksum of the file contents
    to avoid checking the time-dependent last-modified timestamp which
    in some CI builds is off-by-one

    See https://en.wikipedia.org/wiki/ZIP_(file_format)#File_headers

    For tar compression, the extracted contents of each member are compared.
    For every other compression the raw bytes must match exactly.

    Parameters
    ----------
    result : bytes
        The payload produced by the code under test.
    expected : bytes
        The reference payload.
    compression : str
        Compression method name; ``"zip"`` and ``"tar"`` get archive-aware
        comparison, anything else is compared byte-for-byte.

    Raises
    ------
    AssertionError
        If the payloads differ, including when the archives hold a different
        number of members.
    """
    if compression == "zip":
        # Only compare the CRC checksum of the file contents
        with zipfile.ZipFile(BytesIO(result)) as res, zipfile.ZipFile(
            BytesIO(expected)
        ) as exp:
            res_infos = res.infolist()
            exp_infos = exp.infolist()
            # zip() stops at the shorter sequence, so a missing or extra
            # member would otherwise go unnoticed.
            assert len(res_infos) == len(exp_infos)
            for res_info, exp_info in zip(res_infos, exp_infos):
                assert res_info.CRC == exp_info.CRC
    elif compression == "tar":
        with tarfile.open(fileobj=BytesIO(result)) as tar_res, tarfile.open(
            fileobj=BytesIO(expected)
        ) as tar_exp:
            res_members = tar_res.getmembers()
            exp_members = tar_exp.getmembers()
            # Same reasoning as for zip: guard against silent truncation.
            assert len(res_members) == len(exp_members)
            for tar_res_info, tar_exp_info in zip(res_members, exp_members):
                actual_file = tar_res.extractfile(tar_res_info)
                expected_file = tar_exp.extractfile(tar_exp_info)
                # extractfile returns None for members without file content;
                # both sides must agree on that.
                assert (actual_file is None) == (expected_file is None)
                if actual_file is not None and expected_file is not None:
                    assert actual_file.read() == expected_file.read()
    else:
        assert result == expected
@td.skip_if_no("gcsfs")
@pytest.mark.parametrize("encoding", ["utf-8", "cp1251"])
def test_to_csv_compression_encoding_gcs(gcs_buffer, compression_only, encoding):
    """
    Compression and encoding should work with GCS.

    The frame is written once to a local buffer as the reference, then to a
    ``gs://`` path with explicit compression and again with compression
    inferred from the file extension. Each GCS write is compared against the
    reference and read back.

    GH 35677 (to_csv, compression), GH 26124 (to_csv, encoding), and
    GH 32392 (read_csv, encoding)
    """
    from fsspec import registry

    registry.target.clear()  # remove state

    frame = tm.makeDataFrame()

    # reference of compressed and encoded file
    write_options = {"method": compression_only}
    if compression_only == "gzip":
        write_options["mtime"] = 1  # be reproducible
    reference = BytesIO()
    frame.to_csv(reference, compression=write_options, encoding=encoding, mode="wb")

    def check_roundtrip(target, read_compression):
        # Write to the mocked GCS path, compare the stored bytes with the
        # reference, then read the file back and compare the frames.
        frame.to_csv(target, compression=write_options, encoding=encoding)
        written = gcs_buffer.getvalue()
        wanted = reference.getvalue()
        assert_equal_zip_safe(written, wanted, compression_only)

        loaded = read_csv(
            target, index_col=0, compression=read_compression, encoding=encoding
        )
        tm.assert_frame_equal(frame, loaded)

    # write compressed file with explicit compression
    base_path = "gs://test/test.csv"
    check_roundtrip(base_path, compression_only)

    # write compressed file with implicit compression
    extension = _compression_to_extension[compression_only]
    write_options["method"] = "infer"
    check_roundtrip(f"{base_path}.{extension}", "infer")
@td.skip_if_no("fastparquet")
@td.skip_if_no("gcsfs")
def test_to_parquet_gcs_new_file(monkeypatch, tmpdir):
    """
    Regression test for writing to a not-yet-existent GCS Parquet file.

    The mocked file system raises ``FileNotFoundError`` for any open whose
    mode lacks ``"w"`` and redirects write opens to a file under ``tmpdir``.
    """
    from fsspec import (
        AbstractFileSystem,
        registry,
    )

    registry.target.clear()  # remove state

    frame = DataFrame(
        {
            "int": [1, 3],
            "float": [2.0, np.nan],
            "str": ["t", "s"],
            "dt": date_range("2018-06-18", periods=2),
        }
    )

    # Local file that stands in for the remote object on write.
    local_target = os.path.join(tmpdir, "test.parquet")

    class MockGCSFileSystem(AbstractFileSystem):
        def open(self, path, mode="r", *args):
            if "w" in mode:
                return open(local_target, mode)
            # Any non-write open behaves as if the file does not exist yet.
            raise FileNotFoundError

    monkeypatch.setattr("gcsfs.GCSFileSystem", MockGCSFileSystem)

    frame.to_parquet(
        "gs://test/test.csv", index=True, engine="fastparquet", compression=None
    )
@td.skip_if_installed("gcsfs")
def test_gcs_not_present_exception():
    """Reading a ``gs://`` path when gcsfs is not installed raises ImportError."""
    gcs_path = "gs://test/test.csv"
    expect_import_error = tm.external_error_raised(ImportError)
    with expect_import_error:
        read_csv(gcs_path)