Skip to content

Commit df23f91

Browse files
committed
Merge pull request #10649 from mdagost/url_gzip_fix
ENH: allow gzip de-compression for files specified by a url
2 parents 0d9bfa1 + a92bd76 commit df23f91

File tree

8 files changed

+59
-20
lines changed

8 files changed

+59
-20
lines changed

doc/source/whatsnew/v0.17.0.txt

+1
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ New features
2929

3030
- SQL io functions now accept a SQLAlchemy connectable. (:issue:`7877`)
3131
- Enable writing complex values to HDF stores when using table format (:issue:`10447`)
32+
- Enable reading gzip compressed files via URL, either by explicitly setting the compression parameter or by inferring from the presence of the HTTP Content-Encoding header in the response (:issue:`8685`)
3233

3334
.. _whatsnew_0170.enhancements.other:
3435

pandas/io/common.py

+21-8
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,7 @@ def _is_s3_url(url):
7373
return False
7474

7575

76-
def maybe_read_encoded_stream(reader, encoding=None):
76+
def maybe_read_encoded_stream(reader, encoding=None, compression=None):
7777
"""read an encoded stream from the reader and transform the bytes to
7878
unicode if required based on the encoding
7979
@@ -94,8 +94,14 @@ def maybe_read_encoded_stream(reader, encoding=None):
9494
else:
9595
errors = 'replace'
9696
encoding = 'utf-8'
97-
reader = StringIO(reader.read().decode(encoding, errors))
97+
98+
if compression == 'gzip':
99+
reader = BytesIO(reader.read())
100+
else:
101+
reader = StringIO(reader.read().decode(encoding, errors))
98102
else:
103+
if compression == 'gzip':
104+
reader = BytesIO(reader.read())
99105
encoding = None
100106
return reader, encoding
101107

@@ -118,7 +124,8 @@ def _expand_user(filepath_or_buffer):
118124
return filepath_or_buffer
119125

120126

121-
def get_filepath_or_buffer(filepath_or_buffer, encoding=None):
127+
def get_filepath_or_buffer(filepath_or_buffer, encoding=None,
128+
compression=None):
122129
"""
123130
If the filepath_or_buffer is a url, translate and return the buffer
124131
passthru otherwise.
@@ -130,12 +137,19 @@ def get_filepath_or_buffer(filepath_or_buffer, encoding=None):
130137
131138
Returns
132139
-------
133-
a filepath_or_buffer, the encoding
140+
a filepath_or_buffer, the encoding, the compression
134141
"""
135142

136143
if _is_url(filepath_or_buffer):
137144
req = _urlopen(str(filepath_or_buffer))
138-
return maybe_read_encoded_stream(req, encoding)
145+
if compression == 'infer':
146+
content_encoding = req.headers.get('Content-Encoding', None)
147+
if content_encoding == 'gzip':
148+
compression = 'gzip'
149+
# concatenate the compression onto the tuple returned by the function
150+
to_return = list(maybe_read_encoded_stream(req, encoding, compression)) + \
151+
[compression]
152+
return tuple(to_return)
139153

140154
if _is_s3_url(filepath_or_buffer):
141155
try:
@@ -156,10 +170,9 @@ def get_filepath_or_buffer(filepath_or_buffer, encoding=None):
156170
k.key = parsed_url.path
157171
filepath_or_buffer = BytesIO(k.get_contents_as_string(
158172
encoding=encoding))
159-
return filepath_or_buffer, None
160-
173+
return filepath_or_buffer, None, compression
161174

162-
return _expand_user(filepath_or_buffer), None
175+
return _expand_user(filepath_or_buffer), None, compression
163176

164177

165178
def file_path_to_url(path):

pandas/io/json.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -172,7 +172,7 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True,
172172
result : Series or DataFrame
173173
"""
174174

175-
filepath_or_buffer, _ = get_filepath_or_buffer(path_or_buf)
175+
filepath_or_buffer, _, _ = get_filepath_or_buffer(path_or_buf)
176176
if isinstance(filepath_or_buffer, compat.string_types):
177177
try:
178178
exists = os.path.exists(filepath_or_buffer)

pandas/io/packers.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -126,7 +126,7 @@ def read_msgpack(path_or_buf, iterator=False, **kwargs):
126126
obj : type of object stored in file
127127
128128
"""
129-
path_or_buf, _ = get_filepath_or_buffer(path_or_buf)
129+
path_or_buf, _, _ = get_filepath_or_buffer(path_or_buf)
130130
if iterator:
131131
return Iterator(path_or_buf)
132132

pandas/io/parsers.py

+11-7
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
import pandas.tslib as tslib
2727
import pandas.parser as _parser
2828

29+
2930
class ParserWarning(Warning):
3031
pass
3132

@@ -234,8 +235,10 @@ def _read(filepath_or_buffer, kwds):
234235
if skipfooter is not None:
235236
kwds['skip_footer'] = skipfooter
236237

237-
filepath_or_buffer, _ = get_filepath_or_buffer(filepath_or_buffer,
238-
encoding)
238+
filepath_or_buffer, _, compression = get_filepath_or_buffer(filepath_or_buffer,
239+
encoding,
240+
compression=kwds.get('compression', None))
241+
kwds['compression'] = compression
239242

240243
if kwds.get('date_parser', None) is not None:
241244
if isinstance(kwds['parse_dates'], bool):
@@ -402,8 +405,9 @@ def parser_f(filepath_or_buffer,
402405
delimiter = sep
403406

404407
if delim_whitespace and delimiter is not default_sep:
405-
raise ValueError("Specified a delimiter with both sep and"\
406-
" delim_whitespace=True; you can only specify one.")
408+
raise ValueError("Specified a delimiter with both sep and"
409+
" delim_whitespace=True; you can only"
410+
" specify one.")
407411

408412
if engine is not None:
409413
engine_specified = True
@@ -1711,7 +1715,7 @@ def _infer_columns(self):
17111715
num_original_columns = ncols
17121716
if not names:
17131717
if self.prefix:
1714-
columns = [['%s%d' % (self.prefix,i) for i in range(ncols)]]
1718+
columns = [['%s%d' % (self.prefix, i) for i in range(ncols)]]
17151719
else:
17161720
columns = [lrange(ncols)]
17171721
columns = self._handle_usecols(columns, columns[0])
@@ -2233,8 +2237,8 @@ def _get_empty_meta(columns, index_col, index_names, dtype=None):
22332237
if index_col is None or index_col is False:
22342238
index = Index([])
22352239
else:
2236-
index = [ np.empty(0, dtype=dtype.get(index_name, np.object))
2237-
for index_name in index_names ]
2240+
index = [np.empty(0, dtype=dtype.get(index_name, np.object))
2241+
for index_name in index_names]
22382242
index = MultiIndex.from_arrays(index, names=index_names)
22392243
index_col.sort()
22402244
for i, n in enumerate(index_col):

pandas/io/stata.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -934,7 +934,7 @@ def __init__(self, path_or_buf, convert_dates=True,
934934

935935
self._native_byteorder = _set_endianness(sys.byteorder)
936936
if isinstance(path_or_buf, str):
937-
path_or_buf, encoding = get_filepath_or_buffer(
937+
path_or_buf, encoding, _ = get_filepath_or_buffer(
938938
path_or_buf, encoding=self._default_encoding
939939
)
940940

pandas/io/tests/test_common.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -29,12 +29,12 @@ def test_expand_user_normal_path(self):
2929

3030
def test_get_filepath_or_buffer_with_path(self):
3131
filename = '~/sometest'
32-
filepath_or_buffer, _ = common.get_filepath_or_buffer(filename)
32+
filepath_or_buffer, _, _ = common.get_filepath_or_buffer(filename)
3333
self.assertNotEqual(filepath_or_buffer, filename)
3434
self.assertNotIn('~', filepath_or_buffer)
3535
self.assertEqual(os.path.expanduser(filename), filepath_or_buffer)
3636

3737
def test_get_filepath_or_buffer_with_buffer(self):
3838
input_buffer = StringIO()
39-
filepath_or_buffer, _ = common.get_filepath_or_buffer(input_buffer)
39+
filepath_or_buffer, _, _ = common.get_filepath_or_buffer(input_buffer)
4040
self.assertEqual(filepath_or_buffer, input_buffer)

pandas/io/tests/test_parsers.py

+21
Original file line numberDiff line numberDiff line change
@@ -3089,6 +3089,7 @@ def test_whitespace_lines(self):
30893089
df = self.read_csv(StringIO(data))
30903090
tm.assert_almost_equal(df.values, expected)
30913091

3092+
30923093
class TestFwfColspaceSniffing(tm.TestCase):
30933094
def test_full_file(self):
30943095
# File with all values
@@ -4117,6 +4118,26 @@ def test_convert_sql_column_decimals(self):
41174118
assert_same_values_and_dtype(result, expected)
41184119

41194120

4121+
class TestUrlGz(tm.TestCase):
4122+
def setUp(self):
4123+
dirpath = tm.get_data_path()
4124+
localtable = os.path.join(dirpath, 'salary.table')
4125+
self.local_table = read_table(localtable)
4126+
4127+
@tm.network
4128+
def test_url_gz(self):
4129+
url = ('https://raw.github.com/mdagost/pandas/url_gzip_fix/'
4130+
'pandas/io/tests/data/salary.table.gz')
4131+
url_table = read_table(url, compression="gzip", engine="python")
4132+
tm.assert_frame_equal(url_table, self.local_table)
4133+
4134+
@tm.network
4135+
def test_url_gz_infer(self):
4136+
url = ('https://s3.amazonaws.com/pandas-url-test/salary.table.gz')
4137+
url_table = read_table(url, compression="infer", engine="python")
4138+
tm.assert_frame_equal(url_table, self.local_table)
4139+
4140+
41204141
class TestS3(tm.TestCase):
41214142
def setUp(self):
41224143
try:

0 commit comments

Comments
 (0)