diff --git a/doc/source/whatsnew/v0.17.0.txt b/doc/source/whatsnew/v0.17.0.txt index b2a1e10469a0f..f0dd787654e67 100644 --- a/doc/source/whatsnew/v0.17.0.txt +++ b/doc/source/whatsnew/v0.17.0.txt @@ -29,6 +29,7 @@ New features - SQL io functions now accept a SQLAlchemy connectable. (:issue:`7877`) - Enable writing complex values to HDF stores when using table format (:issue:`10447`) +- Enable reading gzip compressed files via URL, either by explicitly setting the compression parameter or by inferring from the presence of the HTTP Content-Encoding header in the response (:issue:`8685`) .. _whatsnew_0170.enhancements.other: diff --git a/pandas/io/common.py b/pandas/io/common.py index 65cfdff1df14b..b7b663ba61a55 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -73,7 +73,7 @@ def _is_s3_url(url): return False -def maybe_read_encoded_stream(reader, encoding=None): +def maybe_read_encoded_stream(reader, encoding=None, compression=None): """read an encoded stream from the reader and transform the bytes to unicode if required based on the encoding @@ -94,8 +94,14 @@ def maybe_read_encoded_stream(reader, encoding=None): else: errors = 'replace' encoding = 'utf-8' - reader = StringIO(reader.read().decode(encoding, errors)) + + if compression == 'gzip': + reader = BytesIO(reader.read()) + else: + reader = StringIO(reader.read().decode(encoding, errors)) else: + if compression == 'gzip': + reader = BytesIO(reader.read()) encoding = None return reader, encoding @@ -118,7 +124,8 @@ def _expand_user(filepath_or_buffer): return filepath_or_buffer -def get_filepath_or_buffer(filepath_or_buffer, encoding=None): +def get_filepath_or_buffer(filepath_or_buffer, encoding=None, + compression=None): """ If the filepath_or_buffer is a url, translate and return the buffer passthru otherwise. 
@@ -130,12 +137,19 @@ def get_filepath_or_buffer(filepath_or_buffer, encoding=None): Returns ------- - a filepath_or_buffer, the encoding + a filepath_or_buffer, the encoding, the compression """ if _is_url(filepath_or_buffer): req = _urlopen(str(filepath_or_buffer)) - return maybe_read_encoded_stream(req, encoding) + if compression == 'infer': + content_encoding = req.headers.get('Content-Encoding', None) + if content_encoding == 'gzip': + compression = 'gzip' + # concatenate the compression onto the tuple returned by the function + to_return = list(maybe_read_encoded_stream(req, encoding, compression)) + \ + [compression] + return tuple(to_return) if _is_s3_url(filepath_or_buffer): try: @@ -156,10 +170,9 @@ def get_filepath_or_buffer(filepath_or_buffer, encoding=None): k.key = parsed_url.path filepath_or_buffer = BytesIO(k.get_contents_as_string( encoding=encoding)) - return filepath_or_buffer, None - + return filepath_or_buffer, None, compression - return _expand_user(filepath_or_buffer), None + return _expand_user(filepath_or_buffer), None, compression def file_path_to_url(path): diff --git a/pandas/io/json.py b/pandas/io/json.py index 2c1333326b701..81a916e058b3d 100644 --- a/pandas/io/json.py +++ b/pandas/io/json.py @@ -172,7 +172,7 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True, result : Series or DataFrame """ - filepath_or_buffer, _ = get_filepath_or_buffer(path_or_buf) + filepath_or_buffer, _, _ = get_filepath_or_buffer(path_or_buf) if isinstance(filepath_or_buffer, compat.string_types): try: exists = os.path.exists(filepath_or_buffer) diff --git a/pandas/io/packers.py b/pandas/io/packers.py index 847a7c4f90216..f761ea6bf62e3 100644 --- a/pandas/io/packers.py +++ b/pandas/io/packers.py @@ -126,7 +126,7 @@ def read_msgpack(path_or_buf, iterator=False, **kwargs): obj : type of object stored in file """ - path_or_buf, _ = get_filepath_or_buffer(path_or_buf) + path_or_buf, _, _ = get_filepath_or_buffer(path_or_buf) if iterator: return 
Iterator(path_or_buf) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 62d51fc510f97..73ffefd089647 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -26,6 +26,7 @@ import pandas.tslib as tslib import pandas.parser as _parser + class ParserWarning(Warning): pass @@ -234,8 +235,10 @@ def _read(filepath_or_buffer, kwds): if skipfooter is not None: kwds['skip_footer'] = skipfooter - filepath_or_buffer, _ = get_filepath_or_buffer(filepath_or_buffer, - encoding) + filepath_or_buffer, _, compression = get_filepath_or_buffer(filepath_or_buffer, + encoding, + compression=kwds.get('compression', None)) + kwds['compression'] = compression if kwds.get('date_parser', None) is not None: if isinstance(kwds['parse_dates'], bool): @@ -402,8 +405,9 @@ def parser_f(filepath_or_buffer, delimiter = sep if delim_whitespace and delimiter is not default_sep: - raise ValueError("Specified a delimiter with both sep and"\ - " delim_whitespace=True; you can only specify one.") + raise ValueError("Specified a delimiter with both sep and" + " delim_whitespace=True; you can only" + " specify one.") if engine is not None: engine_specified = True @@ -1711,7 +1715,7 @@ def _infer_columns(self): num_original_columns = ncols if not names: if self.prefix: - columns = [['%s%d' % (self.prefix,i) for i in range(ncols)]] + columns = [['%s%d' % (self.prefix, i) for i in range(ncols)]] else: columns = [lrange(ncols)] columns = self._handle_usecols(columns, columns[0]) @@ -2233,8 +2237,8 @@ def _get_empty_meta(columns, index_col, index_names, dtype=None): if index_col is None or index_col is False: index = Index([]) else: - index = [ np.empty(0, dtype=dtype.get(index_name, np.object)) - for index_name in index_names ] + index = [np.empty(0, dtype=dtype.get(index_name, np.object)) + for index_name in index_names] index = MultiIndex.from_arrays(index, names=index_names) index_col.sort() for i, n in enumerate(index_col): diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 
db9362c5c821e..6e72c1c2f0cc0 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -932,7 +932,7 @@ def __init__(self, path_or_buf, convert_dates=True, self._native_byteorder = _set_endianness(sys.byteorder) if isinstance(path_or_buf, str): - path_or_buf, encoding = get_filepath_or_buffer( + path_or_buf, encoding, _ = get_filepath_or_buffer( path_or_buf, encoding=self._default_encoding ) diff --git a/pandas/io/tests/data/salary.table.gz b/pandas/io/tests/data/salary.table.gz new file mode 100644 index 0000000000000..629de9703d345 Binary files /dev/null and b/pandas/io/tests/data/salary.table.gz differ diff --git a/pandas/io/tests/test_common.py b/pandas/io/tests/test_common.py index fe163cc13c5da..34e7c94b64bcb 100644 --- a/pandas/io/tests/test_common.py +++ b/pandas/io/tests/test_common.py @@ -29,12 +29,12 @@ def test_expand_user_normal_path(self): def test_get_filepath_or_buffer_with_path(self): filename = '~/sometest' - filepath_or_buffer, _ = common.get_filepath_or_buffer(filename) + filepath_or_buffer, _, _ = common.get_filepath_or_buffer(filename) self.assertNotEqual(filepath_or_buffer, filename) self.assertNotIn('~', filepath_or_buffer) self.assertEqual(os.path.expanduser(filename), filepath_or_buffer) def test_get_filepath_or_buffer_with_buffer(self): input_buffer = StringIO() - filepath_or_buffer, _ = common.get_filepath_or_buffer(input_buffer) + filepath_or_buffer, _, _ = common.get_filepath_or_buffer(input_buffer) self.assertEqual(filepath_or_buffer, input_buffer) diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py index 0f0486e8ea596..b9e9ec3a391ec 100755 --- a/pandas/io/tests/test_parsers.py +++ b/pandas/io/tests/test_parsers.py @@ -3070,6 +3070,7 @@ def test_whitespace_lines(self): df = self.read_csv(StringIO(data)) tm.assert_almost_equal(df.values, expected) + class TestFwfColspaceSniffing(tm.TestCase): def test_full_file(self): # File with all values @@ -4060,6 +4061,26 @@ def test_convert_sql_column_decimals(self): 
assert_same_values_and_dtype(result, expected) +class TestUrlGz(tm.TestCase): + def setUp(self): + dirpath = tm.get_data_path() + localtable = os.path.join(dirpath, 'salary.table') + self.local_table = read_table(localtable) + + @tm.network + def test_url_gz(self): + url = ('https://raw.github.com/mdagost/pandas/url_gzip_fix/' + 'pandas/io/tests/data/salary.table.gz') + url_table = read_table(url, compression="gzip", engine="python") + tm.assert_frame_equal(url_table, self.local_table) + + @tm.network + def test_url_gz_infer(self): + url = ('https://s3.amazonaws.com/pandas-url-test/salary.table.gz') + url_table = read_table(url, compression="infer", engine="python") + tm.assert_frame_equal(url_table, self.local_table) + + class TestS3(tm.TestCase): def setUp(self): try: