Skip to content

Commit df23f91

Browse files
committed
Merge pull request #10649 from mdagost/url_gzip_fix
ENH: allow gzip de-compression for files specified by a url
2 parents 0d9bfa1 + a92bd76 commit df23f91

File tree

8 files changed

+59
-20
lines changed

8 files changed

+59
-20
lines changed

doc/source/whatsnew/v0.17.0.txt

+1
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ New features
2929

3030
- SQL io functions now accept a SQLAlchemy connectable. (:issue:`7877`)
3131
- Enable writing complex values to HDF stores when using table format (:issue:`10447`)
32+
- Enable reading gzip compressed files via URL, either by explicitly setting the compression parameter or by inferring from the presence of the HTTP Content-Encoding header in the response (:issue:`8685`)
3233

3334
.. _whatsnew_0170.enhancements.other:
3435

pandas/io/common.py

+21-8
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,7 @@ def _is_s3_url(url):
7373
return False
7474

7575

76-
def maybe_read_encoded_stream(reader, encoding=None):
76+
def maybe_read_encoded_stream(reader, encoding=None, compression=None):
7777
"""read an encoded stream from the reader and transform the bytes to
7878
unicode if required based on the encoding
7979
@@ -94,8 +94,14 @@ def maybe_read_encoded_stream(reader, encoding=None):
9494
else:
9595
errors = 'replace'
9696
encoding = 'utf-8'
97-
reader = StringIO(reader.read().decode(encoding, errors))
97+
98+
if compression == 'gzip':
99+
reader = BytesIO(reader.read())
100+
else:
101+
reader = StringIO(reader.read().decode(encoding, errors))
98102
else:
103+
if compression == 'gzip':
104+
reader = BytesIO(reader.read())
99105
encoding = None
100106
return reader, encoding
101107

@@ -118,7 +124,8 @@ def _expand_user(filepath_or_buffer):
118124
return filepath_or_buffer
119125

120126

121-
def get_filepath_or_buffer(filepath_or_buffer, encoding=None):
127+
def get_filepath_or_buffer(filepath_or_buffer, encoding=None,
128+
compression=None):
122129
"""
123130
If the filepath_or_buffer is a url, translate and return the buffer
124131
passthru otherwise.
@@ -130,12 +137,19 @@ def get_filepath_or_buffer(filepath_or_buffer, encoding=None):
130137
131138
Returns
132139
-------
133-
a filepath_or_buffer, the encoding
140+
a filepath_or_buffer, the encoding, the compression
134141
"""
135142

136143
if _is_url(filepath_or_buffer):
137144
req = _urlopen(str(filepath_or_buffer))
138-
return maybe_read_encoded_stream(req, encoding)
145+
if compression == 'infer':
146+
content_encoding = req.headers.get('Content-Encoding', None)
147+
if content_encoding == 'gzip':
148+
compression = 'gzip'
149+
# concatenate the compression onto the tuple returned by the function
150+
to_return = list(maybe_read_encoded_stream(req, encoding, compression)) + \
151+
[compression]
152+
return tuple(to_return)
139153

140154
if _is_s3_url(filepath_or_buffer):
141155
try:
@@ -156,10 +170,9 @@ def get_filepath_or_buffer(filepath_or_buffer, encoding=None):
156170
k.key = parsed_url.path
157171
filepath_or_buffer = BytesIO(k.get_contents_as_string(
158172
encoding=encoding))
159-
return filepath_or_buffer, None
160-
173+
return filepath_or_buffer, None, compression
161174

162-
return _expand_user(filepath_or_buffer), None
175+
return _expand_user(filepath_or_buffer), None, compression
163176

164177

165178
def file_path_to_url(path):

pandas/io/json.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -172,7 +172,7 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True,
172172
result : Series or DataFrame
173173
"""
174174

175-
filepath_or_buffer, _ = get_filepath_or_buffer(path_or_buf)
175+
filepath_or_buffer, _, _ = get_filepath_or_buffer(path_or_buf)
176176
if isinstance(filepath_or_buffer, compat.string_types):
177177
try:
178178
exists = os.path.exists(filepath_or_buffer)

pandas/io/packers.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -126,7 +126,7 @@ def read_msgpack(path_or_buf, iterator=False, **kwargs):
126126
obj : type of object stored in file
127127
128128
"""
129-
path_or_buf, _ = get_filepath_or_buffer(path_or_buf)
129+
path_or_buf, _, _ = get_filepath_or_buffer(path_or_buf)
130130
if iterator:
131131
return Iterator(path_or_buf)
132132

pandas/io/parsers.py

+11-7
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
import pandas.tslib as tslib
2727
import pandas.parser as _parser
2828

29+
2930
class ParserWarning(Warning):
3031
pass
3132

@@ -234,8 +235,10 @@ def _read(filepath_or_buffer, kwds):
234235
if skipfooter is not None:
235236
kwds['skip_footer'] = skipfooter
236237

237-
filepath_or_buffer, _ = get_filepath_or_buffer(filepath_or_buffer,
238-
encoding)
238+
filepath_or_buffer, _, compression = get_filepath_or_buffer(filepath_or_buffer,
239+
encoding,
240+
compression=kwds.get('compression', None))
241+
kwds['compression'] = compression
239242

240243
if kwds.get('date_parser', None) is not None:
241244
if isinstance(kwds['parse_dates'], bool):
@@ -402,8 +405,9 @@ def parser_f(filepath_or_buffer,
402405
delimiter = sep
403406

404407
if delim_whitespace and delimiter is not default_sep:
405-
raise ValueError("Specified a delimiter with both sep and"\
406-
" delim_whitespace=True; you can only specify one.")
408+
raise ValueError("Specified a delimiter with both sep and"
409+
" delim_whitespace=True; you can only"
410+
" specify one.")
407411

408412
if engine is not None:
409413
engine_specified = True
@@ -1711,7 +1715,7 @@ def _infer_columns(self):
17111715
num_original_columns = ncols
17121716
if not names:
17131717
if self.prefix:
1714-
columns = [['%s%d' % (self.prefix,i) for i in range(ncols)]]
1718+
columns = [['%s%d' % (self.prefix, i) for i in range(ncols)]]
17151719
else:
17161720
columns = [lrange(ncols)]
17171721
columns = self._handle_usecols(columns, columns[0])
@@ -2233,8 +2237,8 @@ def _get_empty_meta(columns, index_col, index_names, dtype=None):
22332237
if index_col is None or index_col is False:
22342238
index = Index([])
22352239
else:
2236-
index = [ np.empty(0, dtype=dtype.get(index_name, np.object))
2237-
for index_name in index_names ]
2240+
index = [np.empty(0, dtype=dtype.get(index_name, np.object))
2241+
for index_name in index_names]
22382242
index = MultiIndex.from_arrays(index, names=index_names)
22392243
index_col.sort()
22402244
for i, n in enumerate(index_col):

pandas/io/stata.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -934,7 +934,7 @@ def __init__(self, path_or_buf, convert_dates=True,
934934

935935
self._native_byteorder = _set_endianness(sys.byteorder)
936936
if isinstance(path_or_buf, str):
937-
path_or_buf, encoding = get_filepath_or_buffer(
937+
path_or_buf, encoding, _ = get_filepath_or_buffer(
938938
path_or_buf, encoding=self._default_encoding
939939
)
940940

pandas/io/tests/test_common.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -29,12 +29,12 @@ def test_expand_user_normal_path(self):
2929

3030
def test_get_filepath_or_buffer_with_path(self):
3131
filename = '~/sometest'
32-
filepath_or_buffer, _ = common.get_filepath_or_buffer(filename)
32+
filepath_or_buffer, _, _ = common.get_filepath_or_buffer(filename)
3333
self.assertNotEqual(filepath_or_buffer, filename)
3434
self.assertNotIn('~', filepath_or_buffer)
3535
self.assertEqual(os.path.expanduser(filename), filepath_or_buffer)
3636

3737
def test_get_filepath_or_buffer_with_buffer(self):
3838
input_buffer = StringIO()
39-
filepath_or_buffer, _ = common.get_filepath_or_buffer(input_buffer)
39+
filepath_or_buffer, _, _ = common.get_filepath_or_buffer(input_buffer)
4040
self.assertEqual(filepath_or_buffer, input_buffer)

pandas/io/tests/test_parsers.py

+21
Original file line numberDiff line numberDiff line change
@@ -3089,6 +3089,7 @@ def test_whitespace_lines(self):
30893089
df = self.read_csv(StringIO(data))
30903090
tm.assert_almost_equal(df.values, expected)
30913091

3092+
30923093
class TestFwfColspaceSniffing(tm.TestCase):
30933094
def test_full_file(self):
30943095
# File with all values
@@ -4117,6 +4118,26 @@ def test_convert_sql_column_decimals(self):
41174118
assert_same_values_and_dtype(result, expected)
41184119

41194120

4121+
class TestUrlGz(tm.TestCase):
4122+
def setUp(self):
4123+
dirpath = tm.get_data_path()
4124+
localtable = os.path.join(dirpath, 'salary.table')
4125+
self.local_table = read_table(localtable)
4126+
4127+
@tm.network
4128+
def test_url_gz(self):
4129+
url = ('https://raw.github.com/mdagost/pandas/url_gzip_fix/'
4130+
'pandas/io/tests/data/salary.table.gz')
4131+
url_table = read_table(url, compression="gzip", engine="python")
4132+
tm.assert_frame_equal(url_table, self.local_table)
4133+
4134+
@tm.network
4135+
def test_url_gz_infer(self):
4136+
url = ('https://s3.amazonaws.com/pandas-url-test/salary.table.gz')
4137+
url_table = read_table(url, compression="infer", engine="python")
4138+
tm.assert_frame_equal(url_table, self.local_table)
4139+
4140+
41204141
class TestS3(tm.TestCase):
41214142
def setUp(self):
41224143
try:

0 commit comments

Comments
 (0)