Skip to content

Commit da6ad3f

Browse files
committed
Merge pull request #11074 from stephen-hoover/infer-s3-compression
ENH Add check for inferred compression before `get_filepath_or_buffer`
2 parents 6bdf948 + a49b2cd commit da6ad3f

File tree

5 files changed

+33
-25
lines changed

5 files changed

+33
-25
lines changed

doc/source/whatsnew/v0.17.0.txt

+2
Original file line numberDiff line numberDiff line change
@@ -481,6 +481,8 @@ Other enhancements
481481

482482
- Read CSV files from AWS S3 incrementally, instead of first downloading the entire file. (Full file download still required for compressed files in Python 2.) (:issue:`11070`, :issue:`11073`)
483483

484+
- ``pd.read_csv`` is now able to infer compression type for files read from AWS S3 storage (:issue:`11070`, :issue:`11074`).
485+
484486
.. _whatsnew_0170.api:
485487

486488
.. _whatsnew_0170.api_breaking:

pandas/io/common.py

+5-1
Original file line numberDiff line numberDiff line change
@@ -217,6 +217,8 @@ def get_filepath_or_buffer(filepath_or_buffer, encoding=None,
217217
content_encoding = req.headers.get('Content-Encoding', None)
218218
if content_encoding == 'gzip':
219219
compression = 'gzip'
220+
else:
221+
compression = None
220222
# cat on the compression to the tuple returned by the function
221223
to_return = list(maybe_read_encoded_stream(req, encoding, compression)) + \
222224
[compression]
@@ -237,7 +239,9 @@ def get_filepath_or_buffer(filepath_or_buffer, encoding=None,
237239
conn = boto.connect_s3(anon=True)
238240

239241
b = conn.get_bucket(parsed_url.netloc, validate=False)
240-
if compat.PY2 and compression == 'gzip':
242+
if compat.PY2 and (compression == 'gzip' or
243+
(compression == 'infer' and
244+
filepath_or_buffer.endswith(".gz"))):
241245
k = boto.s3.key.Key(b, parsed_url.path)
242246
filepath_or_buffer = BytesIO(k.get_contents_as_string(
243247
encoding=encoding))

pandas/io/parsers.py

+17-13
Original file line numberDiff line numberDiff line change
@@ -235,10 +235,25 @@ def _read(filepath_or_buffer, kwds):
235235
if skipfooter is not None:
236236
kwds['skip_footer'] = skipfooter
237237

238+
# If the input could be a filename, check for a recognizable compression extension.
239+
# If we're reading from a URL, the `get_filepath_or_buffer` will use header info
240+
# to determine compression, so use what it finds in that case.
241+
inferred_compression = kwds.get('compression')
242+
if inferred_compression == 'infer':
243+
if isinstance(filepath_or_buffer, compat.string_types):
244+
if filepath_or_buffer.endswith('.gz'):
245+
inferred_compression = 'gzip'
246+
elif filepath_or_buffer.endswith('.bz2'):
247+
inferred_compression = 'bz2'
248+
else:
249+
inferred_compression = None
250+
else:
251+
inferred_compression = None
252+
238253
filepath_or_buffer, _, compression = get_filepath_or_buffer(filepath_or_buffer,
239254
encoding,
240255
compression=kwds.get('compression', None))
241-
kwds['compression'] = compression
256+
kwds['compression'] = inferred_compression if compression == 'infer' else compression
242257

243258
if kwds.get('date_parser', None) is not None:
244259
if isinstance(kwds['parse_dates'], bool):
@@ -301,7 +316,7 @@ def _read(filepath_or_buffer, kwds):
301316
'verbose': False,
302317
'encoding': None,
303318
'squeeze': False,
304-
'compression': 'infer',
319+
'compression': None,
305320
'mangle_dupe_cols': True,
306321
'tupleize_cols': False,
307322
'infer_datetime_format': False,
@@ -1402,17 +1417,6 @@ def __init__(self, f, **kwds):
14021417
self.comment = kwds['comment']
14031418
self._comment_lines = []
14041419

1405-
if self.compression == 'infer':
1406-
if isinstance(f, compat.string_types):
1407-
if f.endswith('.gz'):
1408-
self.compression = 'gzip'
1409-
elif f.endswith('.bz2'):
1410-
self.compression = 'bz2'
1411-
else:
1412-
self.compression = None
1413-
else:
1414-
self.compression = None
1415-
14161420
if isinstance(f, compat.string_types):
14171421
f = com._get_handle(f, 'r', encoding=self.encoding,
14181422
compression=self.compression)

pandas/io/tests/test_parsers.py

+9
Original file line numberDiff line numberDiff line change
@@ -4341,6 +4341,15 @@ def test_parse_public_s3_bucket_python(self):
43414341
self.assertFalse(df.empty)
43424342
tm.assert_frame_equal(pd.read_csv(tm.get_data_path('tips.csv')), df)
43434343

4344+
@tm.network
4345+
def test_infer_s3_compression(self):
4346+
for ext in ['', '.gz', '.bz2']:
4347+
df = pd.read_csv('s3://pandas-test/tips.csv' + ext,
4348+
engine='python', compression='infer')
4349+
self.assertTrue(isinstance(df, pd.DataFrame))
4350+
self.assertFalse(df.empty)
4351+
tm.assert_frame_equal(pd.read_csv(tm.get_data_path('tips.csv')), df)
4352+
43444353
@tm.network
43454354
def test_parse_public_s3_bucket_nrows_python(self):
43464355
for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]:

pandas/parser.pyx

-11
Original file line numberDiff line numberDiff line change
@@ -541,17 +541,6 @@ cdef class TextReader:
541541
self.parser.cb_io = NULL
542542
self.parser.cb_cleanup = NULL
543543

544-
if self.compression == 'infer':
545-
if isinstance(source, basestring):
546-
if source.endswith('.gz'):
547-
self.compression = 'gzip'
548-
elif source.endswith('.bz2'):
549-
self.compression = 'bz2'
550-
else:
551-
self.compression = None
552-
else:
553-
self.compression = None
554-
555544
if self.compression:
556545
if self.compression == 'gzip':
557546
import gzip

0 commit comments

Comments (0)