
Commit 064c1b3

Move compression inference to io/parsers
1 parent 139bf82 · commit 064c1b3

2 files changed: +34 -38 lines changed


pandas/io/common.py (+4 -14)
@@ -63,13 +63,6 @@ def urlopen(*args, **kwargs):
 _VALID_URLS = set(uses_relative + uses_netloc + uses_params)
 _VALID_URLS.discard('')
 
-_compression_to_extension = {
-    'gzip': '.gz',
-    'bz2': '.bz2',
-    'zip': '.zip',
-    'xz': '.xz',
-}
-
 
 class CParserError(ValueError):
     """
@@ -243,13 +236,10 @@ def get_filepath_or_buffer(filepath_or_buffer, encoding=None,
     if _is_url(filepath_or_buffer):
         url = str(filepath_or_buffer)
         req = _urlopen(url)
-        if compression == 'infer':
-            for compression, extension in _compression_to_extension.items():
-                if url.endswith(extension):
-                    break
-            else:
-                content_encoding = req.headers.get('Content-Encoding', None)
-                compression = 'gzip' if content_encoding == 'gzip' else None
+        content_encoding = req.headers.get('Content-Encoding', None)
+        if content_encoding == 'gzip':
+            # Override compression based on Content-Encoding header
+            compression = 'gzip'
         reader, encoding = maybe_read_encoded_stream(req, encoding, compression)
         return reader, encoding, compression
 
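
For context, a small standalone sketch (not part of the commit) of the behaviour the simplified URL branch now has: whatever compression value comes in, including one inferred from the file extension, is only overridden when the server responds with a gzip Content-Encoding header. The FakeResponse class and the example calls are hypothetical, for illustration only.

# Illustrative sketch only, not part of the commit. "FakeResponse" is a
# hypothetical stand-in for the object returned by _urlopen(url).
class FakeResponse(object):
    def __init__(self, headers):
        self.headers = headers


def resolve_url_compression(req, compression):
    # Mirrors the new branch in get_filepath_or_buffer: only a gzip
    # Content-Encoding response header overrides the incoming value.
    content_encoding = req.headers.get('Content-Encoding', None)
    if content_encoding == 'gzip':
        compression = 'gzip'
    return compression


print(resolve_url_compression(FakeResponse({'Content-Encoding': 'gzip'}), None))  # gzip
print(resolve_url_compression(FakeResponse({}), 'bz2'))                           # bz2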

pandas/io/parsers.py (+30 -24)
@@ -342,38 +342,44 @@ def _validate_nrows(nrows):
     return nrows
 
 
+_compression_to_extension = {
+    'gzip': '.gz',
+    'bz2': '.bz2',
+    'zip': '.zip',
+    'xz': '.xz',
+}
+
+def _infer_compression(filepath_or_buffer):
+    """
+    Infer compression of a filepath or buffer. In case of buffer, compression
+    is None. Otherwise, inference is perfomed using the extension of the
+    filename or URL.
+    """
+    if not isinstance(filepath_or_buffer, compat.string_types):
+        return None
+    filepath = str(filepath_or_buffer)
+    for compression, extension in _compression_to_extension.items():
+        if filepath.endswith(extension):
+            return compression
+    return None
+
 def _read(filepath_or_buffer, kwds):
-    "Generic reader of line files."
+    """Generic reader of line files."""
     encoding = kwds.get('encoding', None)
     if encoding is not None:
         encoding = re.sub('_', '-', encoding).lower()
         kwds['encoding'] = encoding
 
-    # If the input could be a filename, check for a recognizable compression
-    # extension. If we're reading from a URL, the `get_filepath_or_buffer`
-    # will use header info to determine compression, so use what it finds in
-    # that case.
-    inferred_compression = kwds.get('compression')
-    if inferred_compression == 'infer':
-        if isinstance(filepath_or_buffer, compat.string_types):
-            if filepath_or_buffer.endswith('.gz'):
-                inferred_compression = 'gzip'
-            elif filepath_or_buffer.endswith('.bz2'):
-                inferred_compression = 'bz2'
-            elif filepath_or_buffer.endswith('.zip'):
-                inferred_compression = 'zip'
-            elif filepath_or_buffer.endswith('.xz'):
-                inferred_compression = 'xz'
-            else:
-                inferred_compression = None
-        else:
-            inferred_compression = None
+    compression = kwds.get('compression')
+    if compression not in set(_compression_to_extension) | {None, 'infer'}:
+        raise ValueError('"{}" is not a valid compression'.format(compression))
+
+    if compression == 'infer':
+        compression = _infer_compression(filepath_or_buffer)
 
     filepath_or_buffer, _, compression = get_filepath_or_buffer(
-        filepath_or_buffer, encoding,
-        compression=kwds.get('compression', None))
-    kwds['compression'] = (inferred_compression if compression == 'infer'
-                           else compression)
+        filepath_or_buffer, encoding, compression)
+    kwds['compression'] = compression
 
     if kwds.get('date_parser', None) is not None:
         if isinstance(kwds['parse_dates'], bool):
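
To make the moved logic concrete, here is a small self-contained sketch of the extension-based inference that _infer_compression now performs, written with plain str instead of compat.string_types; the helper name _extension_map and the file names are made up for illustration.

# Standalone sketch of the inference added to pandas/io/parsers.py.
# Uses plain `str` instead of compat.string_types for brevity.
_extension_map = {
    'gzip': '.gz',
    'bz2': '.bz2',
    'zip': '.zip',
    'xz': '.xz',
}


def infer_compression(filepath_or_buffer):
    # Buffers (file handles, StringIO, ...) carry no usable name.
    if not isinstance(filepath_or_buffer, str):
        return None
    for compression, extension in _extension_map.items():
        if filepath_or_buffer.endswith(extension):
            return compression
    return None


print(infer_compression('data.csv.gz'))                 # gzip
print(infer_compression('https://host/path/data.bz2'))  # bz2
print(infer_compression('data.csv'))                    # None

With inference factored out this way, _read only has to validate the compression keyword (raising ValueError for anything outside the known set, None, or 'infer') and pass the resolved value straight through to get_filepath_or_buffer.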
