ENH: Add option in read_csv to infer compression type from filename

evanpw · evanpw · commit 7fe1c69e69d4 · 2015-04-09T23:13:35.000-04:00
diff --git a/doc/source/io.rst b/doc/source/io.rst
@@ -89,6 +89,8 @@ They can take a number of arguments:
   - ``delim_whitespace``: Parse whitespace-delimited (spaces or tabs) file
     (much faster than using a regular expression)
   - ``compression``: decompress ``'gzip'`` and ``'bz2'`` formats on the fly.
+    Set to  ``'infer'`` (the default) to guess a format based on the file
+    extension.
   - ``dialect``: string or :class:`python:csv.Dialect` instance to expose more
     ways to specify the file format
   - ``dtype``: A data type name or a dict of column name to data type. If not
diff --git a/doc/source/whatsnew/v0.16.1.txt b/doc/source/whatsnew/v0.16.1.txt
@@ -19,6 +19,7 @@ Enhancements
 
 - Added ``StringMethods.capitalize()`` and ``swapcase`` which behave as the same as standard ``str`` (:issue:`9766`)
 
+
 - ``DataFrame.mask()`` and ``Series.mask()`` now support same keywords as ``where`` (:issue:`8801`)
 
 
@@ -55,6 +56,8 @@ API changes
 - :meth:`~pandas.DataFrame.assign` now inserts new columns in alphabetical order. Previously
   the order was arbitrary. (:issue:`9777`)
 
+- By default, ``read_csv`` and ``read_table`` will now try to infer the compression type based on the file extension. Set ``compression=None`` to restore the previous behavior. (:issue:`9770`)
+
 
 .. _whatsnew_0161.performance:
 
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
@@ -55,8 +55,10 @@ class ParserWarning(Warning):
 dtype : Type name or dict of column -> type
     Data type for data or columns. E.g. {'a': np.float64, 'b': np.int32}
     (Unsupported with engine='python')
-compression : {'gzip', 'bz2', None}, default None
-    For on-the-fly decompression of on-disk data
+compression : {'gzip', 'bz2', 'infer', None}, default 'infer'
+    For on-the-fly decompression of on-disk data. If 'infer', then use gzip or
+    bz2 if filepath_or_buffer is a string ending in '.gz' or '.bz2',
+    respectively, and no decompression otherwise.
 dialect : string or csv.Dialect instance, default None
     If None defaults to Excel dialect. Ignored if sep longer than 1 char
     See csv.Dialect documentation for more details
@@ -294,7 +296,7 @@ def _read(filepath_or_buffer, kwds):
     'verbose': False,
     'encoding': None,
     'squeeze': False,
-    'compression': None,
+    'compression': 'infer',
     'mangle_dupe_cols': True,
     'tupleize_cols': False,
     'infer_datetime_format': False,
@@ -334,7 +336,7 @@ def _make_parser_function(name, sep=','):
     def parser_f(filepath_or_buffer,
                  sep=sep,
                  dialect=None,
-                 compression=None,
+                 compression='infer',
 
                  doublequote=True,
                  escapechar=None,
@@ -1316,6 +1318,7 @@ def _wrap_compressed(f, compression, encoding=None):
     """
     compression = compression.lower()
     encoding = encoding or get_option('display.encoding')
+
     if compression == 'gzip':
         import gzip
 
@@ -1388,6 +1391,17 @@ def __init__(self, f, **kwds):
         self.comment = kwds['comment']
         self._comment_lines = []
 
+        if self.compression == 'infer':
+            if isinstance(f, compat.string_types):
+                if f.endswith('.gz'):
+                    self.compression = 'gzip'
+                elif f.endswith('.bz2'):
+                    self.compression = 'bz2'
+                else:
+                    self.compression = None
+            else:
+                self.compression = None
+
         if isinstance(f, compat.string_types):
             f = com._get_handle(f, 'r', encoding=self.encoding,
                                 compression=self.compression)
diff --git a/pandas/io/tests/data/test1.csv.bz2 b/pandas/io/tests/data/test1.csv.bz2
diff --git a/pandas/io/tests/data/test1.csv.gz b/pandas/io/tests/data/test1.csv.gz
diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py
@@ -1098,6 +1098,20 @@ def test_read_csv_no_index_name(self):
         self.assertEqual(df.ix[:, ['A', 'B', 'C', 'D']].values.dtype, np.float64)
         tm.assert_frame_equal(df, df2)
 
+    def test_read_csv_infer_compression(self):
+        expected = self.read_csv(self.csv1, index_col=0, parse_dates=True)
+
+        inputs = [self.csv1, self.csv1 + '.gz',
+                  self.csv1 + '.bz2', open(self.csv1)]
+
+        for f in inputs:
+            df = self.read_csv(f, index_col=0, parse_dates=True,
+                compression='infer')
+
+            tm.assert_frame_equal(expected, df)
+
+        inputs[3].close()
+
     def test_read_table_unicode(self):
         fin = BytesIO(u('\u0141aski, Jan;1').encode('utf-8'))
         df1 = read_table(fin, sep=";", encoding="utf-8", header=None)
diff --git a/pandas/parser.pyx b/pandas/parser.pyx
@@ -541,6 +541,17 @@ cdef class TextReader:
         self.parser.cb_io = NULL
         self.parser.cb_cleanup = NULL
 
+        if self.compression == 'infer':
+            if isinstance(source, basestring):
+                if source.endswith('.gz'):
+                    self.compression = 'gzip'
+                elif source.endswith('.bz2'):
+                    self.compression = 'bz2'
+                else:
+                    self.compression = None
+            else:
+                self.compression = None
+
         if self.compression:
             if self.compression == 'gzip':
                 import gzip