ENH: Add option in read_csv to infer compression type from filename

evanpw · evanpw · commit e2359256fbbc · 2015-04-01T08:07:26.000-04:00
diff --git a/doc/source/io.rst b/doc/source/io.rst
@@ -89,6 +89,7 @@ They can take a number of arguments:
   - ``delim_whitespace``: Parse whitespace-delimited (spaces or tabs) file
     (much faster than using a regular expression)
   - ``compression``: decompress ``'gzip'`` and ``'bz2'`` formats on the fly.
+    Set to ``'infer'`` to guess a format based on the file extension.
   - ``dialect``: string or :class:`python:csv.Dialect` instance to expose more
     ways to specify the file format
   - ``dtype``: A data type name or a dict of column name to data type. If not
diff --git a/doc/source/whatsnew/v0.16.1.txt b/doc/source/whatsnew/v0.16.1.txt
@@ -16,7 +16,7 @@ We recommend that all users upgrade to this version.
 
 Enhancements
 ~~~~~~~~~~~~
-
+- Setting the ``compression`` argument of ``read_csv`` or ``read_table`` to ``'infer'`` will now guess the compression type based on the file extension.
 
 
 
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
@@ -55,8 +55,10 @@ class ParserWarning(Warning):
 dtype : Type name or dict of column -> type
     Data type for data or columns. E.g. {'a': np.float64, 'b': np.int32}
     (Unsupported with engine='python')
-compression : {'gzip', 'bz2', None}, default None
-    For on-the-fly decompression of on-disk data
+compression : {'gzip', 'bz2', 'infer', None}, default None
+    For on-the-fly decompression of on-disk data. If 'infer', then use gzip or
+    bz2 if filepath_or_buffer is a string ending in '.gz' or '.bz2',
+    respectively, and None otherwise.
 dialect : string or csv.Dialect instance, default None
     If None defaults to Excel dialect. Ignored if sep longer than 1 char
     See csv.Dialect documentation for more details
@@ -1314,6 +1316,7 @@ def _wrap_compressed(f, compression, encoding=None):
     """
     compression = compression.lower()
     encoding = encoding or get_option('display.encoding')
+
     if compression == 'gzip':
         import gzip
 
@@ -1386,6 +1389,17 @@ def __init__(self, f, **kwds):
         self.comment = kwds['comment']
         self._comment_lines = []
 
+        if self.compression == 'infer':
+            if isinstance(f, compat.string_types):
+                if f.endswith('.gz'):
+                    self.compression = 'gzip'
+                elif f.endswith('.bz2'):
+                    self.compression = 'bz2'
+                else:
+                    self.compression = None
+            else:
+                self.compression = None
+
         if isinstance(f, compat.string_types):
             f = com._get_handle(f, 'r', encoding=self.encoding,
                                 compression=self.compression)
diff --git a/pandas/io/tests/data/test1.csv.bz2 b/pandas/io/tests/data/test1.csv.bz2
diff --git a/pandas/io/tests/data/test1.csv.gz b/pandas/io/tests/data/test1.csv.gz
diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py
@@ -1071,6 +1071,21 @@ def test_read_csv_no_index_name(self):
         self.assertEqual(df.ix[:, ['A', 'B', 'C', 'D']].values.dtype, np.float64)
         tm.assert_frame_equal(df, df2)
 
+    def test_read_csv_infer_compression(self):
+        inputs = [self.csv1, self.csv1 + '.gz',
+                  self.csv1 + '.bz2', open(self.csv1)]
+
+        for f in inputs:
+            df = self.read_csv(f, index_col=0, parse_dates=True,
+                compression='infer')
+
+            self.assert_numpy_array_equal(df.columns, ['A', 'B', 'C', 'D'])
+            self.assertEqual(df.index.name, 'index')
+            self.assertIsInstance(df.index[0], (datetime, np.datetime64, Timestamp))
+            self.assertEqual(df.values.dtype, np.float64)
+
+        inputs[3].close()
+
     def test_read_table_unicode(self):
         fin = BytesIO(u('\u0141aski, Jan;1').encode('utf-8'))
         df1 = read_table(fin, sep=";", encoding="utf-8", header=None)
diff --git a/pandas/parser.pyx b/pandas/parser.pyx
@@ -541,6 +541,17 @@ cdef class TextReader:
         self.parser.cb_io = NULL
         self.parser.cb_cleanup = NULL
 
+        if self.compression == 'infer':
+            if isinstance(source, basestring):
+                if source.endswith('.gz'):
+                    self.compression = 'gzip'
+                elif source.endswith('.bz2'):
+                    self.compression = 'bz2'
+                else:
+                    self.compression = None
+            else:
+                self.compression = None
+
         if self.compression:
             if self.compression == 'gzip':
                 import gzip