
Commit e80a2b9

dhimmel authored and jreback committed

DOC for refactored compression (GH14576) + BUG: bz2-compressed URL with C engine (GH14874)

Follow up on #14576, which refactored compression code to expand URL support. Fixes up
some small remaining issues and adds a what's new entry.

- [x] Closes #14874

Author: Daniel Himmelstein <[email protected]>

Closes #14880 from dhimmel/whats-new and squashes the following commits:

e1b5d42 [Daniel Himmelstein] Address what's new review comments
8568aed [Daniel Himmelstein] TST: Read bz2 files from S3 in PY2
09dcbff [Daniel Himmelstein] DOC: Improve what's new
c4ea3d3 [Daniel Himmelstein] STY: PEP8 fixes
f8a7900 [Daniel Himmelstein] TST: check bz2 compression in PY2 c engine
0e0fa0a [Daniel Himmelstein] DOC: Reword get_filepath_or_buffer docstring
210fb20 [Daniel Himmelstein] DOC: What's New for refactored compression code
cb91007 [Daniel Himmelstein] TST: Read compressed URLs with c engine
85630ea [Daniel Himmelstein] ENH: Support bz2 compression in PY2 for c engine
a7960f6 [Daniel Himmelstein] DOC: Improve _infer_compression docstring

1 parent f5c8d54 commit e80a2b9

File tree

5 files changed: +61 -60 lines changed


doc/source/whatsnew/v0.20.0.txt (+21 lines)

@@ -64,6 +64,27 @@ Strings passed to ``DataFrame.groupby()`` as the ``by`` parameter may now refere
 
    df.groupby(['second', 'A']).sum()
 
+.. _whatsnew_0200.enhancements.compressed_urls:
+
+Better support for compressed URLs in ``read_csv``
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The compression code was refactored (:issue:`12688`). As a result, reading
+dataframes from URLs in :func:`read_csv` or :func:`read_table` now supports
+additional compression methods: ``xz``, ``bz2``, and ``zip`` (:issue:`14570`).
+Previously, only ``gzip`` compression was supported. By default, compression of
+both URLs and paths is now inferred from their file extensions. Additionally,
+support for bz2 compression in the Python 2 C engine has improved (:issue:`14874`).
+
+.. ipython:: python
+   url = 'https://github.com/{repo}/raw/{branch}/{path}'.format(
+       repo='pandas-dev/pandas',
+       branch='master',
+       path='pandas/io/tests/parser/data/salaries.csv.bz2',
+   )
+   df = pd.read_table(url, compression='infer')  # default, infer compression
+   df = pd.read_table(url, compression='bz2')    # explicitly specify compression
+   df.head(2)
 
 .. _whatsnew_0200.enhancements.other:
 
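The same extension-based inference described in this entry applies to local paths as well. The snippet below is an illustrative sketch, not part of the commit; the file name is hypothetical:

    import pandas as pd

    # 'data.csv.xz' is a hypothetical local file: with the default
    # compression='infer', the '.xz' extension selects xz decompression.
    df = pd.read_csv('data.csv.xz')
    df = pd.read_csv('data.csv.xz', compression='xz')  # equivalent, explicit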

pandas/io/common.py (+10 -7 lines)

@@ -187,8 +187,8 @@ def _stringify_path(filepath_or_buffer):
 def get_filepath_or_buffer(filepath_or_buffer, encoding=None,
                            compression=None):
     """
-    If the filepath_or_buffer is a url, translate and return the buffer
-    passthru otherwise.
+    If the filepath_or_buffer is a url, translate and return the buffer.
+    Otherwise passthrough.
 
     Parameters
     ----------
@@ -247,23 +247,26 @@ def file_path_to_url(path):
 
 def _infer_compression(filepath_or_buffer, compression):
     """
-    Get file handle for given path/buffer and mode.
+    Get the compression method for filepath_or_buffer. If compression='infer',
+    the inferred compression method is returned. Otherwise, the input
+    compression method is returned unchanged, unless it's invalid, in which
+    case an error is raised.
 
     Parameters
     ----------
     filepath_or_buf :
         a path (str) or buffer
-    compression : str, or None
+    compression : str or None
+        the compression method including None for no compression and 'infer'
 
     Returns
     -------
-    string compression method, None
+    string or None :
+        compression method
 
     Raises
     ------
     ValueError on invalid compression specified
-
-    If compression='infer', infer compression. If compression
     """
 
     # No compression has been explicitly specified
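The reworded ``_infer_compression`` docstring describes the behaviour: return the inferred method when ``compression='infer'``, pass a valid method through unchanged, and raise otherwise. Below is a minimal sketch of that logic, not pandas' internal implementation; the helper name and extension map are illustrative:

    import os

    # Illustrative extension map; pandas maintains its own mapping internally.
    _EXT_TO_COMPRESSION = {'.gz': 'gzip', '.bz2': 'bz2', '.zip': 'zip', '.xz': 'xz'}
    _VALID_COMPRESSION = {None, 'gzip', 'bz2', 'zip', 'xz'}

    def infer_compression_sketch(filepath_or_buffer, compression):
        """Return the compression method, inferring it from the file extension
        when compression='infer'; raise on an unrecognized method."""
        if compression == 'infer':
            if not isinstance(filepath_or_buffer, str):
                return None  # a buffer carries no extension to infer from
            ext = os.path.splitext(filepath_or_buffer)[1].lower()
            return _EXT_TO_COMPRESSION.get(ext)  # None for unknown extensions
        if compression not in _VALID_COMPRESSION:
            raise ValueError('Unrecognized compression type: %s' % compression)
        return compression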

pandas/io/tests/parser/compression.py (+2 -7 lines)

@@ -8,7 +8,6 @@
 import nose
 
 import pandas.util.testing as tm
-from pandas import compat
 
 
 class CompressionTests(object):
@@ -114,12 +113,8 @@ def test_bz2(self):
                           path, compression='bz3')
 
         with open(path, 'rb') as fin:
-            if compat.PY3:
-                result = self.read_csv(fin, compression='bz2')
-                tm.assert_frame_equal(result, expected)
-            elif self.engine is not 'python':
-                self.assertRaises(ValueError, self.read_csv,
-                                  fin, compression='bz2')
+            result = self.read_csv(fin, compression='bz2')
+            tm.assert_frame_equal(result, expected)
 
         with tm.ensure_clean('test.bz2') as path:
             tmp = bz2.BZ2File(path, mode='wb')
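With the Python 2 special-casing removed, the simplified ``test_bz2`` expects the same behaviour everywhere: reading bz2 data through an already-open binary handle works for both engines. A usage sketch under that assumption (the file name is hypothetical):

    import pandas as pd

    # Reading from an open binary handle with explicit bz2 compression is now
    # expected to succeed on both the python and c engines.
    with open('data.csv.bz2', 'rb') as fin:
        df = pd.read_csv(fin, compression='bz2', engine='c')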

pandas/io/tests/parser/test_network.py (+25 -44 lines)

@@ -12,7 +12,6 @@
 
 import pandas.util.testing as tm
 from pandas import DataFrame
-from pandas import compat
 from pandas.io.parsers import read_csv, read_table
 
 
@@ -39,7 +38,7 @@ def test_compressed_urls(self):
         for compression, extension in self.compression_to_extension.items():
             url = self.base_url + extension
             # args is a (compression, engine) tuple
-            for args in product([compression, 'infer'], ['python']):
+            for args in product([compression, 'infer'], ['python', 'c']):
                 # test_fxn is a workaround for more descriptive nose reporting.
                 # See http://stackoverflow.com/a/37393684/4651668.
                 test_fxn = functools.partial(self.check_table)
@@ -64,18 +63,12 @@ def setUp(self):
     @tm.network
     def test_parse_public_s3_bucket(self):
         for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]:
-            if comp == 'bz2' and compat.PY2:
-                # The Python 2 C parser can't read bz2 from S3.
-                self.assertRaises(ValueError, read_csv,
-                                  's3://pandas-test/tips.csv' + ext,
-                                  compression=comp)
-            else:
-                df = read_csv('s3://pandas-test/tips.csv' +
-                              ext, compression=comp)
-                self.assertTrue(isinstance(df, DataFrame))
-                self.assertFalse(df.empty)
-                tm.assert_frame_equal(read_csv(
-                    tm.get_data_path('tips.csv')), df)
+            df = read_csv('s3://pandas-test/tips.csv' +
+                          ext, compression=comp)
+            self.assertTrue(isinstance(df, DataFrame))
+            self.assertFalse(df.empty)
+            tm.assert_frame_equal(read_csv(
+                tm.get_data_path('tips.csv')), df)
 
         # Read public file from bucket with not-public contents
         df = read_csv('s3://cant_get_it/tips.csv')
@@ -104,43 +97,31 @@ def test_parse_public_s3a_bucket(self):
     @tm.network
     def test_parse_public_s3_bucket_nrows(self):
         for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]:
-            if comp == 'bz2' and compat.PY2:
-                # The Python 2 C parser can't read bz2 from S3.
-                self.assertRaises(ValueError, read_csv,
-                                  's3://pandas-test/tips.csv' + ext,
-                                  compression=comp)
-            else:
-                df = read_csv('s3://pandas-test/tips.csv' +
-                              ext, nrows=10, compression=comp)
-                self.assertTrue(isinstance(df, DataFrame))
-                self.assertFalse(df.empty)
-                tm.assert_frame_equal(read_csv(
-                    tm.get_data_path('tips.csv')).iloc[:10], df)
+            df = read_csv('s3://pandas-test/tips.csv' +
+                          ext, nrows=10, compression=comp)
+            self.assertTrue(isinstance(df, DataFrame))
+            self.assertFalse(df.empty)
+            tm.assert_frame_equal(read_csv(
+                tm.get_data_path('tips.csv')).iloc[:10], df)
 
     @tm.network
     def test_parse_public_s3_bucket_chunked(self):
         # Read with a chunksize
         chunksize = 5
         local_tips = read_csv(tm.get_data_path('tips.csv'))
         for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]:
-            if comp == 'bz2' and compat.PY2:
-                # The Python 2 C parser can't read bz2 from S3.
-                self.assertRaises(ValueError, read_csv,
-                                  's3://pandas-test/tips.csv' + ext,
-                                  compression=comp)
-            else:
-                df_reader = read_csv('s3://pandas-test/tips.csv' + ext,
-                                     chunksize=chunksize, compression=comp)
-                self.assertEqual(df_reader.chunksize, chunksize)
-                for i_chunk in [0, 1, 2]:
-                    # Read a couple of chunks and make sure we see them
-                    # properly.
-                    df = df_reader.get_chunk()
-                    self.assertTrue(isinstance(df, DataFrame))
-                    self.assertFalse(df.empty)
-                    true_df = local_tips.iloc[
-                        chunksize * i_chunk: chunksize * (i_chunk + 1)]
-                    tm.assert_frame_equal(true_df, df)
+            df_reader = read_csv('s3://pandas-test/tips.csv' + ext,
+                                 chunksize=chunksize, compression=comp)
+            self.assertEqual(df_reader.chunksize, chunksize)
+            for i_chunk in [0, 1, 2]:
+                # Read a couple of chunks and make sure we see them
+                # properly.
+                df = df_reader.get_chunk()
+                self.assertTrue(isinstance(df, DataFrame))
+                self.assertFalse(df.empty)
+                true_df = local_tips.iloc[
+                    chunksize * i_chunk: chunksize * (i_chunk + 1)]
+                tm.assert_frame_equal(true_df, df)
 
     @tm.network
     def test_parse_public_s3_bucket_chunked_python(self):
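``test_compressed_urls`` now crosses every compression option with both parser engines. The sketch below reproduces that matrix outside the test harness; the URL and extension map are placeholders, not the test's fixtures:

    from itertools import product

    import pandas as pd

    # Placeholder URL and extension map for illustration only.
    base_url = 'https://example.com/data.csv'
    extension = {'gzip': '.gz', 'bz2': '.bz2', 'xz': '.xz', 'zip': '.zip'}

    for compression, engine in product(extension, ['python', 'c']):
        df = pd.read_csv(base_url + extension[compression],
                         compression=compression, engine=engine)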

pandas/parser.pyx (+3 -2 lines)

@@ -621,8 +621,9 @@ cdef class TextReader:
                 if isinstance(source, basestring) or PY3:
                     source = bz2.BZ2File(source, 'rb')
                 else:
-                    raise ValueError('Python 2 cannot read bz2 from open file '
-                                     'handle')
+                    content = source.read()
+                    source.close()
+                    source = compat.StringIO(bz2.decompress(content))
             elif self.compression == 'zip':
                 import zipfile
                 zip_file = zipfile.ZipFile(source)
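Instead of raising, the Python 2 branch of the C reader now drains the open handle and decompresses the bytes in memory before parsing. A Python-level sketch of that fallback, with ``io.BytesIO`` standing in for the ``compat.StringIO`` used in the Cython source:

    import bz2
    import io

    def decompress_bz2_handle(handle):
        """Read an already-open bz2 file handle, decompress it in memory and
        return a plain buffer for the parser (sketch of the fallback above)."""
        content = handle.read()   # raw bz2-compressed bytes
        handle.close()
        return io.BytesIO(bz2.decompress(content))

For large files this trades memory for compatibility, which fits the change's goal of simply making bz2-from-handle work on Python 2.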
