diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 508093380ac81..55e5241587298 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -64,6 +64,27 @@ Strings passed to ``DataFrame.groupby()`` as the ``by`` parameter may now refere df.groupby(['second', 'A']).sum() +.. _whatsnew_0200.enhancements.compressed_urls: + +Better support for compressed URLs in ``read_csv`` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The compression code was refactored (:issue:`12688`). As a result, reading +dataframes from URLs in :func:`read_csv` or :func:`read_table` now supports +additional compression methods: ``xz``, ``bz2``, and ``zip`` (:issue:`14570`). +Previously, only ``gzip`` compression was supported. By default, compression of +URLs and paths is now inferred using their file extensions. Additionally, +support for bz2 compression in the Python 2 C-engine was improved (:issue:`14874`). + +.. ipython:: python + url = 'https://github.com/{repo}/raw/{branch}/{path}'.format( + repo = 'pandas-dev/pandas', + branch = 'master', + path = 'pandas/io/tests/parser/data/salaries.csv.bz2', + ) + df = pd.read_table(url, compression='infer') # default, infer compression + df = pd.read_table(url, compression='bz2') # explicitly specify compression + df.head(2) .. _whatsnew_0200.enhancements.other: diff --git a/pandas/io/common.py b/pandas/io/common.py index c115fab217fba..fa1022b882124 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -187,8 +187,8 @@ def _stringify_path(filepath_or_buffer): def get_filepath_or_buffer(filepath_or_buffer, encoding=None, compression=None): """ - If the filepath_or_buffer is a url, translate and return the buffer - passthru otherwise. + If the filepath_or_buffer is a url, translate and return the buffer. + Otherwise, pass through. 
Parameters ---------- @@ -247,23 +247,26 @@ def file_path_to_url(path): def _infer_compression(filepath_or_buffer, compression): """ - Get file handle for given path/buffer and mode. + Get the compression method for filepath_or_buffer. If compression='infer', + the inferred compression method is returned. Otherwise, the input + compression method is returned unchanged, unless it's invalid, in which + case an error is raised. Parameters ---------- filepath_or_buf : a path (str) or buffer - compression : str, or None + compression : str or None + the compression method including None for no compression and 'infer' Returns ------- - string compression method, None + string or None : + compression method Raises ------ ValueError on invalid compression specified - - If compression='infer', infer compression. If compression """ # No compression has been explicitly specified diff --git a/pandas/io/tests/parser/compression.py b/pandas/io/tests/parser/compression.py index 3b0c571032fe6..e95617faf2071 100644 --- a/pandas/io/tests/parser/compression.py +++ b/pandas/io/tests/parser/compression.py @@ -8,7 +8,6 @@ import nose import pandas.util.testing as tm -from pandas import compat class CompressionTests(object): @@ -114,12 +113,8 @@ def test_bz2(self): path, compression='bz3') with open(path, 'rb') as fin: - if compat.PY3: - result = self.read_csv(fin, compression='bz2') - tm.assert_frame_equal(result, expected) - elif self.engine is not 'python': - self.assertRaises(ValueError, self.read_csv, - fin, compression='bz2') + result = self.read_csv(fin, compression='bz2') + tm.assert_frame_equal(result, expected) with tm.ensure_clean('test.bz2') as path: tmp = bz2.BZ2File(path, mode='wb') diff --git a/pandas/io/tests/parser/test_network.py b/pandas/io/tests/parser/test_network.py index fd7a1babe4e01..4258749b8d897 100644 --- a/pandas/io/tests/parser/test_network.py +++ b/pandas/io/tests/parser/test_network.py @@ -12,7 +12,6 @@ import pandas.util.testing as tm from pandas import 
DataFrame -from pandas import compat from pandas.io.parsers import read_csv, read_table @@ -39,7 +38,7 @@ def test_compressed_urls(self): for compression, extension in self.compression_to_extension.items(): url = self.base_url + extension # args is a (compression, engine) tuple - for args in product([compression, 'infer'], ['python']): + for args in product([compression, 'infer'], ['python', 'c']): # test_fxn is a workaround for more descriptive nose reporting. # See http://stackoverflow.com/a/37393684/4651668. test_fxn = functools.partial(self.check_table) @@ -64,18 +63,12 @@ def setUp(self): @tm.network def test_parse_public_s3_bucket(self): for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]: - if comp == 'bz2' and compat.PY2: - # The Python 2 C parser can't read bz2 from S3. - self.assertRaises(ValueError, read_csv, - 's3://pandas-test/tips.csv' + ext, - compression=comp) - else: - df = read_csv('s3://pandas-test/tips.csv' + - ext, compression=comp) - self.assertTrue(isinstance(df, DataFrame)) - self.assertFalse(df.empty) - tm.assert_frame_equal(read_csv( - tm.get_data_path('tips.csv')), df) + df = read_csv('s3://pandas-test/tips.csv' + + ext, compression=comp) + self.assertTrue(isinstance(df, DataFrame)) + self.assertFalse(df.empty) + tm.assert_frame_equal(read_csv( + tm.get_data_path('tips.csv')), df) # Read public file from bucket with not-public contents df = read_csv('s3://cant_get_it/tips.csv') @@ -104,18 +97,12 @@ def test_parse_public_s3a_bucket(self): @tm.network def test_parse_public_s3_bucket_nrows(self): for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]: - if comp == 'bz2' and compat.PY2: - # The Python 2 C parser can't read bz2 from S3. 
- self.assertRaises(ValueError, read_csv, - 's3://pandas-test/tips.csv' + ext, - compression=comp) - else: - df = read_csv('s3://pandas-test/tips.csv' + - ext, nrows=10, compression=comp) - self.assertTrue(isinstance(df, DataFrame)) - self.assertFalse(df.empty) - tm.assert_frame_equal(read_csv( - tm.get_data_path('tips.csv')).iloc[:10], df) + df = read_csv('s3://pandas-test/tips.csv' + + ext, nrows=10, compression=comp) + self.assertTrue(isinstance(df, DataFrame)) + self.assertFalse(df.empty) + tm.assert_frame_equal(read_csv( + tm.get_data_path('tips.csv')).iloc[:10], df) @tm.network def test_parse_public_s3_bucket_chunked(self): @@ -123,24 +110,18 @@ def test_parse_public_s3_bucket_chunked(self): chunksize = 5 local_tips = read_csv(tm.get_data_path('tips.csv')) for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]: - if comp == 'bz2' and compat.PY2: - # The Python 2 C parser can't read bz2 from S3. - self.assertRaises(ValueError, read_csv, - 's3://pandas-test/tips.csv' + ext, - compression=comp) - else: - df_reader = read_csv('s3://pandas-test/tips.csv' + ext, - chunksize=chunksize, compression=comp) - self.assertEqual(df_reader.chunksize, chunksize) - for i_chunk in [0, 1, 2]: - # Read a couple of chunks and make sure we see them - # properly. - df = df_reader.get_chunk() - self.assertTrue(isinstance(df, DataFrame)) - self.assertFalse(df.empty) - true_df = local_tips.iloc[ - chunksize * i_chunk: chunksize * (i_chunk + 1)] - tm.assert_frame_equal(true_df, df) + df_reader = read_csv('s3://pandas-test/tips.csv' + ext, + chunksize=chunksize, compression=comp) + self.assertEqual(df_reader.chunksize, chunksize) + for i_chunk in [0, 1, 2]: + # Read a couple of chunks and make sure we see them + # properly. 
+ df = df_reader.get_chunk() + self.assertTrue(isinstance(df, DataFrame)) + self.assertFalse(df.empty) + true_df = local_tips.iloc[ + chunksize * i_chunk: chunksize * (i_chunk + 1)] + tm.assert_frame_equal(true_df, df) @tm.network def test_parse_public_s3_bucket_chunked_python(self): diff --git a/pandas/parser.pyx b/pandas/parser.pyx index d94a4ef278dee..c76620cdc647d 100644 --- a/pandas/parser.pyx +++ b/pandas/parser.pyx @@ -621,8 +621,9 @@ cdef class TextReader: if isinstance(source, basestring) or PY3: source = bz2.BZ2File(source, 'rb') else: - raise ValueError('Python 2 cannot read bz2 from open file ' - 'handle') + content = source.read() + source.close() + source = compat.StringIO(bz2.decompress(content)) elif self.compression == 'zip': import zipfile zip_file = zipfile.ZipFile(source)