From a7960f6baefcda0dca65b49b9b4565ca06b9e6e3 Mon Sep 17 00:00:00 2001 From: Daniel Himmelstein Date: Wed, 14 Dec 2016 10:11:52 -0500 Subject: [PATCH 01/10] DOC: Improve _infer_compression docstring https://github.com/pandas-dev/pandas/commit/4a5aec40e8b2d6789f946e3e5b5b07ba5e753eb6#commitcomment-20178761 --- pandas/io/common.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/pandas/io/common.py b/pandas/io/common.py index c115fab217fba..9c746d8ce9b68 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -247,23 +247,26 @@ def file_path_to_url(path): def _infer_compression(filepath_or_buffer, compression): """ - Get file handle for given path/buffer and mode. + Get the compression method for filepath_or_buffer. If compression='infer', + the inferred compression method is returned. Otherwise, the input + compression method is returned unchanged, unless it's invalid, in which case + an error is raised. Parameters ---------- filepath_or_buf : a path (str) or buffer - compression : str, or None + compression : str or None + the compression method including None for no compression and 'infer' Returns ------- - string compression method, None + string or None : + compression method Raises ------ ValueError on invalid compression specified - - If compression='infer', infer compression. 
If compression """ # No compression has been explicitly specified From 85630ea7e2cc356b161335238ec6be068f61dd12 Mon Sep 17 00:00:00 2001 From: Daniel Himmelstein Date: Wed, 14 Dec 2016 11:21:31 -0500 Subject: [PATCH 02/10] ENH: Support bz2 compression in PY2 for c engine Closes https://github.com/pandas-dev/pandas/issues/14874 --- pandas/parser.pyx | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/parser.pyx b/pandas/parser.pyx index d94a4ef278dee..c76620cdc647d 100644 --- a/pandas/parser.pyx +++ b/pandas/parser.pyx @@ -621,8 +621,9 @@ cdef class TextReader: if isinstance(source, basestring) or PY3: source = bz2.BZ2File(source, 'rb') else: - raise ValueError('Python 2 cannot read bz2 from open file ' - 'handle') + content = source.read() + source.close() + source = compat.StringIO(bz2.decompress(content)) elif self.compression == 'zip': import zipfile zip_file = zipfile.ZipFile(source) From cb91007100a5729e22a3585aa90b5b2a0de1ae65 Mon Sep 17 00:00:00 2001 From: Daniel Himmelstein Date: Wed, 14 Dec 2016 11:24:09 -0500 Subject: [PATCH 03/10] TST: Read compressed URLs with c engine --- pandas/io/tests/parser/test_network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/tests/parser/test_network.py b/pandas/io/tests/parser/test_network.py index fd7a1babe4e01..7d42f83c48e21 100644 --- a/pandas/io/tests/parser/test_network.py +++ b/pandas/io/tests/parser/test_network.py @@ -39,7 +39,7 @@ def test_compressed_urls(self): for compression, extension in self.compression_to_extension.items(): url = self.base_url + extension # args is a (compression, engine) tuple - for args in product([compression, 'infer'], ['python']): + for args in product([compression, 'infer'], ['python', 'c']): # test_fxn is a workaround for more descriptive nose reporting. # See http://stackoverflow.com/a/37393684/4651668. 
test_fxn = functools.partial(self.check_table) From 210fb20176b4d88b8a53e2852531e247338cda9e Mon Sep 17 00:00:00 2001 From: Daniel Himmelstein Date: Wed, 14 Dec 2016 11:41:16 -0500 Subject: [PATCH 04/10] DOC: What's New for refactored compression code Add what's new corresponding to https://github.com/pandas-dev/pandas/pull/14576. --- doc/source/whatsnew/v0.20.0.txt | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 508093380ac81..bb0fa111b7ab0 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -64,6 +64,18 @@ Strings passed to ``DataFrame.groupby()`` as the ``by`` parameter may now refere df.groupby(['second', 'A']).sum() +Reading dataframes from URLs, in :func:`read_csv` or :func:`read_table`, now +supports additional compression methods (`xz`, `bz2`, `zip`). Previously, only +`gzip` compression was supported. By default, compression of URLs and paths are +now both inferred using their file extensions. + +.. ipython:: python + + url = ('https://github.com/pandas-dev/pandas/raw/master/' + + 'pandas/io/tests/parser/data/salaries.csv.bz2') + df = pd.read_table(url, compression='infer') # default, infer compression + df = pd.read_table(url, compression='bz2') # explicitly specify compression + df.head(2) .. 
_whatsnew_0200.enhancements.other: From 0e0fa0acf2e064ed23d50dcdf8b6e8a640c6388f Mon Sep 17 00:00:00 2001 From: Daniel Himmelstein Date: Wed, 14 Dec 2016 13:16:34 -0500 Subject: [PATCH 05/10] DOC: Reword get_filepath_or_buffer docstring --- pandas/io/common.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/io/common.py b/pandas/io/common.py index 9c746d8ce9b68..87f030efa6340 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -187,8 +187,8 @@ def _stringify_path(filepath_or_buffer): def get_filepath_or_buffer(filepath_or_buffer, encoding=None, compression=None): """ - If the filepath_or_buffer is a url, translate and return the buffer - passthru otherwise. + If the filepath_or_buffer is a url, translate and return the buffer. + Otherwise passthrough. Parameters ---------- From f8a7900d6635f3f80bf5ae91a9a51cc69edc36e9 Mon Sep 17 00:00:00 2001 From: Daniel Himmelstein Date: Wed, 14 Dec 2016 15:06:55 -0500 Subject: [PATCH 06/10] TST: check bz2 compression in PY2 c engine --- pandas/io/tests/parser/compression.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/pandas/io/tests/parser/compression.py b/pandas/io/tests/parser/compression.py index 3b0c571032fe6..adb7efb0bb3a9 100644 --- a/pandas/io/tests/parser/compression.py +++ b/pandas/io/tests/parser/compression.py @@ -114,12 +114,8 @@ def test_bz2(self): path, compression='bz3') with open(path, 'rb') as fin: - if compat.PY3: - result = self.read_csv(fin, compression='bz2') - tm.assert_frame_equal(result, expected) - elif self.engine is not 'python': - self.assertRaises(ValueError, self.read_csv, - fin, compression='bz2') + result = self.read_csv(fin, compression='bz2') + tm.assert_frame_equal(result, expected) with tm.ensure_clean('test.bz2') as path: tmp = bz2.BZ2File(path, mode='wb') From c4ea3d367580452ec13c1f15033dcaff17f0c9f0 Mon Sep 17 00:00:00 2001 From: Daniel Himmelstein Date: Thu, 15 Dec 2016 09:44:16 -0500 Subject: [PATCH 07/10] STY: PEP8 
fixes --- pandas/io/common.py | 4 ++-- pandas/io/tests/parser/compression.py | 1 - 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/pandas/io/common.py b/pandas/io/common.py index 87f030efa6340..fa1022b882124 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -249,8 +249,8 @@ def _infer_compression(filepath_or_buffer, compression): """ Get the compression method for filepath_or_buffer. If compression='infer', the inferred compression method is returned. Otherwise, the input - compression method is returned unchanged, unless it's invalid, in which case - an error is raised. + compression method is returned unchanged, unless it's invalid, in which + case an error is raised. Parameters ---------- diff --git a/pandas/io/tests/parser/compression.py b/pandas/io/tests/parser/compression.py index adb7efb0bb3a9..e95617faf2071 100644 --- a/pandas/io/tests/parser/compression.py +++ b/pandas/io/tests/parser/compression.py @@ -8,7 +8,6 @@ import nose import pandas.util.testing as tm -from pandas import compat class CompressionTests(object): From 09dcbff6b3dc83df748b623786d4ef66fd78062c Mon Sep 17 00:00:00 2001 From: Daniel Himmelstein Date: Wed, 14 Dec 2016 16:33:15 -0500 Subject: [PATCH 08/10] DOC: Improve what's new Reference corresponding issues in What's New. Change code example to use string formatting for improved modularity. Add what's new id --- doc/source/whatsnew/v0.20.0.txt | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index bb0fa111b7ab0..c4402e2a9e508 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -64,15 +64,24 @@ Strings passed to ``DataFrame.groupby()`` as the ``by`` parameter may now refere df.groupby(['second', 'A']).sum() -Reading dataframes from URLs, in :func:`read_csv` or :func:`read_table`, now -supports additional compression methods (`xz`, `bz2`, `zip`). 
Previously, only -`gzip` compression was supported. By default, compression of URLs and paths are -now both inferred using their file extensions. +.. _whatsnew_0200.enhancements.compressed_urls: -.. ipython:: python +Better support for compressed URLs in ``read_csv`` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Compression code was refactored (:issue:`12688`). As a result, reading +dataframes from URLs in :func:`read_csv` or :func:`read_table` now supports +additional compression methods: ``xz``, ``bz2``, and ``zip`` (:issue:`14570`). +Previously, only ``gzip`` compression was supported. By default, compression of +URLs and paths are now both inferred using their file extensions. Additionally, +bz2 support for the python 2 c-engine improved (:issue:`14874`). - url = ('https://github.com/pandas-dev/pandas/raw/master/' + - 'pandas/io/tests/parser/data/salaries.csv.bz2') +.. ipython:: python + url = 'https://github.com/{repo}/raw/{branch}/{path}'.format( + repo = 'pandas-dev/pandas', + branch = 'master', + path = 'pandas/io/tests/parser/data/salaries.csv.bz2', + ) df = pd.read_table(url, compression='infer') # default, infer compression df = pd.read_table(url, compression='bz2') # explicitly specify compression df.head(2) From 8568aedc58cd836390e8eddff7e844b0bb9ea875 Mon Sep 17 00:00:00 2001 From: Daniel Himmelstein Date: Fri, 16 Dec 2016 10:14:23 -0500 Subject: [PATCH 09/10] TST: Read bz2 files from S3 in PY2 Addresses https://github.com/pandas-dev/pandas/issues/14874 --- pandas/io/tests/parser/test_network.py | 67 +++++++++----------------- 1 file changed, 24 insertions(+), 43 deletions(-) diff --git a/pandas/io/tests/parser/test_network.py b/pandas/io/tests/parser/test_network.py index 7d42f83c48e21..4258749b8d897 100644 --- a/pandas/io/tests/parser/test_network.py +++ b/pandas/io/tests/parser/test_network.py @@ -12,7 +12,6 @@ import pandas.util.testing as tm from pandas import DataFrame -from pandas import compat from pandas.io.parsers import read_csv, 
read_table @@ -64,18 +63,12 @@ def setUp(self): @tm.network def test_parse_public_s3_bucket(self): for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]: - if comp == 'bz2' and compat.PY2: - # The Python 2 C parser can't read bz2 from S3. - self.assertRaises(ValueError, read_csv, - 's3://pandas-test/tips.csv' + ext, - compression=comp) - else: - df = read_csv('s3://pandas-test/tips.csv' + - ext, compression=comp) - self.assertTrue(isinstance(df, DataFrame)) - self.assertFalse(df.empty) - tm.assert_frame_equal(read_csv( - tm.get_data_path('tips.csv')), df) + df = read_csv('s3://pandas-test/tips.csv' + + ext, compression=comp) + self.assertTrue(isinstance(df, DataFrame)) + self.assertFalse(df.empty) + tm.assert_frame_equal(read_csv( + tm.get_data_path('tips.csv')), df) # Read public file from bucket with not-public contents df = read_csv('s3://cant_get_it/tips.csv') @@ -104,18 +97,12 @@ def test_parse_public_s3a_bucket(self): @tm.network def test_parse_public_s3_bucket_nrows(self): for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]: - if comp == 'bz2' and compat.PY2: - # The Python 2 C parser can't read bz2 from S3. 
- self.assertRaises(ValueError, read_csv, - 's3://pandas-test/tips.csv' + ext, - compression=comp) - else: - df = read_csv('s3://pandas-test/tips.csv' + - ext, nrows=10, compression=comp) - self.assertTrue(isinstance(df, DataFrame)) - self.assertFalse(df.empty) - tm.assert_frame_equal(read_csv( - tm.get_data_path('tips.csv')).iloc[:10], df) + df = read_csv('s3://pandas-test/tips.csv' + + ext, nrows=10, compression=comp) + self.assertTrue(isinstance(df, DataFrame)) + self.assertFalse(df.empty) + tm.assert_frame_equal(read_csv( + tm.get_data_path('tips.csv')).iloc[:10], df) @tm.network def test_parse_public_s3_bucket_chunked(self): @@ -123,24 +110,18 @@ def test_parse_public_s3_bucket_chunked(self): chunksize = 5 local_tips = read_csv(tm.get_data_path('tips.csv')) for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]: - if comp == 'bz2' and compat.PY2: - # The Python 2 C parser can't read bz2 from S3. - self.assertRaises(ValueError, read_csv, - 's3://pandas-test/tips.csv' + ext, - compression=comp) - else: - df_reader = read_csv('s3://pandas-test/tips.csv' + ext, - chunksize=chunksize, compression=comp) - self.assertEqual(df_reader.chunksize, chunksize) - for i_chunk in [0, 1, 2]: - # Read a couple of chunks and make sure we see them - # properly. - df = df_reader.get_chunk() - self.assertTrue(isinstance(df, DataFrame)) - self.assertFalse(df.empty) - true_df = local_tips.iloc[ - chunksize * i_chunk: chunksize * (i_chunk + 1)] - tm.assert_frame_equal(true_df, df) + df_reader = read_csv('s3://pandas-test/tips.csv' + ext, + chunksize=chunksize, compression=comp) + self.assertEqual(df_reader.chunksize, chunksize) + for i_chunk in [0, 1, 2]: + # Read a couple of chunks and make sure we see them + # properly. 
+ df = df_reader.get_chunk() + self.assertTrue(isinstance(df, DataFrame)) + self.assertFalse(df.empty) + true_df = local_tips.iloc[ + chunksize * i_chunk: chunksize * (i_chunk + 1)] + tm.assert_frame_equal(true_df, df) @tm.network def test_parse_public_s3_bucket_chunked_python(self): From e1b5d4200fdb4b9f4f7fb590072ead355e9b1517 Mon Sep 17 00:00:00 2001 From: Daniel Himmelstein Date: Sat, 17 Dec 2016 16:24:19 -0500 Subject: [PATCH 10/10] Address what's new review comments --- doc/source/whatsnew/v0.20.0.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index c4402e2a9e508..55e5241587298 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -69,12 +69,12 @@ Strings passed to ``DataFrame.groupby()`` as the ``by`` parameter may now refere Better support for compressed URLs in ``read_csv`` ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Compression code was refactored (:issue:`12688`). As a result, reading +The compression code was refactored (:issue:`12688`). As a result, reading dataframes from URLs in :func:`read_csv` or :func:`read_table` now supports additional compression methods: ``xz``, ``bz2``, and ``zip`` (:issue:`14570`). Previously, only ``gzip`` compression was supported. By default, compression of URLs and paths are now both inferred using their file extensions. Additionally, -bz2 support for the python 2 c-engine improved (:issue:`14874`). +support for bz2 compression in the python 2 c-engine improved (:issue:`14874`). .. ipython:: python url = 'https://github.com/{repo}/raw/{branch}/{path}'.format(