
Commit e80a2b9

dhimmel authored and jreback committed

DOC for refactored compression (GH14576) + BUG: bz2-compressed URL with C engine (GH14874)

Follow up on #14576, which refactored compression code to expand URL support. Fixes up
some small remaining issues and adds a what's new entry.

- [x] Closes #14874

Author: Daniel Himmelstein <[email protected]>

Closes #14880 from dhimmel/whats-new and squashes the following commits:

e1b5d42 [Daniel Himmelstein] Address what's new review comments
8568aed [Daniel Himmelstein] TST: Read bz2 files from S3 in PY2
09dcbff [Daniel Himmelstein] DOC: Improve what's new
c4ea3d3 [Daniel Himmelstein] STY: PEP8 fixes
f8a7900 [Daniel Himmelstein] TST: check bz2 compression in PY2 c engine
0e0fa0a [Daniel Himmelstein] DOC: Reword get_filepath_or_buffer docstring
210fb20 [Daniel Himmelstein] DOC: What's New for refactored compression code
cb91007 [Daniel Himmelstein] TST: Read compressed URLs with c engine
85630ea [Daniel Himmelstein] ENH: Support bz2 compression in PY2 for c engine
a7960f6 [Daniel Himmelstein] DOC: Improve _infer_compression docstring

1 parent f5c8d54 commit e80a2b9

File tree

5 files changed: +61 -60 lines changed


doc/source/whatsnew/v0.20.0.txt (+21 lines)

@@ -64,6 +64,27 @@ Strings passed to ``DataFrame.groupby()`` as the ``by`` parameter may now refere
 
    df.groupby(['second', 'A']).sum()
 
+.. _whatsnew_0200.enhancements.compressed_urls:
+
+Better support for compressed URLs in ``read_csv``
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The compression code was refactored (:issue:`12688`). As a result, reading
+dataframes from URLs in :func:`read_csv` or :func:`read_table` now supports
+additional compression methods: ``xz``, ``bz2``, and ``zip`` (:issue:`14570`).
+Previously, only ``gzip`` compression was supported. By default, compression of
+both URLs and paths is now inferred from their file extensions. Additionally,
+support for bz2 compression in the Python 2 C engine has improved (:issue:`14874`).
+
+.. ipython:: python
+   url = 'https://github.com/{repo}/raw/{branch}/{path}'.format(
+       repo='pandas-dev/pandas',
+       branch='master',
+       path='pandas/io/tests/parser/data/salaries.csv.bz2',
+   )
+   df = pd.read_table(url, compression='infer')  # default, infer compression
+   df = pd.read_table(url, compression='bz2')    # explicitly specify compression
+   df.head(2)
 
 .. _whatsnew_0200.enhancements.other:
 
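The same extension-based inference described in this entry applies to local paths as well. The snippet below is an illustrative sketch, not part of the commit; the file name is hypothetical:

    import pandas as pd

    # 'data.csv.xz' is a hypothetical local file: with the default
    # compression='infer', the '.xz' extension selects xz decompression.
    df = pd.read_csv('data.csv.xz')
    df = pd.read_csv('data.csv.xz', compression='xz')  # equivalent, explicit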

pandas/io/common.py (+10 -7 lines)

@@ -187,8 +187,8 @@ def _stringify_path(filepath_or_buffer):
 def get_filepath_or_buffer(filepath_or_buffer, encoding=None,
                            compression=None):
     """
-    If the filepath_or_buffer is a url, translate and return the buffer
-    passthru otherwise.
+    If the filepath_or_buffer is a url, translate and return the buffer.
+    Otherwise passthrough.
 
     Parameters
     ----------
@@ -247,23 +247,26 @@ def file_path_to_url(path):
 
 def _infer_compression(filepath_or_buffer, compression):
     """
-    Get file handle for given path/buffer and mode.
+    Get the compression method for filepath_or_buffer. If compression='infer',
+    the inferred compression method is returned. Otherwise, the input
+    compression method is returned unchanged, unless it's invalid, in which
+    case an error is raised.
 
     Parameters
     ----------
     filepath_or_buf :
         a path (str) or buffer
-    compression : str, or None
+    compression : str or None
+        the compression method including None for no compression and 'infer'
 
     Returns
     -------
-    string compression method, None
+    string or None :
+        compression method
 
     Raises
     ------
     ValueError on invalid compression specified
-
-    If compression='infer', infer compression. If compression
     """
 
     # No compression has been explicitly specified
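The reworded ``_infer_compression`` docstring describes the behaviour: return the inferred method when ``compression='infer'``, pass a valid method through unchanged, and raise otherwise. Below is a minimal sketch of that logic, not pandas' internal implementation; the helper name and extension map are illustrative:

    import os

    # Illustrative extension map; pandas maintains its own mapping internally.
    _EXT_TO_COMPRESSION = {'.gz': 'gzip', '.bz2': 'bz2', '.zip': 'zip', '.xz': 'xz'}
    _VALID_COMPRESSION = {None, 'gzip', 'bz2', 'zip', 'xz'}

    def infer_compression_sketch(filepath_or_buffer, compression):
        """Return the compression method, inferring it from the file extension
        when compression='infer'; raise on an unrecognized method."""
        if compression == 'infer':
            if not isinstance(filepath_or_buffer, str):
                return None  # a buffer carries no extension to infer from
            ext = os.path.splitext(filepath_or_buffer)[1].lower()
            return _EXT_TO_COMPRESSION.get(ext)  # None for unknown extensions
        if compression not in _VALID_COMPRESSION:
            raise ValueError('Unrecognized compression type: %s' % compression)
        return compression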

pandas/io/tests/parser/compression.py (+2 -7 lines)

@@ -8,7 +8,6 @@
 import nose
 
 import pandas.util.testing as tm
-from pandas import compat
 
 
 class CompressionTests(object):
@@ -114,12 +113,8 @@ def test_bz2(self):
                           path, compression='bz3')
 
         with open(path, 'rb') as fin:
-            if compat.PY3:
-                result = self.read_csv(fin, compression='bz2')
-                tm.assert_frame_equal(result, expected)
-            elif self.engine is not 'python':
-                self.assertRaises(ValueError, self.read_csv,
-                                  fin, compression='bz2')
+            result = self.read_csv(fin, compression='bz2')
+            tm.assert_frame_equal(result, expected)
 
         with tm.ensure_clean('test.bz2') as path:
             tmp = bz2.BZ2File(path, mode='wb')
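With the Python 2 special-casing removed, the simplified ``test_bz2`` expects the same behaviour everywhere: reading bz2 data through an already-open binary handle works for both engines. A usage sketch under that assumption (the file name is hypothetical):

    import pandas as pd

    # Reading from an open binary handle with explicit bz2 compression is now
    # expected to succeed on both the python and c engines.
    with open('data.csv.bz2', 'rb') as fin:
        df = pd.read_csv(fin, compression='bz2', engine='c')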

pandas/io/tests/parser/test_network.py (+25 -44 lines)

@@ -12,7 +12,6 @@
 
 import pandas.util.testing as tm
 from pandas import DataFrame
-from pandas import compat
 from pandas.io.parsers import read_csv, read_table
 
 
@@ -39,7 +38,7 @@ def test_compressed_urls(self):
         for compression, extension in self.compression_to_extension.items():
             url = self.base_url + extension
             # args is a (compression, engine) tuple
-            for args in product([compression, 'infer'], ['python']):
+            for args in product([compression, 'infer'], ['python', 'c']):
                 # test_fxn is a workaround for more descriptive nose reporting.
                 # See http://stackoverflow.com/a/37393684/4651668.
                 test_fxn = functools.partial(self.check_table)
@@ -64,18 +63,12 @@ def setUp(self):
     @tm.network
     def test_parse_public_s3_bucket(self):
         for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]:
-            if comp == 'bz2' and compat.PY2:
-                # The Python 2 C parser can't read bz2 from S3.
-                self.assertRaises(ValueError, read_csv,
-                                  's3://pandas-test/tips.csv' + ext,
-                                  compression=comp)
-            else:
-                df = read_csv('s3://pandas-test/tips.csv' +
-                              ext, compression=comp)
-                self.assertTrue(isinstance(df, DataFrame))
-                self.assertFalse(df.empty)
-                tm.assert_frame_equal(read_csv(
-                    tm.get_data_path('tips.csv')), df)
+            df = read_csv('s3://pandas-test/tips.csv' +
+                          ext, compression=comp)
+            self.assertTrue(isinstance(df, DataFrame))
+            self.assertFalse(df.empty)
+            tm.assert_frame_equal(read_csv(
+                tm.get_data_path('tips.csv')), df)
 
         # Read public file from bucket with not-public contents
         df = read_csv('s3://cant_get_it/tips.csv')
@@ -104,43 +97,31 @@ def test_parse_public_s3a_bucket(self):
     @tm.network
     def test_parse_public_s3_bucket_nrows(self):
         for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]:
-            if comp == 'bz2' and compat.PY2:
-                # The Python 2 C parser can't read bz2 from S3.
-                self.assertRaises(ValueError, read_csv,
-                                  's3://pandas-test/tips.csv' + ext,
-                                  compression=comp)
-            else:
-                df = read_csv('s3://pandas-test/tips.csv' +
-                              ext, nrows=10, compression=comp)
-                self.assertTrue(isinstance(df, DataFrame))
-                self.assertFalse(df.empty)
-                tm.assert_frame_equal(read_csv(
-                    tm.get_data_path('tips.csv')).iloc[:10], df)
+            df = read_csv('s3://pandas-test/tips.csv' +
+                          ext, nrows=10, compression=comp)
+            self.assertTrue(isinstance(df, DataFrame))
+            self.assertFalse(df.empty)
+            tm.assert_frame_equal(read_csv(
+                tm.get_data_path('tips.csv')).iloc[:10], df)
 
     @tm.network
     def test_parse_public_s3_bucket_chunked(self):
         # Read with a chunksize
         chunksize = 5
         local_tips = read_csv(tm.get_data_path('tips.csv'))
         for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]:
-            if comp == 'bz2' and compat.PY2:
-                # The Python 2 C parser can't read bz2 from S3.
-                self.assertRaises(ValueError, read_csv,
-                                  's3://pandas-test/tips.csv' + ext,
-                                  compression=comp)
-            else:
-                df_reader = read_csv('s3://pandas-test/tips.csv' + ext,
-                                     chunksize=chunksize, compression=comp)
-                self.assertEqual(df_reader.chunksize, chunksize)
-                for i_chunk in [0, 1, 2]:
-                    # Read a couple of chunks and make sure we see them
-                    # properly.
-                    df = df_reader.get_chunk()
-                    self.assertTrue(isinstance(df, DataFrame))
-                    self.assertFalse(df.empty)
-                    true_df = local_tips.iloc[
-                        chunksize * i_chunk: chunksize * (i_chunk + 1)]
-                    tm.assert_frame_equal(true_df, df)
+            df_reader = read_csv('s3://pandas-test/tips.csv' + ext,
+                                 chunksize=chunksize, compression=comp)
+            self.assertEqual(df_reader.chunksize, chunksize)
+            for i_chunk in [0, 1, 2]:
+                # Read a couple of chunks and make sure we see them
+                # properly.
+                df = df_reader.get_chunk()
+                self.assertTrue(isinstance(df, DataFrame))
+                self.assertFalse(df.empty)
+                true_df = local_tips.iloc[
+                    chunksize * i_chunk: chunksize * (i_chunk + 1)]
+                tm.assert_frame_equal(true_df, df)
 
     @tm.network
     def test_parse_public_s3_bucket_chunked_python(self):
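``test_compressed_urls`` now crosses every compression option with both parser engines. The sketch below reproduces that matrix outside the test harness; the URL and extension map are placeholders, not the test's fixtures:

    from itertools import product

    import pandas as pd

    # Placeholder URL and extension map for illustration only.
    base_url = 'https://example.com/data.csv'
    extension = {'gzip': '.gz', 'bz2': '.bz2', 'xz': '.xz', 'zip': '.zip'}

    for compression, engine in product(extension, ['python', 'c']):
        df = pd.read_csv(base_url + extension[compression],
                         compression=compression, engine=engine)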

pandas/parser.pyx (+3 -2 lines)

@@ -621,8 +621,9 @@ cdef class TextReader:
                 if isinstance(source, basestring) or PY3:
                     source = bz2.BZ2File(source, 'rb')
                 else:
-                    raise ValueError('Python 2 cannot read bz2 from open file '
-                                     'handle')
+                    content = source.read()
+                    source.close()
+                    source = compat.StringIO(bz2.decompress(content))
             elif self.compression == 'zip':
                 import zipfile
                 zip_file = zipfile.ZipFile(source)
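Instead of raising, the Python 2 branch of the C reader now drains the open handle and decompresses the bytes in memory before parsing. A Python-level sketch of that fallback, with ``io.BytesIO`` standing in for the ``compat.StringIO`` used in the Cython source:

    import bz2
    import io

    def decompress_bz2_handle(handle):
        """Read an already-open bz2 file handle, decompress it in memory and
        return a plain buffer for the parser (sketch of the fallback above)."""
        content = handle.read()   # raw bz2-compressed bytes
        handle.close()
        return io.BytesIO(bz2.decompress(content))

For large files this trades memory for compatibility, which fits the change's goal of simply making bz2-from-handle work on Python 2.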
