DOC for refactored compression (GH14576) + BUG: bz2-compressed URL with C engine (GH14874) #14880

Closed · wants to merge 10 commits
21 changes: 21 additions & 0 deletions doc/source/whatsnew/v0.20.0.txt
@@ -64,6 +64,27 @@ Strings passed to ``DataFrame.groupby()`` as the ``by`` parameter may now reference

df.groupby(['second', 'A']).sum()

.. _whatsnew_0200.enhancements.compressed_urls:

Better support for compressed URLs in ``read_csv``
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

The compression code was refactored (:issue:`12688`). As a result, reading
dataframes from URLs in :func:`read_csv` or :func:`read_table` now supports
Contributor

if there are any other issues that were closed by this, pls list them as well.

Contributor Author

Rechecked... they're all already listed.

additional compression methods: ``xz``, ``bz2``, and ``zip`` (:issue:`14570`).
Previously, only ``gzip`` compression was supported. By default, compression of
both URLs and paths is now inferred from their file extensions. Additionally,
Contributor

The compression code

paths are now inferred using (remove "both")

Additionally, support for bz2 compression in the python 2 c-engine improved.

Contributor Author

Addressed comments 1 and 3 in e1b5d42. @jreback, I didn't change:

By default, compression of URLs and paths are now both inferred using their file extensions.

Previously, compression of paths was by default inferred from their extension, but not URLs. Now both are inferred by their extension. Am I missing something?

support for ``bz2`` compression in the Python 2 C engine has improved (:issue:`14874`).

.. ipython:: python

   url = 'https://github.com/{repo}/raw/{branch}/{path}'.format(
       repo='pandas-dev/pandas',
       branch='master',
       path='pandas/io/tests/parser/data/salaries.csv.bz2',
   )
   df = pd.read_table(url, compression='infer')  # default, infer compression
   df = pd.read_table(url, compression='bz2')    # explicitly specify compression
   df.head(2)

.. _whatsnew_0200.enhancements.other:

17 changes: 10 additions & 7 deletions pandas/io/common.py
@@ -187,8 +187,8 @@ def _stringify_path(filepath_or_buffer):
 def get_filepath_or_buffer(filepath_or_buffer, encoding=None,
                            compression=None):
     """
-    If the filepath_or_buffer is a url, translate and return the buffer
-    passthru otherwise.
+    If the filepath_or_buffer is a url, translate and return the buffer.
+    Otherwise passthrough.

     Parameters
     ----------
@@ -247,23 +247,26 @@ def file_path_to_url(path):

 def _infer_compression(filepath_or_buffer, compression):
     """
-    Get file handle for given path/buffer and mode.
+    Get the compression method for filepath_or_buffer. If compression='infer',
+    the inferred compression method is returned. Otherwise, the input
+    compression method is returned unchanged, unless it's invalid, in which
+    case an error is raised.

     Parameters
     ----------
     filepath_or_buf :
         a path (str) or buffer
-    compression : str, or None
+    compression : str or None
         the compression method including None for no compression and 'infer'

     Returns
     -------
-    string compression method, None
+    string or None :
+        compression method

     Raises
     ------
     ValueError on invalid compression specified
-
-    If compression='infer', infer compression. If compression
     """

     # No compression has been explicitly specified
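As context for the docstring above: the inference it describes boils down to a simple extension lookup. A minimal sketch of the idea, assuming a str path — the mapping and function names below are illustrative, not the actual pandas internals:

    _EXT_TO_COMPRESSION = {'.gz': 'gzip', '.bz2': 'bz2', '.zip': 'zip', '.xz': 'xz'}

    def infer_compression_sketch(filepath_or_buffer, compression):
        """Return a compression method, inferring from the extension if asked."""
        if compression is None:
            return None  # explicitly uncompressed
        if compression == 'infer':
            if not isinstance(filepath_or_buffer, str):
                return None  # a buffer carries no extension to infer from
            for ext, method in _EXT_TO_COMPRESSION.items():
                if filepath_or_buffer.endswith(ext):
                    return method
            return None  # unrecognized extension: assume uncompressed
        if compression in _EXT_TO_COMPRESSION.values():
            return compression  # a valid explicit method passes through unchanged
        raise ValueError('Unrecognized compression type: %s' % compression)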
9 changes: 2 additions & 7 deletions pandas/io/tests/parser/compression.py
@@ -8,7 +8,6 @@
 import nose

 import pandas.util.testing as tm
-from pandas import compat


 class CompressionTests(object):
@@ -114,12 +113,8 @@ def test_bz2(self):
                                path, compression='bz3')

         with open(path, 'rb') as fin:
-            if compat.PY3:
-                result = self.read_csv(fin, compression='bz2')
-                tm.assert_frame_equal(result, expected)
-            elif self.engine is not 'python':
-                self.assertRaises(ValueError, self.read_csv,
-                                  fin, compression='bz2')
+            result = self.read_csv(fin, compression='bz2')
+            tm.assert_frame_equal(result, expected)

         with tm.ensure_clean('test.bz2') as path:
             tmp = bz2.BZ2File(path, mode='wb')
69 changes: 25 additions & 44 deletions pandas/io/tests/parser/test_network.py
@@ -12,7 +12,6 @@

 import pandas.util.testing as tm
 from pandas import DataFrame
-from pandas import compat
 from pandas.io.parsers import read_csv, read_table


@@ -39,7 +38,7 @@ def test_compressed_urls(self):
         for compression, extension in self.compression_to_extension.items():
             url = self.base_url + extension
             # args is a (compression, engine) tuple
-            for args in product([compression, 'infer'], ['python']):
+            for args in product([compression, 'infer'], ['python', 'c']):
                 # test_fxn is a workaround for more descriptive nose reporting.
                 # See http://stackoverflow.com/a/37393684/4651668.
                 test_fxn = functools.partial(self.check_table)
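As an aside, the ``functools.partial`` workaround referenced in that comment works because nose reports each yielded callable under its ``description`` attribute. A minimal sketch of the pattern under that assumption — the function names here are illustrative:

    import functools

    def check_table(compression, engine):
        pass  # the actual download-and-assert logic would go here

    def test_generator():
        for engine in ['python', 'c']:
            # A fresh partial per case lets us attach a per-case description,
            # so nose reports each failure under a readable name.
            test_fxn = functools.partial(check_table)
            test_fxn.description = 'test_compressed_urls: engine=%s' % engine
            yield test_fxn, 'bz2', engine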
@@ -64,18 +63,12 @@ def setUp(self):
     @tm.network
     def test_parse_public_s3_bucket(self):
         for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]:
-            if comp == 'bz2' and compat.PY2:
-                # The Python 2 C parser can't read bz2 from S3.
-                self.assertRaises(ValueError, read_csv,
-                                  's3://pandas-test/tips.csv' + ext,
-                                  compression=comp)
-            else:
-                df = read_csv('s3://pandas-test/tips.csv' +
-                              ext, compression=comp)
-                self.assertTrue(isinstance(df, DataFrame))
-                self.assertFalse(df.empty)
-                tm.assert_frame_equal(read_csv(
-                    tm.get_data_path('tips.csv')), df)
+            df = read_csv('s3://pandas-test/tips.csv' +
+                          ext, compression=comp)
+            self.assertTrue(isinstance(df, DataFrame))
+            self.assertFalse(df.empty)
+            tm.assert_frame_equal(read_csv(
+                tm.get_data_path('tips.csv')), df)

         # Read public file from bucket with not-public contents
         df = read_csv('s3://cant_get_it/tips.csv')
@@ -104,43 +97,31 @@ def test_parse_public_s3a_bucket(self):
     @tm.network
     def test_parse_public_s3_bucket_nrows(self):
         for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]:
-            if comp == 'bz2' and compat.PY2:
-                # The Python 2 C parser can't read bz2 from S3.
-                self.assertRaises(ValueError, read_csv,
-                                  's3://pandas-test/tips.csv' + ext,
-                                  compression=comp)
-            else:
-                df = read_csv('s3://pandas-test/tips.csv' +
-                              ext, nrows=10, compression=comp)
-                self.assertTrue(isinstance(df, DataFrame))
-                self.assertFalse(df.empty)
-                tm.assert_frame_equal(read_csv(
-                    tm.get_data_path('tips.csv')).iloc[:10], df)
+            df = read_csv('s3://pandas-test/tips.csv' +
+                          ext, nrows=10, compression=comp)
+            self.assertTrue(isinstance(df, DataFrame))
+            self.assertFalse(df.empty)
+            tm.assert_frame_equal(read_csv(
+                tm.get_data_path('tips.csv')).iloc[:10], df)

     @tm.network
     def test_parse_public_s3_bucket_chunked(self):
         # Read with a chunksize
         chunksize = 5
         local_tips = read_csv(tm.get_data_path('tips.csv'))
         for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]:
-            if comp == 'bz2' and compat.PY2:
-                # The Python 2 C parser can't read bz2 from S3.
-                self.assertRaises(ValueError, read_csv,
-                                  's3://pandas-test/tips.csv' + ext,
-                                  compression=comp)
-            else:
-                df_reader = read_csv('s3://pandas-test/tips.csv' + ext,
-                                     chunksize=chunksize, compression=comp)
-                self.assertEqual(df_reader.chunksize, chunksize)
-                for i_chunk in [0, 1, 2]:
-                    # Read a couple of chunks and make sure we see them
-                    # properly.
-                    df = df_reader.get_chunk()
-                    self.assertTrue(isinstance(df, DataFrame))
-                    self.assertFalse(df.empty)
-                    true_df = local_tips.iloc[
-                        chunksize * i_chunk: chunksize * (i_chunk + 1)]
-                    tm.assert_frame_equal(true_df, df)
+            df_reader = read_csv('s3://pandas-test/tips.csv' + ext,
+                                 chunksize=chunksize, compression=comp)
+            self.assertEqual(df_reader.chunksize, chunksize)
+            for i_chunk in [0, 1, 2]:
+                # Read a couple of chunks and make sure we see them
+                # properly.
+                df = df_reader.get_chunk()
+                self.assertTrue(isinstance(df, DataFrame))
+                self.assertFalse(df.empty)
+                true_df = local_tips.iloc[
+                    chunksize * i_chunk: chunksize * (i_chunk + 1)]
+                tm.assert_frame_equal(true_df, df)

     @tm.network
     def test_parse_public_s3_bucket_chunked_python(self):
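For context, the chunked-reading behaviour these tests exercise looks like this in user code; a small sketch using a local file, where the path and chunk size are illustrative:

    import pandas as pd

    # read_csv with chunksize returns an iterable reader instead of a DataFrame
    reader = pd.read_csv('tips.csv.bz2', compression='bz2', chunksize=5)
    for chunk in reader:
        # each chunk is a DataFrame of at most 5 rows
        print(chunk.shape)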
5 changes: 3 additions & 2 deletions pandas/parser.pyx
@@ -621,8 +621,9 @@ cdef class TextReader:
                 if isinstance(source, basestring) or PY3:
                     source = bz2.BZ2File(source, 'rb')
                 else:
-                    raise ValueError('Python 2 cannot read bz2 from open file '
-                                     'handle')
+                    content = source.read()
+                    source.close()
+                    source = compat.StringIO(bz2.decompress(content))
             elif self.compression == 'zip':
                 import zipfile
                 zip_file = zipfile.ZipFile(source)
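The ``else`` branch above replaces the old ``ValueError`` with an in-memory fallback: Python 2's ``bz2.BZ2File`` cannot wrap an already-open file object, so the raw bytes are read, decompressed, and handed back as a buffer. A rough standalone equivalent of that fallback — the function name is illustrative:

    import bz2
    from io import BytesIO

    def decompress_bz2_handle(source):
        # BZ2File works on paths, but not on an open handle in Python 2,
        # so slurp the compressed bytes and decompress them in memory instead.
        content = source.read()
        source.close()
        return BytesIO(bz2.decompress(content))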