From 636afbeaa9d7db177810e7965dba2421ebcc7f6f Mon Sep 17 00:00:00 2001 From: Stephen Hoover Date: Wed, 9 Sep 2015 20:05:47 -0500 Subject: [PATCH] ENH Enable bzip2 streaming for Python 3 Python 2's bz2 module can't read from an open file object, but Python 3's can. Python 3 can also read bz2 files one piece at a time. --- doc/source/whatsnew/v0.17.0.txt | 2 ++ pandas/io/parsers.py | 11 ++++++----- pandas/io/tests/test_parsers.py | 8 ++++++++ pandas/parser.pyx | 4 ++-- 4 files changed, 18 insertions(+), 7 deletions(-) diff --git a/doc/source/whatsnew/v0.17.0.txt b/doc/source/whatsnew/v0.17.0.txt index 914c18a66af61..986af61414587 100644 --- a/doc/source/whatsnew/v0.17.0.txt +++ b/doc/source/whatsnew/v0.17.0.txt @@ -465,6 +465,8 @@ Other enhancements - Improved error message when concatenating an empty iterable of dataframes (:issue:`9157`) +- ``pd.read_csv`` can now read bz2-compressed files incrementally, and the C parser can read bz2-compressed files from AWS S3 (:issue:`11070`, :issue:`11072`). + .. _whatsnew_0170.api: diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index f0c994ba17e27..736c08f72dee8 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -1344,12 +1344,13 @@ def _wrap_compressed(f, compression, encoding=None): elif compression == 'bz2': import bz2 - # bz2 module can't take file objects, so have to run through decompress - # manually - data = bz2.decompress(f.read()) if compat.PY3: - data = data.decode(encoding) - f = StringIO(data) + f = bz2.open(f, 'rt', encoding=encoding) + else: + # Python 2's bz2 module can't take file objects, so have to + # run through decompress manually + data = bz2.decompress(f.read()) + f = StringIO(data) return f else: raise ValueError('do not recognize compression method %s' diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py index ed261edad4f20..fabe4ce40b22f 100755 --- a/pandas/io/tests/test_parsers.py +++ b/pandas/io/tests/test_parsers.py @@ -3836,6 +3836,14 @@ def test_decompression(self): 
self.assertRaises(ValueError, self.read_csv, path, compression='bz3') + with open(path, 'rb') as fin: + if compat.PY3: + result = self.read_csv(fin, compression='bz2') + tm.assert_frame_equal(result, expected) + else: + self.assertRaises(ValueError, self.read_csv, + fin, compression='bz2') + def test_decompression_regex_sep(self): try: import gzip diff --git a/pandas/parser.pyx b/pandas/parser.pyx index c2916f2c0cfb8..647e8e72414e9 100644 --- a/pandas/parser.pyx +++ b/pandas/parser.pyx @@ -561,10 +561,10 @@ cdef class TextReader: source = gzip.GzipFile(fileobj=source) elif self.compression == 'bz2': import bz2 - if isinstance(source, basestring): + if isinstance(source, basestring) or PY3: source = bz2.BZ2File(source, 'rb') else: - raise ValueError('Python cannot read bz2 from open file ' + raise ValueError('Python 2 cannot read bz2 from open file ' 'handle') else: raise ValueError('Unrecognized compression type: %s' %