Skip to content

Commit e8d4243

Browse files
committed
Merge pull request #11072 from stephen-hoover/stream-bzip2-files
ENH Enable bzip2 streaming for Python 3
2 parents f476000 + 636afbe commit e8d4243

File tree

4 files changed

+18
-7
lines changed

4 files changed

+18
-7
lines changed

doc/source/whatsnew/v0.17.0.txt

+2
Original file line number | Diff line number | Diff line change
@@ -465,6 +465,8 @@ Other enhancements
465465

466466
- Improved error message when concatenating an empty iterable of dataframes (:issue:`9157`)
467467

468+
- ``pd.read_csv`` can now read bz2-compressed files incrementally, and the C parser can read bz2-compressed files from AWS S3 (:issue:`11070`, :issue:`11072`).
469+
468470

469471
.. _whatsnew_0170.api:
470472

pandas/io/parsers.py

+6-5
Original file line number | Diff line number | Diff line change
@@ -1344,12 +1344,13 @@ def _wrap_compressed(f, compression, encoding=None):
13441344
elif compression == 'bz2':
13451345
import bz2
13461346

1347-
# bz2 module can't take file objects, so have to run through decompress
1348-
# manually
1349-
data = bz2.decompress(f.read())
13501347
if compat.PY3:
1351-
data = data.decode(encoding)
1352-
f = StringIO(data)
1348+
f = bz2.open(f, 'rt', encoding=encoding)
1349+
else:
1350+
# Python 2's bz2 module can't take file objects, so have to
1351+
# run through decompress manually
1352+
data = bz2.decompress(f.read())
1353+
f = StringIO(data)
13531354
return f
13541355
else:
13551356
raise ValueError('do not recognize compression method %s'

pandas/io/tests/test_parsers.py

+8
Original file line number | Diff line number | Diff line change
@@ -3836,6 +3836,14 @@ def test_decompression(self):
38363836
self.assertRaises(ValueError, self.read_csv,
38373837
path, compression='bz3')
38383838

3839+
with open(path, 'rb') as fin:
3840+
if compat.PY3:
3841+
result = self.read_csv(fin, compression='bz2')
3842+
tm.assert_frame_equal(result, expected)
3843+
else:
3844+
self.assertRaises(ValueError, self.read_csv,
3845+
fin, compression='bz2')
3846+
38393847
def test_decompression_regex_sep(self):
38403848
try:
38413849
import gzip

pandas/parser.pyx

+2-2
Original file line number | Diff line number | Diff line change
@@ -561,10 +561,10 @@ cdef class TextReader:
561561
source = gzip.GzipFile(fileobj=source)
562562
elif self.compression == 'bz2':
563563
import bz2
564-
if isinstance(source, basestring):
564+
if isinstance(source, basestring) or PY3:
565565
source = bz2.BZ2File(source, 'rb')
566566
else:
567-
raise ValueError('Python cannot read bz2 from open file '
567+
raise ValueError('Python 2 cannot read bz2 from open file '
568568
'handle')
569569
else:
570570
raise ValueError('Unrecognized compression type: %s' %

0 commit comments

Comments
 (0)