From 636afbeaa9d7db177810e7965dba2421ebcc7f6f Mon Sep 17 00:00:00 2001
From: Stephen Hoover <Stephen.LD.Hoover@gmail.com>
Date: Wed, 9 Sep 2015 20:05:47 -0500
Subject: [PATCH] ENH Enable bzip2 streaming for Python 3

Python 2's bz2 module can't read from an open file handle, but Python 3's can. Python 3 can also decompress bz2 files incrementally instead of reading the whole file into memory first.
---
 doc/source/whatsnew/v0.17.0.txt |  2 ++
 pandas/io/parsers.py            | 11 ++++++-----
 pandas/io/tests/test_parsers.py |  8 ++++++++
 pandas/parser.pyx               |  4 ++--
 4 files changed, 18 insertions(+), 7 deletions(-)

diff --git a/doc/source/whatsnew/v0.17.0.txt b/doc/source/whatsnew/v0.17.0.txt
index 914c18a66af61..986af61414587 100644
--- a/doc/source/whatsnew/v0.17.0.txt
+++ b/doc/source/whatsnew/v0.17.0.txt
@@ -465,6 +465,8 @@ Other enhancements
 
 - Improved error message when concatenating an empty iterable of dataframes (:issue:`9157`)
 
+- ``pd.read_csv`` can now read bz2-compressed files incrementally, and the C parser can read bz2-compressed files from AWS S3 (:issue:`11070`, :issue:`11072`).
+
 
 .. _whatsnew_0170.api:
 
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
index f0c994ba17e27..736c08f72dee8 100755
--- a/pandas/io/parsers.py
+++ b/pandas/io/parsers.py
@@ -1344,12 +1344,13 @@ def _wrap_compressed(f, compression, encoding=None):
     elif compression == 'bz2':
         import bz2
 
-        # bz2 module can't take file objects, so have to run through decompress
-        # manually
-        data = bz2.decompress(f.read())
         if compat.PY3:
-            data = data.decode(encoding)
-        f = StringIO(data)
+            f = bz2.open(f, 'rt', encoding=encoding)
+        else:
+            # Python 2's bz2 module can't take file objects, so have to
+            # run through decompress manually
+            data = bz2.decompress(f.read())
+            f = StringIO(data)
         return f
     else:
         raise ValueError('do not recognize compression method %s'
diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py
index ed261edad4f20..fabe4ce40b22f 100755
--- a/pandas/io/tests/test_parsers.py
+++ b/pandas/io/tests/test_parsers.py
@@ -3836,6 +3836,14 @@ def test_decompression(self):
             self.assertRaises(ValueError, self.read_csv,
                               path, compression='bz3')
 
+            with open(path, 'rb') as fin:
+                if compat.PY3:
+                    result = self.read_csv(fin, compression='bz2')
+                    tm.assert_frame_equal(result, expected)
+                else:
+                    self.assertRaises(ValueError, self.read_csv,
+                                      fin, compression='bz2')
+
     def test_decompression_regex_sep(self):
         try:
             import gzip
diff --git a/pandas/parser.pyx b/pandas/parser.pyx
index c2916f2c0cfb8..647e8e72414e9 100644
--- a/pandas/parser.pyx
+++ b/pandas/parser.pyx
@@ -561,10 +561,10 @@ cdef class TextReader:
                     source = gzip.GzipFile(fileobj=source)
             elif self.compression == 'bz2':
                 import bz2
-                if isinstance(source, basestring):
+                if isinstance(source, basestring) or PY3:
                     source = bz2.BZ2File(source, 'rb')
                 else:
-                    raise ValueError('Python cannot read bz2 from open file '
+                    raise ValueError('Python 2 cannot read bz2 from open file '
                                      'handle')
             else:
                 raise ValueError('Unrecognized compression type: %s' %