COMPAT: Consider Python 2.x tarfiles file-like (#16533)

gfyoung · jreback · commit e0a127a82868 · 2017-06-01T06:38:50.000-04:00
diff --git a/doc/source/whatsnew/v0.20.2.txt b/doc/source/whatsnew/v0.20.2.txt
@@ -70,6 +70,7 @@ I/O
 - Bug in pd.read_csv() when comment is passed in space deliminted text files (:issue:`16472`)
 - Bug that would force importing of the clipboard routines unnecessarily, potentially causing an import error on startup (:issue:`16288`)
 - Bug that raised IndexError HTML-rendering an empty DataFrame (:issue:`15953`)
+- Bug in ``pd.read_csv()`` in which tarfile object inputs raised an error in Python 2.x when using the C engine (:issue:`16530`)
 
 - Bug in ``HDFStore.select_as_multiple()`` where start/stop arguments were not respected (:issue:`16209`)
 
diff --git a/pandas/core/dtypes/inference.py b/pandas/core/dtypes/inference.py
@@ -171,7 +171,7 @@ def is_file_like(obj):
     if not (hasattr(obj, 'read') or hasattr(obj, 'write')):
         return False
 
-    if not is_iterator(obj):
+    if not hasattr(obj, "__iter__"):
         return False
 
     return True
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
@@ -13,7 +13,7 @@
 import numpy as np
 
 from pandas import compat
-from pandas.compat import (range, lrange, StringIO, lzip,
+from pandas.compat import (range, lrange, PY3, StringIO, lzip,
                            zip, string_types, map, u)
 from pandas.core.dtypes.common import (
     is_integer, _ensure_object,
@@ -31,10 +31,10 @@
 from pandas.core.common import AbstractMethodError
 from pandas.io.date_converters import generic_parser
 from pandas.errors import ParserWarning, ParserError, EmptyDataError
-from pandas.io.common import (get_filepath_or_buffer, _validate_header_arg,
-                              _get_handle, UnicodeReader, UTF8Recoder,
-                              BaseIterator,
-                              _NA_VALUES, _infer_compression)
+from pandas.io.common import (get_filepath_or_buffer, is_file_like,
+                              _validate_header_arg, _get_handle,
+                              UnicodeReader, UTF8Recoder, _NA_VALUES,
+                              BaseIterator, _infer_compression)
 from pandas.core.tools import datetimes as tools
 
 from pandas.util._decorators import Appender
@@ -755,7 +755,9 @@ def __init__(self, f, engine=None, **kwds):
         self.squeeze = options.pop('squeeze', False)
 
         # might mutate self.engine
+        self.engine = self._check_file_or_buffer(f, engine)
         self.options, self.engine = self._clean_options(options, engine)
+
         if 'has_index_names' in kwds:
             self.options['has_index_names'] = kwds['has_index_names']
 
@@ -801,6 +803,23 @@ def _get_options_with_defaults(self, engine):
 
         return options
 
+    def _check_file_or_buffer(self, f, engine):
+        """Return `engine`, or raise ValueError if it cannot iterate `f`."""
+        if is_file_like(f):
+            next_attr = "__next__" if PY3 else "next"
+
+            # gh-16530: the C engine doesn't need the file-like to have the
+            # "next" or "__next__" attribute. However, the Python engine calls
+            # "next(...)" when iterating through such an object, so for every
+            # engine other than "c" the file-like must have that attribute
+            # ("next" for Python 2.x, "__next__" for Python 3.x).
+            if engine != "c" and not hasattr(f, next_attr):
+                msg = ("The 'python' engine cannot iterate "
+                       "through this file buffer.")
+                raise ValueError(msg)
+
+        return engine
+
     def _clean_options(self, options, engine):
         result = options.copy()
 
diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py
@@ -120,9 +120,9 @@ class MockFile(object):
     m = MockFile()
     assert not is_file(m)
 
+    # gh-16530: Valid iterator just means we have the
+    # __iter__ attribute for our purposes.
     MockFile.__iter__ = lambda self: self
-    MockFile.__next__ = lambda self: 0
-    MockFile.next = MockFile.__next__
 
     # Valid write-only file
     m = MockFile()
diff --git a/pandas/tests/io/parser/c_parser_only.py b/pandas/tests/io/parser/c_parser_only.py
@@ -7,7 +7,9 @@
 further arguments when parsing.
 """
 
+import os
 import sys
+import tarfile
 
 import pytest
 import numpy as np
@@ -446,3 +448,37 @@ def test_comment_whitespace_delimited(self):
                               [7, np.nan],
                               [8, np.nan]])
         tm.assert_frame_equal(df, expected)
+
+    def test_file_like_no_next(self):
+        # gh-16530: the file-like need not have a working "next" or
+        # "__next__" method despite having an "__iter__" attribute;
+        # here both raise, so the parser is expected not to call either.
+        # NOTE: This is only true for the C engine, not Python engine.
+        class NoNextBuffer(StringIO):
+            def __next__(self):
+                raise AttributeError("No next method")
+
+            next = __next__
+
+        data = "a\n1"
+
+        expected = pd.DataFrame({"a": [1]})
+        result = self.read_csv(NoNextBuffer(data))
+
+        tm.assert_frame_equal(result, expected)
+
+    @pytest.mark.parametrize("tar_suffix", [".tar", ".tar.gz"])
+    def test_read_tarfile(self, tar_suffix):
+        # see gh-16530
+        #
+        # Python's CSV library can't handle tarfile objects (it expects
+        # string, not bytes, when iterating through a file-like), hence
+        # the C-only test. The archive is closed once parsing is done.
+        tar_path = os.path.join(self.dirpath, "tar_csv" + tar_suffix)
+
+        with tarfile.open(tar_path, "r") as tar:
+            data_file = tar.extractfile("tar_data.csv")
+            out = self.read_csv(data_file)
+
+        expected = pd.DataFrame({"a": [1]})
+        tm.assert_frame_equal(out, expected)
diff --git a/pandas/tests/io/parser/data/tar_csv.tar b/pandas/tests/io/parser/data/tar_csv.tar
diff --git a/pandas/tests/io/parser/data/tar_csv.tar.gz b/pandas/tests/io/parser/data/tar_csv.tar.gz
diff --git a/pandas/tests/io/parser/test_unsupported.py b/pandas/tests/io/parser/test_unsupported.py
@@ -16,6 +16,13 @@
 from pandas.errors import ParserError
 from pandas.io.parsers import read_csv, read_table
 
+import pytest
+
+
+@pytest.fixture(params=["python", "python-fwf"], ids=lambda val: val)
+def python_engine(request):
+    return request.param  # engine name for the current parametrized run
+
 
 class TestUnsupportedFeatures(object):
 
@@ -82,24 +89,40 @@ def test_c_engine(self):
         with tm.assert_raises_regex(ValueError, msg):
             read_csv(StringIO(data), lineterminator='~~')
 
-    def test_python_engine(self):
+    def test_python_engine(self, python_engine):
         from pandas.io.parsers import _python_unsupported as py_unsupported
 
         data = """1,2,3,,
 1,2,3,4,
 1,2,3,4,5
 1,2,,,
 1,2,3,4,"""
-        engines = 'python', 'python-fwf'
 
-        for engine in engines:
-            for default in py_unsupported:
-                msg = ('The %r option is not supported '
-                       'with the %r engine' % (default, engine))
+        for default in py_unsupported:
+            msg = ('The %r option is not supported '
+                   'with the %r engine' % (default, python_engine))
+
+            kwargs = {default: object()}
+            with tm.assert_raises_regex(ValueError, msg):
+                read_csv(StringIO(data), engine=python_engine, **kwargs)
 
-                kwargs = {default: object()}
-                with tm.assert_raises_regex(ValueError, msg):
-                    read_csv(StringIO(data), engine=engine, **kwargs)
+    def test_python_engine_file_no_next(self, python_engine):
+        # see gh-16530: buffer has "read" and "__iter__" but no "next"
+        class NoNextBuffer(object):
+            def __init__(self, csv_data):
+                self.data = csv_data
+
+            def __iter__(self):
+                return self
+
+            def read(self):
+                return self.data
+
+        data = "a\n1"
+        msg = "The 'python' engine cannot iterate"
+
+        with tm.assert_raises_regex(ValueError, msg):
+            read_csv(NoNextBuffer(data), engine=python_engine)
 
 
 class TestDeprecatedFeatures(object):
diff --git a/setup.py b/setup.py
@@ -702,6 +702,8 @@ def pxd(name):
                                         'parser/data/*.gz',
                                         'parser/data/*.bz2',
                                         'parser/data/*.txt',
+                                        'parser/data/*.tar',
+                                        'parser/data/*.tar.gz',
                                         'sas/data/*.csv',
                                         'sas/data/*.xpt',
                                         'sas/data/*.sas7bdat',