COMPAT: Consider Python 2.x tarfiles file-like

gfyoung · gfyoung · commit e05cf2a5dda1 · 2017-05-29T16:34:19.000-04:00
tarfile.ExFileObject has no "next" method in Python 2.x, which makes it an invalid file-like object as far as read_csv is concerned. However, such objects can be read in just fine, meaning our check for file-like objects is too strict. This commit relaxes the check to look only for "__iter__". Closes gh-16530.
diff --git a/doc/source/whatsnew/v0.20.2.txt b/doc/source/whatsnew/v0.20.2.txt
@@ -64,6 +64,7 @@ I/O
 - Bug in pd.read_csv() when comment is passed in space deliminted text files (:issue:`16472`)
 - Bug that would force importing of the clipboard routines unnecessarily, potentially causing an import error on startup (:issue:`16288`)
 - Bug that raised IndexError HTML-rendering an empty DataFrame (:issue:`15953`)
+- Bug in ``pd.read_csv()`` in which tarfile object inputs were raising an error in Python 2.x for the C engine (:issue:`16530`)
 
 
 Plotting
diff --git a/pandas/core/dtypes/inference.py b/pandas/core/dtypes/inference.py
@@ -171,7 +171,7 @@ def is_file_like(obj):
     if not (hasattr(obj, 'read') or hasattr(obj, 'write')):
         return False
 
-    if not is_iterator(obj):
+    if not hasattr(obj, "__iter__"):
         return False
 
     return True
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
@@ -13,7 +13,7 @@
 import numpy as np
 
 from pandas import compat
-from pandas.compat import (range, lrange, StringIO, lzip,
+from pandas.compat import (range, lrange, PY3, StringIO, lzip,
                            zip, string_types, map, u)
 from pandas.core.dtypes.common import (
     is_integer, _ensure_object,
@@ -31,10 +31,10 @@
 from pandas.core.common import AbstractMethodError
 from pandas.io.date_converters import generic_parser
 from pandas.errors import ParserWarning, ParserError, EmptyDataError
-from pandas.io.common import (get_filepath_or_buffer, _validate_header_arg,
-                              _get_handle, UnicodeReader, UTF8Recoder,
-                              BaseIterator,
-                              _NA_VALUES, _infer_compression)
+from pandas.io.common import (get_filepath_or_buffer, is_file_like,
+                              _validate_header_arg, _get_handle,
+                              UnicodeReader, UTF8Recoder, _NA_VALUES,
+                              BaseIterator, _infer_compression)
 from pandas.core.tools import datetimes as tools
 
 from pandas.util._decorators import Appender
@@ -755,7 +755,9 @@ def __init__(self, f, engine=None, **kwds):
         self.squeeze = options.pop('squeeze', False)
 
         # might mutate self.engine
+        self.engine = self._check_file_or_buffer(f, engine)
         self.options, self.engine = self._clean_options(options, engine)
+
         if 'has_index_names' in kwds:
             self.options['has_index_names'] = kwds['has_index_names']
 
@@ -801,6 +803,23 @@ def _get_options_with_defaults(self, engine):
 
         return options
 
+    def _check_file_or_buffer(self, f, engine):
+        # see gh-16530
+        if is_file_like(f):
+            next_attr = "__next__" if PY3 else "next"
+
+            if engine != "c" and not hasattr(f, next_attr):
+                msg = ("The 'python' engine can not iterate "
+                       "through this file buffer.")
+                if self._engine_specified:
+                    raise ValueError(msg)
+                else:
+                    engine = "c"
+                    msg += " Falling back to the 'c' engine."
+                    warnings.warn(msg, ParserWarning, stacklevel=5)
+
+        return engine
+
     def _clean_options(self, options, engine):
         result = options.copy()
 
diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py
@@ -120,9 +120,9 @@ class MockFile(object):
     m = MockFile()
     assert not is_file(m)
 
+    # gh-16530: Valid iterator just means we have the
+    # __iter__ attribute for our purposes.
     MockFile.__iter__ = lambda self: self
-    MockFile.__next__ = lambda self: 0
-    MockFile.next = MockFile.__next__
 
     # Valid write-only file
     m = MockFile()
diff --git a/pandas/tests/io/data/tar_csv.tar b/pandas/tests/io/data/tar_csv.tar
diff --git a/pandas/tests/io/parser/c_parser_only.py b/pandas/tests/io/parser/c_parser_only.py
@@ -7,7 +7,9 @@
 further arguments when parsing.
 """
 
+import os
 import sys
+import tarfile
 
 import pytest
 import numpy as np
@@ -446,3 +448,36 @@ def test_comment_whitespace_delimited(self):
                               [7, np.nan],
                               [8, np.nan]])
         tm.assert_frame_equal(df, expected)
+
+    def test_file_like_no_next(self):
+        # gh-16530: the file-like need not have a "next" or "__next__"
+        # attribute despite having an "__iter__" attribute.
+        #
+        # NOTE: This is only true for the C engine, not Python engine.
+        class NoNextBuffer(StringIO):
+            def __next__(self):
+                raise AttributeError("No next method")
+
+            next = __next__
+
+        data = "a\n1"
+
+        expected = pd.DataFrame({"a": [1]})
+        result = self.read_csv(NoNextBuffer(data))
+
+        tm.assert_frame_equal(result, expected)
+
+    def test_read_tarfile(self):
+        # see gh-16530
+        #
+        # Unfortunately, Python's CSV library can't handle
+        # tarfile objects (expects string, not bytes when
+        # iterating through a file-like).
+        tar_path = os.path.join(self.dirpath, "tar_csv.tar")
+
+        tar = tarfile.open(tar_path, "r")
+        data_file = tar.extractfile("tar_data.csv")
+
+        out = self.read_csv(data_file)
+        expected = pd.DataFrame({"a": [1]})
+        tm.assert_frame_equal(out, expected)