Skip to content

Commit e05cf2a

Browse files
committed
COMPAT: Consider Python 2.x tarfiles file-like
tarfile.ExFileObject has no "next" method in Python 2.x, making it an invalid file-like object in read_csv. However, such objects can be read in just fine, meaning our check for file-like objects is too strict. This commit relaxes the check to just look for "__iter__". Closes pandas-dev/pandas#16530 (gh-16530).
1 parent e60dc4c commit e05cf2a

File tree

6 files changed

+63
-8
lines changed

6 files changed

+63
-8
lines changed

doc/source/whatsnew/v0.20.2.txt

+1
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,7 @@ I/O
6464
- Bug in pd.read_csv() when comment is passed in space delimited text files (:issue:`16472`)
6565
- Bug that would force importing of the clipboard routines unnecessarily, potentially causing an import error on startup (:issue:`16288`)
6666
- Bug that raised IndexError when HTML-rendering an empty DataFrame (:issue:`15953`)
67+
- Bug in ``pd.read_csv()`` in which tarfile object inputs were raising an error in Python 2.x for the C engine (:issue:`16530`)
6768

6869

6970
Plotting

pandas/core/dtypes/inference.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -171,7 +171,7 @@ def is_file_like(obj):
171171
if not (hasattr(obj, 'read') or hasattr(obj, 'write')):
172172
return False
173173

174-
if not is_iterator(obj):
174+
if not hasattr(obj, "__iter__"):
175175
return False
176176

177177
return True

pandas/io/parsers.py

+24-5
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
import numpy as np
1414

1515
from pandas import compat
16-
from pandas.compat import (range, lrange, StringIO, lzip,
16+
from pandas.compat import (range, lrange, PY3, StringIO, lzip,
1717
zip, string_types, map, u)
1818
from pandas.core.dtypes.common import (
1919
is_integer, _ensure_object,
@@ -31,10 +31,10 @@
3131
from pandas.core.common import AbstractMethodError
3232
from pandas.io.date_converters import generic_parser
3333
from pandas.errors import ParserWarning, ParserError, EmptyDataError
34-
from pandas.io.common import (get_filepath_or_buffer, _validate_header_arg,
35-
_get_handle, UnicodeReader, UTF8Recoder,
36-
BaseIterator,
37-
_NA_VALUES, _infer_compression)
34+
from pandas.io.common import (get_filepath_or_buffer, is_file_like,
35+
_validate_header_arg, _get_handle,
36+
UnicodeReader, UTF8Recoder, _NA_VALUES,
37+
BaseIterator, _infer_compression)
3838
from pandas.core.tools import datetimes as tools
3939

4040
from pandas.util._decorators import Appender
@@ -755,7 +755,9 @@ def __init__(self, f, engine=None, **kwds):
755755
self.squeeze = options.pop('squeeze', False)
756756

757757
# might mutate self.engine
758+
self.engine = self._check_file_or_buffer(f, engine)
758759
self.options, self.engine = self._clean_options(options, engine)
760+
759761
if 'has_index_names' in kwds:
760762
self.options['has_index_names'] = kwds['has_index_names']
761763

@@ -801,6 +803,23 @@ def _get_options_with_defaults(self, engine):
801803

802804
return options
803805

806+
def _check_file_or_buffer(self, f, engine):
807+
# see gh-16530
808+
if is_file_like(f):
809+
next_attr = "__next__" if PY3 else "next"
810+
811+
if engine != "c" and not hasattr(f, next_attr):
812+
msg = ("The 'python' engine can not iterate "
813+
"through this file buffer.")
814+
if self._engine_specified:
815+
raise ValueError(msg)
816+
else:
817+
engine = "c"
818+
msg += " Falling back to the 'c' engine."
819+
warnings.warn(msg, ParserWarning, stacklevel=5)
820+
821+
return engine
822+
804823
def _clean_options(self, options, engine):
805824
result = options.copy()
806825

pandas/tests/dtypes/test_inference.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -120,9 +120,9 @@ class MockFile(object):
120120
m = MockFile()
121121
assert not is_file(m)
122122

123+
# gh-16530: Valid iterator just means we have the
124+
# __iter__ attribute for our purposes.
123125
MockFile.__iter__ = lambda self: self
124-
MockFile.__next__ = lambda self: 0
125-
MockFile.next = MockFile.__next__
126126

127127
# Valid write-only file
128128
m = MockFile()

pandas/tests/io/data/tar_csv.tar

10 KB
Binary file not shown.

pandas/tests/io/parser/c_parser_only.py

+35
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,9 @@
77
further arguments when parsing.
88
"""
99

10+
import os
1011
import sys
12+
import tarfile
1113

1214
import pytest
1315
import numpy as np
@@ -446,3 +448,36 @@ def test_comment_whitespace_delimited(self):
446448
[7, np.nan],
447449
[8, np.nan]])
448450
tm.assert_frame_equal(df, expected)
451+
452+
def test_file_like_no_next(self):
453+
# gh-16530: the file-like need not have a "next" or "__next__"
454+
# attribute despite having an "__iter__" attribute.
455+
#
456+
# NOTE: This is only true for the C engine, not Python engine.
457+
class NoNextBuffer(StringIO):
458+
def __next__(self):
459+
raise AttributeError("No next method")
460+
461+
next = __next__
462+
463+
data = "a\n1"
464+
465+
expected = pd.DataFrame({"a": [1]})
466+
result = self.read_csv(NoNextBuffer(data))
467+
468+
tm.assert_frame_equal(result, expected)
469+
470+
def test_read_tarfile(self):
471+
# see gh-16530
472+
#
473+
# Unfortunately, Python's CSV library can't handle
474+
# tarfile objects (expects string, not bytes when
475+
# iterating through a file-like).
476+
tar_path = os.path.join(self.dirpath, "tar_csv.tar")
477+
478+
tar = tarfile.open(tar_path, "r")
479+
data_file = tar.extractfile("tar_data.csv")
480+
481+
out = self.read_csv(data_file)
482+
expected = pd.DataFrame({"a": [1]})
483+
tm.assert_frame_equal(out, expected)

0 commit comments

Comments
 (0)