Skip to content

Commit 957c67e

Browse files
committed
COMPAT: Consider Python 2.x tarfiles file-like
tarfile.ExFileObject has no "next" method in Python 2.x, so read_csv rejected it as an invalid file-like object. However, such objects can be read in just fine, meaning our check for file-like objects is too strict. This commit relaxes the check to just look for "__iter__". Closes gh-16530 (pandas-dev).
1 parent e60dc4c commit 957c67e

File tree

4 files changed

+37
-5
lines changed

4 files changed

+37
-5
lines changed

doc/source/whatsnew/v0.20.2.txt

+1
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,7 @@ I/O
6464
- Bug in pd.read_csv() when comment is passed in space-delimited text files (:issue:`16472`)
6565
- Bug that would force importing of the clipboard routines unnecessarily, potentially causing an import error on startup (:issue:`16288`)
6666
- Bug that raised IndexError when HTML-rendering an empty DataFrame (:issue:`15953`)
67+
- Bug in ``pd.read_csv()`` in which tarfile object inputs were raising an error in Python 2.x (:issue:`16530`)
6768

6869

6970
Plotting

pandas/core/dtypes/inference.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -171,7 +171,7 @@ def is_file_like(obj):
171171
if not (hasattr(obj, 'read') or hasattr(obj, 'write')):
172172
return False
173173

174-
if not is_iterator(obj):
174+
if not hasattr(obj, "__iter__"):
175175
return False
176176

177177
return True

pandas/tests/dtypes/test_inference.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -120,9 +120,9 @@ class MockFile(object):
120120
m = MockFile()
121121
assert not is_file(m)
122122

123+
# gh-16530: Valid iterator just means we have the
124+
# __iter__ attribute for our purposes.
123125
MockFile.__iter__ = lambda self: self
124-
MockFile.__next__ = lambda self: 0
125-
MockFile.next = MockFile.__next__
126126

127127
# Valid write-only file
128128
m = MockFile()

pandas/tests/io/parser/common.py

+33-2
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import csv
44
import os
55
import platform
6+
import tarfile
67
import codecs
78

89
import re
@@ -1702,17 +1703,26 @@ class InvalidBuffer(object):
17021703
# Thus, while the object may look "invalid" (these
17031704
# methods are attributes of the `StringIO` class),
17041705
# it is still a valid file-object for our purposes.
1705-
class NoSeekTellBuffer(StringIO):
1706+
#
1707+
# gh-16530: Valid iterator just means we have the
1708+
# "__iter__" attribute (don't need "next" or "__next__")
1709+
class NoSeekTellNextBuffer(StringIO):
17061710
def tell(self):
17071711
raise AttributeError("No tell method")
17081712

17091713
def seek(self, pos, whence=0):
17101714
raise AttributeError("No seek method")
17111715

1716+
def next(self):
1717+
raise AttributeError("No next method")
1718+
1719+
def __next__(self):
1720+
raise AttributeError("No __next__ method")
1721+
17121722
data = "a\n1"
17131723

17141724
expected = pd.DataFrame({"a": [1]})
1715-
result = self.read_csv(NoSeekTellBuffer(data))
1725+
result = self.read_csv(NoSeekTellNextBuffer(data))
17161726

17171727
tm.assert_frame_equal(result, expected)
17181728

@@ -1754,3 +1764,24 @@ def test_skip_bad_lines(self):
17541764
val = sys.stderr.getvalue()
17551765
assert 'Skipping line 3' in val
17561766
assert 'Skipping line 5' in val
1767+
1768+
def test_read_tarfile(self):
1769+
# see gh-16530
1770+
expected = pd.DataFrame({"a": [1]})
1771+
1772+
tar_path = u("__%s__.tar" % tm.rands(10))
1773+
file_path = u("__%s__.csv" % tm.rands(10))
1774+
1775+
with tm.ensure_clean(tar_path) as full_tar_path:
1776+
with tm.ensure_clean(file_path) as full_file_path:
1777+
expected.to_csv(full_file_path, index=False)
1778+
1779+
tar = tarfile.open(full_tar_path, "w")
1780+
tar.add(file_path)
1781+
tar.close()
1782+
1783+
tar = tarfile.open(full_tar_path, "r")
1784+
data_file = tar.extractfile(file_path)
1785+
out = self.read_csv(data_file)
1786+
1787+
tm.assert_frame_equal(out, expected)

0 commit comments

Comments
 (0)