Skip to content

Commit e0a127a

Browse files
gfyoung, jreback
authored and committed
COMPAT: Consider Python 2.x tarfiles file-like (#16533)
1 parent cab2b6b commit e0a127a

File tree

9 files changed

+98
-17
lines changed

9 files changed

+98
-17
lines changed

doc/source/whatsnew/v0.20.2.txt

+1
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,7 @@ I/O
7070
- Bug in pd.read_csv() when comment is passed in space delimited text files (:issue:`16472`)
7171
- Bug that would force importing of the clipboard routines unnecessarily, potentially causing an import error on startup (:issue:`16288`)
7272
- Bug that raised IndexError when HTML-rendering an empty DataFrame (:issue:`15953`)
73+
- Bug in ``pd.read_csv()`` in which tarfile object inputs were raising an error in Python 2.x for the C engine (:issue:`16530`)
7374

7475
- Bug in ``HDFStore.select_as_multiple()`` where start/stop arguments were not respected (:issue:`16209`)
7576

pandas/core/dtypes/inference.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -171,7 +171,7 @@ def is_file_like(obj):
171171
if not (hasattr(obj, 'read') or hasattr(obj, 'write')):
172172
return False
173173

174-
if not is_iterator(obj):
174+
if not hasattr(obj, "__iter__"):
175175
return False
176176

177177
return True

pandas/io/parsers.py

+24-5
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
import numpy as np
1414

1515
from pandas import compat
16-
from pandas.compat import (range, lrange, StringIO, lzip,
16+
from pandas.compat import (range, lrange, PY3, StringIO, lzip,
1717
zip, string_types, map, u)
1818
from pandas.core.dtypes.common import (
1919
is_integer, _ensure_object,
@@ -31,10 +31,10 @@
3131
from pandas.core.common import AbstractMethodError
3232
from pandas.io.date_converters import generic_parser
3333
from pandas.errors import ParserWarning, ParserError, EmptyDataError
34-
from pandas.io.common import (get_filepath_or_buffer, _validate_header_arg,
35-
_get_handle, UnicodeReader, UTF8Recoder,
36-
BaseIterator,
37-
_NA_VALUES, _infer_compression)
34+
from pandas.io.common import (get_filepath_or_buffer, is_file_like,
35+
_validate_header_arg, _get_handle,
36+
UnicodeReader, UTF8Recoder, _NA_VALUES,
37+
BaseIterator, _infer_compression)
3838
from pandas.core.tools import datetimes as tools
3939

4040
from pandas.util._decorators import Appender
@@ -755,7 +755,9 @@ def __init__(self, f, engine=None, **kwds):
755755
self.squeeze = options.pop('squeeze', False)
756756

757757
# might mutate self.engine
758+
self.engine = self._check_file_or_buffer(f, engine)
758759
self.options, self.engine = self._clean_options(options, engine)
760+
759761
if 'has_index_names' in kwds:
760762
self.options['has_index_names'] = kwds['has_index_names']
761763

@@ -801,6 +803,23 @@ def _get_options_with_defaults(self, engine):
801803

802804
return options
803805

806+
def _check_file_or_buffer(self, f, engine):
807+
# see gh-16530
808+
if is_file_like(f):
809+
next_attr = "__next__" if PY3 else "next"
810+
811+
# The C engine doesn't need the file-like to have the "next" or
812+
# "__next__" attribute. However, the Python engine explicitly calls
813+
# "next(...)" when iterating through such an object, meaning it
814+
# needs to have that attribute ("next" for Python 2.x, "__next__"
815+
# for Python 3.x)
816+
if engine != "c" and not hasattr(f, next_attr):
817+
msg = ("The 'python' engine cannot iterate "
818+
"through this file buffer.")
819+
raise ValueError(msg)
820+
821+
return engine
822+
804823
def _clean_options(self, options, engine):
805824
result = options.copy()
806825

pandas/tests/dtypes/test_inference.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -120,9 +120,9 @@ class MockFile(object):
120120
m = MockFile()
121121
assert not is_file(m)
122122

123+
# gh-16530: Valid iterator just means we have the
124+
# __iter__ attribute for our purposes.
123125
MockFile.__iter__ = lambda self: self
124-
MockFile.__next__ = lambda self: 0
125-
MockFile.next = MockFile.__next__
126126

127127
# Valid write-only file
128128
m = MockFile()

pandas/tests/io/parser/c_parser_only.py

+36
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,9 @@
77
further arguments when parsing.
88
"""
99

10+
import os
1011
import sys
12+
import tarfile
1113

1214
import pytest
1315
import numpy as np
@@ -446,3 +448,37 @@ def test_comment_whitespace_delimited(self):
446448
[7, np.nan],
447449
[8, np.nan]])
448450
tm.assert_frame_equal(df, expected)
451+
452+
def test_file_like_no_next(self):
453+
# gh-16530: the file-like need not have a "next" or "__next__"
454+
# attribute despite having an "__iter__" attribute.
455+
#
456+
# NOTE: This is only true for the C engine, not Python engine.
457+
class NoNextBuffer(StringIO):
458+
def __next__(self):
459+
raise AttributeError("No next method")
460+
461+
next = __next__
462+
463+
data = "a\n1"
464+
465+
expected = pd.DataFrame({"a": [1]})
466+
result = self.read_csv(NoNextBuffer(data))
467+
468+
tm.assert_frame_equal(result, expected)
469+
470+
@pytest.mark.parametrize("tar_suffix", [".tar", ".tar.gz"])
471+
def test_read_tarfile(self, tar_suffix):
472+
# see gh-16530
473+
#
474+
# Unfortunately, Python's CSV library can't handle
475+
# tarfile objects (expects string, not bytes when
476+
# iterating through a file-like).
477+
tar_path = os.path.join(self.dirpath, "tar_csv" + tar_suffix)
478+
479+
tar = tarfile.open(tar_path, "r")
480+
data_file = tar.extractfile("tar_data.csv")
481+
482+
out = self.read_csv(data_file)
483+
expected = pd.DataFrame({"a": [1]})
484+
tm.assert_frame_equal(out, expected)
10 KB
Binary file not shown.
10 KB
Binary file not shown.

pandas/tests/io/parser/test_unsupported.py

+32-9
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,13 @@
1616
from pandas.errors import ParserError
1717
from pandas.io.parsers import read_csv, read_table
1818

19+
import pytest
20+
21+
22+
@pytest.fixture(params=["python", "python-fwf"], ids=lambda val: val)
23+
def python_engine(request):
24+
return request.param
25+
1926

2027
class TestUnsupportedFeatures(object):
2128

@@ -82,24 +89,40 @@ def test_c_engine(self):
8289
with tm.assert_raises_regex(ValueError, msg):
8390
read_csv(StringIO(data), lineterminator='~~')
8491

85-
def test_python_engine(self):
92+
def test_python_engine(self, python_engine):
8693
from pandas.io.parsers import _python_unsupported as py_unsupported
8794

8895
data = """1,2,3,,
8996
1,2,3,4,
9097
1,2,3,4,5
9198
1,2,,,
9299
1,2,3,4,"""
93-
engines = 'python', 'python-fwf'
94100

95-
for engine in engines:
96-
for default in py_unsupported:
97-
msg = ('The %r option is not supported '
98-
'with the %r engine' % (default, engine))
101+
for default in py_unsupported:
102+
msg = ('The %r option is not supported '
103+
'with the %r engine' % (default, python_engine))
104+
105+
kwargs = {default: object()}
106+
with tm.assert_raises_regex(ValueError, msg):
107+
read_csv(StringIO(data), engine=python_engine, **kwargs)
99108

100-
kwargs = {default: object()}
101-
with tm.assert_raises_regex(ValueError, msg):
102-
read_csv(StringIO(data), engine=engine, **kwargs)
109+
def test_python_engine_file_no_next(self, python_engine):
110+
# see gh-16530
111+
class NoNextBuffer(object):
112+
def __init__(self, csv_data):
113+
self.data = csv_data
114+
115+
def __iter__(self):
116+
return self
117+
118+
def read(self):
119+
return self.data
120+
121+
data = "a\n1"
122+
msg = "The 'python' engine cannot iterate"
123+
124+
with tm.assert_raises_regex(ValueError, msg):
125+
read_csv(NoNextBuffer(data), engine=python_engine)
103126

104127

105128
class TestDeprecatedFeatures(object):

setup.py

+2
Original file line numberDiff line numberDiff line change
@@ -702,6 +702,8 @@ def pxd(name):
702702
'parser/data/*.gz',
703703
'parser/data/*.bz2',
704704
'parser/data/*.txt',
705+
'parser/data/*.tar',
706+
'parser/data/*.tar.gz',
705707
'sas/data/*.csv',
706708
'sas/data/*.xpt',
707709
'sas/data/*.sas7bdat',

0 commit comments

Comments
 (0)