Skip to content

Commit 81103f7

Browse files
committed
ENH: Add file buffer validation to I/O ops
Closes gh-15337.
1 parent 0a37067 commit 81103f7

File tree

8 files changed

+80
-11
lines changed

8 files changed

+80
-11
lines changed

doc/source/whatsnew/v0.20.0.txt

+1
Original file line numberDiff line numberDiff line change
@@ -1033,6 +1033,7 @@ I/O
10331033
- Bug in ``pd.read_csv()`` with ``parse_dates`` when multiline headers are specified (:issue:`15376`)
10341034
- Bug in ``pd.read_csv()`` with ``float_precision='round_trip'`` which caused a segfault when a text entry is parsed (:issue:`15140`)
10351035
- Bug in ``pd.read_csv()`` when an index was specified and no values were specified as null values (:issue:`15835`)
1036+
- Bug in ``pd.read_csv()`` in which certain invalid file objects caused the Python interpreter to crash (:issue:`15337`)
10361037
- Added checks in ``pd.read_csv()`` ensuring that values for ``nrows`` and ``chunksize`` are valid (:issue:`15767`)
10371038
- Bug in ``pd.tools.hashing.hash_pandas_object()`` in which hashing of categoricals depended on the ordering of categories, instead of just their values. (:issue:`15143`)
10381039
- Bug in ``.to_json()`` where ``lines=True`` and contents (keys or values) contain escaped characters (:issue:`15096`)

pandas/io/common.py

+20-3
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
from pandas import compat
1111
from pandas.formats.printing import pprint_thing
1212
from pandas.core.common import AbstractMethodError
13-
from pandas.types.common import is_number
13+
from pandas.types.common import is_number, is_file_like
1414

1515
# compat
1616
from pandas.errors import (ParserError, DtypeWarning, # noqa
@@ -197,9 +197,19 @@ def get_filepath_or_buffer(filepath_or_buffer, encoding=None,
197197
encoding=encoding,
198198
compression=compression)
199199

200-
# It is a pathlib.Path/py.path.local or string
200+
# Convert pathlib.Path/py.path.local or string
201201
filepath_or_buffer = _stringify_path(filepath_or_buffer)
202-
return _expand_user(filepath_or_buffer), None, compression
202+
203+
if isinstance(filepath_or_buffer, (compat.string_types,
204+
compat.binary_type,
205+
mmap.mmap)):
206+
return _expand_user(filepath_or_buffer), None, compression
207+
208+
if not is_file_like(filepath_or_buffer):
209+
msg = "Invalid file path or buffer object type: {_type}"
210+
raise ValueError(msg.format(_type=type(filepath_or_buffer)))
211+
212+
return filepath_or_buffer, None, compression
203213

204214

205215
def file_path_to_url(path):
@@ -416,6 +426,9 @@ def __init__(self, f):
416426
def __getattr__(self, name):
417427
return getattr(self.mmap, name)
418428

429+
def __iter__(self):
430+
return self
431+
419432
def __next__(self):
420433
newline = self.mmap.readline()
421434

@@ -433,6 +446,10 @@ def __next__(self):
433446
return newline
434447

435448

449+
if not compat.PY3:
450+
MMapWrapper.next = lambda self: self.__next__()
451+
452+
436453
class UTF8Recoder(BaseIterator):
437454

438455
"""

pandas/io/excel.py

+2-3
Original file line numberDiff line numberDiff line change
@@ -243,9 +243,8 @@ def __init__(self, io, **kwds):
243243
# to get_filepath_or_buffer()
244244
if _is_url(io):
245245
io = _urlopen(io)
246-
# Deal with S3 urls, path objects, etc. Will convert them to
247-
# buffer or path string
248-
io, _, _ = get_filepath_or_buffer(io)
246+
elif not isinstance(io, (ExcelFile, xlrd.Book)):
247+
io, _, _ = get_filepath_or_buffer(io)
249248

250249
if engine == 'xlrd' and isinstance(io, xlrd.Book):
251250
self.book = io

pandas/tests/api/test_types.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ class TestTypes(Base, tm.TestCase):
2424
'is_timedelta64_dtype', 'is_timedelta64_ns_dtype',
2525
'is_unsigned_integer_dtype', 'is_period',
2626
'is_period_dtype', 'is_re', 'is_re_compilable',
27-
'is_dict_like', 'is_iterator',
27+
'is_dict_like', 'is_iterator', 'is_file_like',
2828
'is_list_like', 'is_hashable',
2929
'is_named_tuple', 'is_sequence',
3030
'pandas_dtype']

pandas/tests/io/parser/common.py

+17
Original file line numberDiff line numberDiff line change
@@ -1678,3 +1678,20 @@ def test_file_handles(self):
16781678
if PY3:
16791679
self.assertFalse(m.closed)
16801680
m.close()
1681+
1682+
def test_invalid_file_buffer(self):
    """Check that ``read_csv`` rejects objects that are not valid buffers.

    Each invalid input must raise a ``ValueError`` whose message matches
    "Invalid file path or buffer object type".
    """
    # see gh-15337

    class InvalidBuffer(object):
        pass

    expected = "Invalid file path or buffer object type"

    # Collect every invalid input first, then run the same check on each.
    bad_inputs = [InvalidBuffer()]

    if PY3:
        # ``unittest.mock`` is only imported on Python 3.
        from unittest import mock

        bad_inputs.append(mock.Mock())

    for candidate in bad_inputs:
        with tm.assertRaisesRegexp(ValueError, expected):
            self.read_csv(candidate)

pandas/tests/types/test_inference.py

+15-1
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
from pandas import (Series, Index, DataFrame, Timedelta,
1818
DatetimeIndex, TimedeltaIndex, Timestamp,
1919
Panel, Period, Categorical)
20-
from pandas.compat import u, PY2, lrange
20+
from pandas.compat import u, PY2, PY3, StringIO, lrange
2121
from pandas.types import inference
2222
from pandas.types.common import (is_timedelta64_dtype,
2323
is_timedelta64_ns_dtype,
@@ -78,6 +78,20 @@ def test_is_dict_like():
7878
assert not inference.is_dict_like(f)
7979

8080

81+
def test_is_file_like():
    """Exercise ``inference.is_file_like`` on a buffer, a list and a mock."""
    check = inference.is_file_like

    # An in-memory text buffer must be recognised as file-like.
    buf = StringIO("data")
    assert check(buf)

    # A plain list must be rejected.
    values = [1, 2, 3]
    assert not check(values)

    if PY3:
        # ``unittest.mock`` is only imported on Python 3; a bare Mock
        # must not be mistaken for a file.
        from unittest import mock

        fake = mock.Mock()
        assert not check(fake)
93+
94+
8195
def test_is_named_tuple():
8296
passes = (collections.namedtuple('Test', list('abc'))(1, 2, 3), )
8397
fails = ((1, 2, 3), 'a', Series({'pi': 3.14}))

pandas/types/api.py

+1
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,7 @@
5252
is_re_compilable,
5353
is_dict_like,
5454
is_iterator,
55+
is_file_like,
5556
is_list_like,
5657
is_hashable,
5758
is_named_tuple,

pandas/types/inference.py

+23-3
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
import re
55
import numpy as np
66
from numbers import Number
7-
from pandas.compat import (string_types, text_type,
7+
from pandas.compat import (PY2, string_types, text_type,
88
string_and_binary_types)
99
from pandas._libs import lib
1010

@@ -35,8 +35,28 @@ def _iterable_not_string(x):
3535

3636

3737
def is_iterator(obj):
    """
    Check whether the object looks like an iterator.

    Parameters
    ----------
    obj : object
        The object to inspect.

    Returns
    -------
    bool
        True when ``obj`` has an ``__iter__`` attribute and also the
        advance method of the running Python major version (``next`` on
        Python 2, ``__next__`` on Python 3); False otherwise.
    """
    if hasattr(obj, '__iter__'):
        # Python 3 generators have __next__ instead of next
        advance = 'next' if PY2 else '__next__'
        return hasattr(obj, advance)

    return False
47+
48+
49+
def is_file_like(obj):
    """
    Check whether the object is a file-like object.

    An object is treated as file-like when it has a ``read`` and/or a
    ``write`` attribute and also passes ``is_iterator``. Only one of
    ``read`` / ``write`` is required so that read-only and write-only
    buffers are accepted; ``seek`` and ``tell`` are not required.

    Parameters
    ----------
    obj : object
        The object to inspect.

    Returns
    -------
    bool
        True if ``obj`` has a ``read`` or ``write`` attribute and is an
        iterator, False otherwise.
    """
    # Demanding read, write, seek AND tell rejects legitimate one-way
    # buffers, so a single I/O method is enough.
    if not (hasattr(obj, 'read') or hasattr(obj, 'write')):
        return False

    # The iterator check still filters out objects that merely happen to
    # carry a ``read``/``write`` attribute but cannot be iterated.
    return is_iterator(obj)
4060

4161

4262
def is_re(obj):

0 commit comments

Comments
 (0)