ENH: Add file buffer validation to I/O ops #15894

Closed
wants to merge 2 commits into from
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.20.0.txt
@@ -1033,6 +1033,7 @@ I/O
- Bug in ``pd.read_csv()`` with ``parse_dates`` when multiline headers are specified (:issue:`15376`)
- Bug in ``pd.read_csv()`` with ``float_precision='round_trip'`` which caused a segfault when a text entry is parsed (:issue:`15140`)
- Bug in ``pd.read_csv()`` when an index was specified and no values were specified as null values (:issue:`15835`)
- Bug in ``pd.read_csv()`` in which certain invalid file objects caused the Python interpreter to crash (:issue:`15337`)
- Added checks in ``pd.read_csv()`` ensuring that values for ``nrows`` and ``chunksize`` are valid (:issue:`15767`)
- Bug in ``pd.tools.hashing.hash_pandas_object()`` in which hashing of categoricals depended on the ordering of categories, instead of just their values. (:issue:`15143`)
- Bug in ``.to_json()`` where ``lines=True`` and contents (keys or values) contain escaped characters (:issue:`15096`)
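For context on the gh-15337 entry above, which is the headline change in this PR: handing pd.read_csv() an object that is neither a path nor file-like now raises a clear ValueError instead of crashing the interpreter. A minimal sketch of the new behaviour, mirroring the test_invalid_file_buffer test added below (the class name is illustrative):

    import pandas as pd

    class NotABuffer(object):
        pass

    try:
        pd.read_csv(NotABuffer())
    except ValueError as err:
        # e.g. "Invalid file path or buffer object type: <class '...NotABuffer'>"
        print(err)
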
23 changes: 20 additions & 3 deletions pandas/io/common.py
@@ -10,7 +10,7 @@
from pandas import compat
from pandas.formats.printing import pprint_thing
from pandas.core.common import AbstractMethodError
from pandas.types.common import is_number
from pandas.types.common import is_number, is_file_like

# compat
from pandas.errors import (ParserError, DtypeWarning, # noqa
@@ -197,9 +197,19 @@ def get_filepath_or_buffer(filepath_or_buffer, encoding=None,
encoding=encoding,
compression=compression)

# It is a pathlib.Path/py.path.local or string
# Convert pathlib.Path/py.path.local or string
filepath_or_buffer = _stringify_path(filepath_or_buffer)
return _expand_user(filepath_or_buffer), None, compression

if isinstance(filepath_or_buffer, (compat.string_types,
compat.binary_type,
mmap.mmap)):
return _expand_user(filepath_or_buffer), None, compression

if not is_file_like(filepath_or_buffer):
msg = "Invalid file path or buffer object type: {_type}"
raise ValueError(msg.format(_type=type(filepath_or_buffer)))

return filepath_or_buffer, None, compression


def file_path_to_url(path):
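
A rough sketch of how the revised dispatch in get_filepath_or_buffer() behaves on this branch (it is a private helper, so the import path and three-element return value follow the diff above and may differ in other versions; the path below does not need to exist):

    from io import StringIO
    from pandas.io.common import get_filepath_or_buffer

    # Strings, bytes and mmap objects still take the path-expansion branch.
    path, _, compression = get_filepath_or_buffer("~/data.csv")

    # Objects that pass is_file_like() (read/write plus iteration support)
    # are returned untouched for the parsers to consume.
    buf, _, _ = get_filepath_or_buffer(StringIO(u"a,b\n1,2"))

    # Anything else is rejected with the ValueError built from the message above:
    #   get_filepath_or_buffer(object())
    #   ValueError: Invalid file path or buffer object type: <class 'object'>
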
@@ -416,6 +426,9 @@ def __init__(self, f):
def __getattr__(self, name):
return getattr(self.mmap, name)

def __iter__(self):
return self

def __next__(self):
newline = self.mmap.readline()

@@ -433,6 +446,10 @@ def __next__(self):
return newline


if not compat.PY3:
MMapWrapper.next = lambda self: self.__next__()


class UTF8Recoder(BaseIterator):

"""
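The __iter__ addition above (together with the Python 2 next alias) lets MMapWrapper be consumed by plain for-loops and csv-style readers on both Python versions. A small usage sketch, assuming MMapWrapper keeps mmapping the open file handle passed to its constructor:

    from __future__ import print_function

    import tempfile
    from pandas.io.common import MMapWrapper

    # Write a small, non-empty file so mmap has something to map.
    with tempfile.NamedTemporaryFile(mode="w", suffix=".csv", delete=False) as tmp:
        tmp.write("a,b\n1,2\n")

    with open(tmp.name, "r") as f:
        wrapper = MMapWrapper(f)
        # __iter__ returns the wrapper itself; __next__/next yield one line
        # at a time and raise StopIteration at end-of-file.
        for line in wrapper:
            print(line, end="")
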
5 changes: 2 additions & 3 deletions pandas/io/excel.py
@@ -243,9 +243,8 @@ def __init__(self, io, **kwds):
# to get_filepath_or_buffer()
if _is_url(io):
io = _urlopen(io)
# Deal with S3 urls, path objects, etc. Will convert them to
# buffer or path string
io, _, _ = get_filepath_or_buffer(io)
elif not isinstance(io, (ExcelFile, xlrd.Book)):
io, _, _ = get_filepath_or_buffer(io)

if engine == 'xlrd' and isinstance(io, xlrd.Book):
self.book = io
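With this change, ExcelFile only bypasses get_filepath_or_buffer() for inputs that are already an ExcelFile or an xlrd.Book; paths, URLs and open handles are normalized through it, so invalid objects are rejected with the same ValueError as in read_csv. A hedged sketch of the expected behaviour (assuming nothing earlier in __init__ intercepts the object):

    import pandas as pd

    # Paths and open handles are routed through get_filepath_or_buffer(), e.g.
    #   pd.ExcelFile("report.xlsx")  or  pd.ExcelFile(open("report.xlsx", "rb"))
    # Objects that are neither path-like nor file-like now fail fast:
    try:
        pd.ExcelFile(object())
    except ValueError as err:
        print(err)  # Invalid file path or buffer object type: <class 'object'>
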
2 changes: 1 addition & 1 deletion pandas/tests/api/test_types.py
@@ -24,7 +24,7 @@ class TestTypes(Base, tm.TestCase):
'is_timedelta64_dtype', 'is_timedelta64_ns_dtype',
'is_unsigned_integer_dtype', 'is_period',
'is_period_dtype', 'is_re', 'is_re_compilable',
'is_dict_like', 'is_iterator',
'is_dict_like', 'is_iterator', 'is_file_like',
'is_list_like', 'is_hashable',
'is_named_tuple', 'is_sequence',
'pandas_dtype']
17 changes: 17 additions & 0 deletions pandas/tests/io/parser/common.py
@@ -1678,3 +1678,20 @@ def test_file_handles(self):
if PY3:
self.assertFalse(m.closed)
m.close()

def test_invalid_file_buffer(self):
# see gh-15337

class InvalidBuffer(object):
pass

msg = "Invalid file path or buffer object type"

with tm.assertRaisesRegexp(ValueError, msg):
self.read_csv(InvalidBuffer())

if PY3:
from unittest import mock

with tm.assertRaisesRegexp(ValueError, msg):
self.read_csv(mock.Mock())
16 changes: 15 additions & 1 deletion pandas/tests/types/test_inference.py
@@ -17,7 +17,7 @@
from pandas import (Series, Index, DataFrame, Timedelta,
DatetimeIndex, TimedeltaIndex, Timestamp,
Panel, Period, Categorical)
from pandas.compat import u, PY2, lrange
from pandas.compat import u, PY2, PY3, StringIO, lrange
from pandas.types import inference
from pandas.types.common import (is_timedelta64_dtype,
is_timedelta64_ns_dtype,
@@ -78,6 +78,20 @@ def test_is_dict_like():
assert not inference.is_dict_like(f)


def test_is_file_like():
is_file = inference.is_file_like

data = StringIO("data")
assert is_file(data)

data = [1, 2, 3]
assert not is_file(data)

if PY3:
from unittest import mock
assert not is_file(mock.Mock())


def test_is_named_tuple():
passes = (collections.namedtuple('Test', list('abc'))(1, 2, 3), )
fails = ((1, 2, 3), 'a', Series({'pi': 3.14}))
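The test_is_file_like test above pins down what the new check accepts: an object needs a read or write attribute plus iteration support, which is why a plain list (no read/write) and a bare mock.Mock() (which auto-creates attributes such as read but not dunder methods like __iter__) are both rejected. A rough, illustrative approximation of the check, not the exact implementation:

    from io import StringIO

    def looks_file_like(obj):
        # Must expose at least one I/O method...
        if not (hasattr(obj, "read") or hasattr(obj, "write")):
            return False
        # ...and be usable in iteration contexts, as the CSV readers expect.
        return hasattr(obj, "__iter__")

    assert looks_file_like(StringIO(u"data"))
    assert not looks_file_like([1, 2, 3])
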
1 change: 1 addition & 0 deletions pandas/types/api.py
@@ -52,6 +52,7 @@
is_re_compilable,
is_dict_like,
is_iterator,
is_file_like,
is_list_like,
is_hashable,
is_named_tuple,