diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 2e1cc396287ce..cbb4d32cc5edb 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -1033,6 +1033,7 @@ I/O - Bug in ``pd.read_csv()`` with ``parse_dates`` when multiline headers are specified (:issue:`15376`) - Bug in ``pd.read_csv()`` with ``float_precision='round_trip'`` which caused a segfault when a text entry is parsed (:issue:`15140`) - Bug in ``pd.read_csv()`` when an index was specified and no values were specified as null values (:issue:`15835`) +- Bug in ``pd.read_csv()`` in which certain invalid file objects caused the Python interpreter to crash (:issue:`15337`) - Added checks in ``pd.read_csv()`` ensuring that values for ``nrows`` and ``chunksize`` are valid (:issue:`15767`) - Bug in ``pd.tools.hashing.hash_pandas_object()`` in which hashing of categoricals depended on the ordering of categories, instead of just their values. (:issue:`15143`) - Bug in ``.to_json()`` where ``lines=True`` and contents (keys or values) contain escaped characters (:issue:`15096`) diff --git a/pandas/io/common.py b/pandas/io/common.py index 8bc7217db87f9..8ee6ded67f790 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -10,7 +10,7 @@ from pandas import compat from pandas.formats.printing import pprint_thing from pandas.core.common import AbstractMethodError -from pandas.types.common import is_number +from pandas.types.common import is_number, is_file_like # compat from pandas.errors import (ParserError, DtypeWarning, # noqa @@ -197,9 +197,19 @@ def get_filepath_or_buffer(filepath_or_buffer, encoding=None, encoding=encoding, compression=compression) - # It is a pathlib.Path/py.path.local or string + # Convert pathlib.Path/py.path.local or string filepath_or_buffer = _stringify_path(filepath_or_buffer) - return _expand_user(filepath_or_buffer), None, compression + + if isinstance(filepath_or_buffer, (compat.string_types, + compat.binary_type, + 
mmap.mmap)): + return _expand_user(filepath_or_buffer), None, compression + + if not is_file_like(filepath_or_buffer): + msg = "Invalid file path or buffer object type: {_type}" + raise ValueError(msg.format(_type=type(filepath_or_buffer))) + + return filepath_or_buffer, None, compression def file_path_to_url(path): @@ -416,6 +426,9 @@ def __init__(self, f): def __getattr__(self, name): return getattr(self.mmap, name) + def __iter__(self): + return self + def __next__(self): newline = self.mmap.readline() @@ -433,6 +446,10 @@ def __next__(self): return newline +if not compat.PY3: + MMapWrapper.next = lambda self: self.__next__() + + class UTF8Recoder(BaseIterator): """ diff --git a/pandas/io/excel.py b/pandas/io/excel.py index 6d136869fc73f..737141f11d7d1 100644 --- a/pandas/io/excel.py +++ b/pandas/io/excel.py @@ -243,9 +243,8 @@ def __init__(self, io, **kwds): # to get_filepath_or_buffer() if _is_url(io): io = _urlopen(io) - # Deal with S3 urls, path objects, etc. Will convert them to - # buffer or path string - io, _, _ = get_filepath_or_buffer(io) + elif not isinstance(io, (ExcelFile, xlrd.Book)): + io, _, _ = get_filepath_or_buffer(io) if engine == 'xlrd' and isinstance(io, xlrd.Book): self.book = io diff --git a/pandas/tests/api/test_types.py b/pandas/tests/api/test_types.py index 686de4a196034..f3fd6332417a1 100644 --- a/pandas/tests/api/test_types.py +++ b/pandas/tests/api/test_types.py @@ -24,7 +24,7 @@ class TestTypes(Base, tm.TestCase): 'is_timedelta64_dtype', 'is_timedelta64_ns_dtype', 'is_unsigned_integer_dtype', 'is_period', 'is_period_dtype', 'is_re', 'is_re_compilable', - 'is_dict_like', 'is_iterator', + 'is_dict_like', 'is_iterator', 'is_file_like', 'is_list_like', 'is_hashable', 'is_named_tuple', 'is_sequence', 'pandas_dtype'] diff --git a/pandas/tests/io/parser/common.py b/pandas/tests/io/parser/common.py index 7faf485b65d10..36d5f2dd5274b 100644 --- a/pandas/tests/io/parser/common.py +++ b/pandas/tests/io/parser/common.py @@ -1678,3 +1678,20 @@ 
def test_file_handles(self): if PY3: self.assertFalse(m.closed) m.close() + + def test_invalid_file_buffer(self): + # see gh-15337 + + class InvalidBuffer(object): + pass + + msg = "Invalid file path or buffer object type" + + with tm.assertRaisesRegexp(ValueError, msg): + self.read_csv(InvalidBuffer()) + + if PY3: + from unittest import mock + + with tm.assertRaisesRegexp(ValueError, msg): + self.read_csv(mock.Mock()) diff --git a/pandas/tests/types/test_inference.py b/pandas/tests/types/test_inference.py index b41df0da45234..de3a2ca35a7f5 100644 --- a/pandas/tests/types/test_inference.py +++ b/pandas/tests/types/test_inference.py @@ -17,7 +17,7 @@ from pandas import (Series, Index, DataFrame, Timedelta, DatetimeIndex, TimedeltaIndex, Timestamp, Panel, Period, Categorical) -from pandas.compat import u, PY2, lrange +from pandas.compat import u, PY2, PY3, StringIO, lrange from pandas.types import inference from pandas.types.common import (is_timedelta64_dtype, is_timedelta64_ns_dtype, @@ -78,6 +78,20 @@ def test_is_dict_like(): assert not inference.is_dict_like(f) +def test_is_file_like(): + is_file = inference.is_file_like + + data = StringIO("data") + assert is_file(data) + + data = [1, 2, 3] + assert not is_file(data) + + if PY3: + from unittest import mock + assert not is_file(mock.Mock()) + + def test_is_named_tuple(): passes = (collections.namedtuple('Test', list('abc'))(1, 2, 3), ) fails = ((1, 2, 3), 'a', Series({'pi': 3.14})) diff --git a/pandas/types/api.py b/pandas/types/api.py index c809cb3614a8c..e78514ce77822 100644 --- a/pandas/types/api.py +++ b/pandas/types/api.py @@ -52,6 +52,7 @@ is_re_compilable, is_dict_like, is_iterator, + is_file_like, is_list_like, is_hashable, is_named_tuple, diff --git a/pandas/types/inference.py b/pandas/types/inference.py index d8e3b3ee7329b..91418677c6b19 100644 --- a/pandas/types/inference.py +++ b/pandas/types/inference.py @@ -4,7 +4,7 @@ import re import numpy as np from numbers import Number -from pandas.compat 
import (string_types, text_type, +from pandas.compat import (PY2, string_types, text_type, string_and_binary_types) from pandas._libs import lib @@ -22,28 +22,211 @@ def is_number(obj): + """ + Check if the object is a number. + + Parameters + ---------- + obj : The object to check. + + Returns + ------- + is_number : bool + Whether `obj` is a number or not. + + Examples + -------- + >>> is_number(1) + True + >>> is_number("foo") + False + """ + return isinstance(obj, (Number, np.number)) def is_string_like(obj): + """ + Check if the object is a string. + + Parameters + ---------- + obj : The object to check. + + Examples + -------- + >>> is_string_like("foo") + True + >>> is_string_like(1) + False + + Returns + ------- + is_str_like : bool + Whether `obj` is a string or not. + """ + return isinstance(obj, (text_type, string_types)) -def _iterable_not_string(x): - return (isinstance(x, collections.Iterable) and - not isinstance(x, string_types)) +def _iterable_not_string(obj): + """ + Check if the object is an iterable but not a string. + + Parameters + ---------- + obj : The object to check. + + Returns + ------- + is_iter_not_string : bool + Whether `obj` is a non-string iterable. + + Examples + -------- + >>> _iterable_not_string([1, 2, 3]) + True + >>> _iterable_not_string("foo") + False + >>> _iterable_not_string(1) + False + """ + + return (isinstance(obj, collections.Iterable) and + not isinstance(obj, string_types)) def is_iterator(obj): - # python 3 generators have __next__ instead of next - return hasattr(obj, 'next') or hasattr(obj, '__next__') + """ + Check if the object is an iterator. + + For example, generators are considered iterators + but not strings or datetime objects. + + Parameters + ---------- + obj : The object to check. + + Returns + ------- + is_iter : bool + Whether `obj` is an iterator. 
+ + Examples + -------- + >>> is_iterator(iter([1, 2, 3])) + True + >>> is_iterator(datetime(2017, 1, 1)) + False + >>> is_iterator("foo") + False + >>> is_iterator(1) + False + """ + + if not hasattr(obj, '__iter__'): + return False + + if PY2: + return hasattr(obj, 'next') + else: + # Python 3 generators have + # __next__ instead of next + return hasattr(obj, '__next__') + + +def is_file_like(obj): + """ + Check if the object is a file-like object. + + For objects to be considered file-like, they must + be an iterator AND have the following four methods: + + 1) read + 2) write + 3) seek + 4) tell + + Note: file-like objects must be iterable, but + iterable objects need not be file-like. + + .. versionadded:: 0.20.0 + + Parameters + ---------- + obj : The object to check. + + Returns + ------- + is_file_like : bool + Whether `obj` has file-like properties. + + Examples + -------- + >>> buffer = StringIO("data") + >>> is_file_like(buffer) + True + >>> is_file_like([1, 2, 3]) + False + """ + + file_attrs = ('read', 'write', 'seek', 'tell') + + for attr in file_attrs: + if not hasattr(obj, attr): + return False + + if not is_iterator(obj): + return False + + return True def is_re(obj): + """ + Check if the object is a regex pattern instance. + + Parameters + ---------- + obj : The object to check. + + Returns + ------- + is_regex : bool + Whether `obj` is a regex pattern. + + Examples + -------- + >>> is_re(re.compile(".*")) + True + >>> is_re("foo") + False + """ + return isinstance(obj, re._pattern_type) def is_re_compilable(obj): + """ + Check if the object can be compiled into a regex pattern instance. + + Parameters + ---------- + obj : The object to check. + + Returns + ------- + is_regex_compilable : bool + Whether `obj` can be compiled as a regex pattern. 
+ + Examples + -------- + >>> is_re_compilable(".*") + True + >>> is_re_compilable(1) + False + """ + try: re.compile(obj) except TypeError: @@ -52,21 +235,95 @@ def is_re_compilable(obj): return True -def is_list_like(arg): - return (hasattr(arg, '__iter__') and - not isinstance(arg, string_and_binary_types)) +def is_list_like(obj): + """ + Check if the object is list-like. + + Objects that are considered list-like are for example Python + lists, tuples, sets, NumPy arrays, and Pandas Series. + + Strings and datetime objects, however, are not considered list-like. + + Parameters + ---------- + obj : The object to check. + + Returns + ------- + is_list_like : bool + Whether `obj` has list-like properties. + + Examples + -------- + >>> is_list_like([1, 2, 3]) + True + >>> is_list_like({1, 2, 3}) + True + >>> is_list_like(datetime(2017, 1, 1)) + False + >>> is_list_like("foo") + False + >>> is_list_like(1) + False + """ + + return (hasattr(obj, '__iter__') and + not isinstance(obj, string_and_binary_types)) + +def is_dict_like(obj): + """ + Check if the object is dict-like. -def is_dict_like(arg): - return hasattr(arg, '__getitem__') and hasattr(arg, 'keys') + Parameters + ---------- + obj : The object to check. + Returns + ------- + is_dict_like : bool + Whether `obj` has dict-like properties. -def is_named_tuple(arg): - return isinstance(arg, tuple) and hasattr(arg, '_fields') + Examples + -------- + >>> is_dict_like({1: 2}) + True + >>> is_dict_like([1, 2, 3]) + False + """ + + return hasattr(obj, '__getitem__') and hasattr(obj, 'keys') + + +def is_named_tuple(obj): + """ + Check if the object is a named tuple. + Parameters + ---------- + obj : The object to check. -def is_hashable(arg): - """Return True if hash(arg) will succeed, False otherwise. + Returns + ------- + is_named_tuple : bool + Whether `obj` is a named tuple. 
+ + Examples + -------- + >>> Point = namedtuple("Point", ["x", "y"]) + >>> p = Point(1, 2) + >>> + >>> is_named_tuple(p) + True + >>> is_named_tuple((1, 2)) + False + """ + + return isinstance(obj, tuple) and hasattr(obj, '_fields') + + +def is_hashable(obj): + """Return True if hash(obj) will succeed, False otherwise. Some types will pass a test against collections.Hashable but fail when they are actually hashed with hash(). @@ -82,25 +339,48 @@ def is_hashable(arg): >>> is_hashable(a) False """ - # unfortunately, we can't use isinstance(arg, collections.Hashable), which - # can be faster than calling hash, because numpy scalars on Python 3 fail - # this test + # Unfortunately, we can't use isinstance(obj, collections.Hashable), which + # can be faster than calling hash. That is because numpy scalars on Python + # 3 fail this test. - # reconsider this decision once this numpy bug is fixed: + # Reconsider this decision once this numpy bug is fixed: # https://github.com/numpy/numpy/issues/5562 try: - hash(arg) + hash(obj) except TypeError: return False else: return True -def is_sequence(x): +def is_sequence(obj): + """ + Check if the object is a sequence of objects. + String types are not included as sequences here. + + Parameters + ---------- + obj : The object to check. + + Returns + ------- + is_sequence : bool + Whether `obj` is a sequence of objects. + + Examples + -------- + >>> l = [1, 2, 3] + >>> + >>> is_sequence(l) + True + >>> is_sequence(iter(l)) + False + """ + try: - iter(x) - len(x) # it has a length - return not isinstance(x, string_and_binary_types) + iter(obj) # Can iterate over it. + len(obj) # Has a length associated with it. + return not isinstance(obj, string_and_binary_types) except (TypeError, AttributeError): return False