ENH: Add file buffer validation to I/O ops

gfyoung · gfyoung · commit 81103f75f34e · 2017-04-05T12:08:06.000-04:00
Closes gh-15337.
diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
@@ -1033,6 +1033,7 @@ I/O
 - Bug in ``pd.read_csv()`` with ``parse_dates`` when multiline headers are specified (:issue:`15376`)
 - Bug in ``pd.read_csv()`` with ``float_precision='round_trip'`` which caused a segfault when a text entry is parsed (:issue:`15140`)
 - Bug in ``pd.read_csv()`` when an index was specified and no values were specified as null values (:issue:`15835`)
+- Bug in ``pd.read_csv()`` in which certain invalid file objects caused the Python interpreter to crash (:issue:`15337`)
 - Added checks in ``pd.read_csv()`` ensuring that values for ``nrows`` and ``chunksize`` are valid (:issue:`15767`)
 - Bug in ``pd.tools.hashing.hash_pandas_object()`` in which hashing of categoricals depended on the ordering of categories, instead of just their values. (:issue:`15143`)
 - Bug in ``.to_json()`` where ``lines=True`` and contents (keys or values) contain escaped characters (:issue:`15096`)
diff --git a/pandas/io/common.py b/pandas/io/common.py
@@ -10,7 +10,7 @@
 from pandas import compat
 from pandas.formats.printing import pprint_thing
 from pandas.core.common import AbstractMethodError
-from pandas.types.common import is_number
+from pandas.types.common import is_number, is_file_like
 
 # compat
 from pandas.errors import (ParserError, DtypeWarning,  # noqa
@@ -197,9 +197,19 @@ def get_filepath_or_buffer(filepath_or_buffer, encoding=None,
                                          encoding=encoding,
                                          compression=compression)
 
-    # It is a pathlib.Path/py.path.local or string
+    # Convert pathlib.Path/py.path.local objects to a string path
     filepath_or_buffer = _stringify_path(filepath_or_buffer)
-    return _expand_user(filepath_or_buffer), None, compression
+
+    if isinstance(filepath_or_buffer, (compat.string_types,
+                                       compat.binary_type,
+                                       mmap.mmap)):
+        return _expand_user(filepath_or_buffer), None, compression
+
+    if not is_file_like(filepath_or_buffer):
+        msg = "Invalid file path or buffer object type: {_type}"
+        raise ValueError(msg.format(_type=type(filepath_or_buffer)))
+
+    return filepath_or_buffer, None, compression
 
 
 def file_path_to_url(path):
@@ -416,6 +426,9 @@ def __init__(self, f):
     def __getattr__(self, name):
         return getattr(self.mmap, name)
 
+    def __iter__(self):
+        return self  # the wrapper is its own iterator (see __next__)
+
     def __next__(self):
         newline = self.mmap.readline()
 
@@ -433,6 +446,10 @@ def __next__(self):
         return newline
 
 
+if not compat.PY3:  # Python 2 iterates via next() rather than __next__()
+    MMapWrapper.next = lambda self: self.__next__()
+
+
 class UTF8Recoder(BaseIterator):
 
     """
diff --git a/pandas/io/excel.py b/pandas/io/excel.py
@@ -243,9 +243,8 @@ def __init__(self, io, **kwds):
         # to get_filepath_or_buffer()
         if _is_url(io):
             io = _urlopen(io)
-        # Deal with S3 urls, path objects, etc. Will convert them to
-        # buffer or path string
-        io, _, _ = get_filepath_or_buffer(io)
+        elif not isinstance(io, (ExcelFile, xlrd.Book)):
+            io, _, _ = get_filepath_or_buffer(io)
 
         if engine == 'xlrd' and isinstance(io, xlrd.Book):
             self.book = io
diff --git a/pandas/tests/api/test_types.py b/pandas/tests/api/test_types.py
@@ -24,7 +24,7 @@ class TestTypes(Base, tm.TestCase):
                'is_timedelta64_dtype', 'is_timedelta64_ns_dtype',
                'is_unsigned_integer_dtype', 'is_period',
                'is_period_dtype', 'is_re', 'is_re_compilable',
-               'is_dict_like', 'is_iterator',
+               'is_dict_like', 'is_iterator', 'is_file_like',
                'is_list_like', 'is_hashable',
                'is_named_tuple', 'is_sequence',
                'pandas_dtype']
diff --git a/pandas/tests/io/parser/common.py b/pandas/tests/io/parser/common.py
@@ -1678,3 +1678,20 @@ def test_file_handles(self):
                 if PY3:
                     self.assertFalse(m.closed)
                 m.close()
+
+    def test_invalid_file_buffer(self):
+        # see gh-15337
+        class InvalidBuffer(object):
+            pass
+
+        msg = "Invalid file path or buffer object type"
+        invalid_buffers = [InvalidBuffer()]
+
+        if PY3:
+            # a Mock fakes ordinary attributes, so it must be rejected too
+            from unittest import mock
+            invalid_buffers.append(mock.Mock())
+
+        for buf in invalid_buffers:
+            with tm.assertRaisesRegexp(ValueError, msg):
+                self.read_csv(buf)
diff --git a/pandas/tests/types/test_inference.py b/pandas/tests/types/test_inference.py
@@ -17,7 +17,7 @@
 from pandas import (Series, Index, DataFrame, Timedelta,
                     DatetimeIndex, TimedeltaIndex, Timestamp,
                     Panel, Period, Categorical)
-from pandas.compat import u, PY2, lrange
+from pandas.compat import u, PY2, PY3, StringIO, lrange
 from pandas.types import inference
 from pandas.types.common import (is_timedelta64_dtype,
                                  is_timedelta64_ns_dtype,
@@ -78,6 +78,20 @@ def test_is_dict_like():
         assert not inference.is_dict_like(f)
 
 
+def test_is_file_like():
+    # see gh-15337
+    check = inference.is_file_like
+
+    # a real text buffer is file-like; a plain list is not
+    assert check(StringIO("data"))
+    assert not check([1, 2, 3])
+
+    if PY3:
+        # mock objects must not be mistaken for file handles
+        from unittest import mock
+        assert not check(mock.Mock())
+
+
 def test_is_named_tuple():
     passes = (collections.namedtuple('Test', list('abc'))(1, 2, 3), )
     fails = ((1, 2, 3), 'a', Series({'pi': 3.14}))
diff --git a/pandas/types/api.py b/pandas/types/api.py
@@ -52,6 +52,7 @@
                      is_re_compilable,
                      is_dict_like,
                      is_iterator,
+                     is_file_like,
                      is_list_like,
                      is_hashable,
                      is_named_tuple,
diff --git a/pandas/types/inference.py b/pandas/types/inference.py
@@ -4,7 +4,7 @@
 import re
 import numpy as np
 from numbers import Number
-from pandas.compat import (string_types, text_type,
+from pandas.compat import (PY2, string_types, text_type,
                            string_and_binary_types)
 from pandas._libs import lib
 
@@ -35,8 +35,28 @@ def _iterable_not_string(x):
 
 
 def is_iterator(obj):
-    # python 3 generators have __next__ instead of next
-    return hasattr(obj, 'next') or hasattr(obj, '__next__')
+    """Return True if `obj` is iterable and has the version's next method."""
+    # an iterator must itself be iterable
+    if not hasattr(obj, '__iter__'):
+        return False
+
+    # Python 2 iterators define `next`;
+    # Python 3 iterators define `__next__`
+    next_attr = 'next' if PY2 else '__next__'
+    return hasattr(obj, next_attr)
+
+
+def is_file_like(obj):
+    """
+    Check whether `obj` is file-like: it must be an iterator AND have
+    a `read` and/or `write` method. Demanding `seek`, `tell` and both
+    I/O methods at once would reject duck-typed one-way streams.
+    """
+    # only attribute presence is tested; nothing is called on `obj`
+    if not (hasattr(obj, 'read') or hasattr(obj, 'write')):
+        return False
+
+    return is_iterator(obj)
 
 
 def is_re(obj):