MAINT: rename IOError -> OSError (#43366)

mwtoews · web-flow · commit 5f36af325c27 · 2021-09-10T19:28:39.000-07:00
* MAINT: rename IOError -> OSError

* BUG: use TypeError (not OSError) when read_csv expects file path name or file-like object

* FIX: bytes -> BytesIO buffer in __init__ for BaseExcelReader & ExcelFile

* Fixes from pre-commit [automated commit]
diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst
@@ -394,6 +394,7 @@ I/O
 - Column headers are dropped when constructing a :class:`DataFrame` from a sqlalchemy's ``Row`` object (:issue:`40682`)
 - Bug in unpickling a :class:`Index` with object dtype incorrectly inferring numeric dtypes (:issue:`43188`)
 - Bug in :func:`read_csv` where reading multi-header input with unequal lengths incorrectly raising uncontrolled ``IndexError`` (:issue:`43102`)
+- Bug in :func:`read_csv` where passing an input that is neither a file path name nor a file-like object raised ``OSError``; it now raises ``TypeError`` (:issue:`43366`)
 
 Period
 ^^^^^^
diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx
@@ -606,10 +606,6 @@ cdef class TextReader:
         cdef:
             void *ptr
 
-        if not hasattr(source, "read"):
-            raise IOError(f'Expected file path name or file-like object, '
-                          f'got {type(source)} type')
-
         ptr = new_rd_source(source)
         self.parser.source = ptr
         self.parser.cb_io = &buffer_rd_bytes
diff --git a/pandas/_testing/_io.py b/pandas/_testing/_io.py
@@ -70,7 +70,7 @@ def _get_default_network_errors():
     # Lazy import for http.client because it imports many things from the stdlib
     import http.client
 
-    return (IOError, http.client.HTTPException, TimeoutError)
+    return (OSError, http.client.HTTPException, TimeoutError)
 
 
 def optional_args(decorator):
@@ -135,7 +135,7 @@ def network(
         If True, checks connectivity before running the test case.
     error_classes : tuple or Exception
         error classes to ignore. If not in ``error_classes``, raises the error.
-        defaults to IOError. Be careful about changing the error classes here.
+        defaults to OSError. Be careful about changing the error classes here.
     skip_errnos : iterable of int
         Any exception that has .errno or .reason.erno set to one
         of these values will be skipped with an appropriate
@@ -165,19 +165,20 @@ def network(
       ... def test_network():
       ...     with pd.io.common.urlopen("rabbit://bonanza.com"):
       ...         pass
+      >>> test_network()
       Traceback
          ...
-      URLError: <urlopen error unknown url type: rabit>
+      URLError: <urlopen error unknown url type: rabbit>
 
       You can specify alternative URLs::
 
         >>> @ts.network("https://www.yahoo.com")
         ... def test_something_with_yahoo():
-        ...    raise IOError("Failure Message")
+        ...    raise OSError("Failure Message")
         >>> test_something_with_yahoo()
         Traceback (most recent call last):
             ...
-        IOError: Failure Message
+        OSError: Failure Message
 
     If you set check_before_test, it will check the url first and not run the
     test on failure::
@@ -241,7 +242,7 @@ def wrapper(*args, **kwargs):
 
 def can_connect(url, error_classes=None):
     """
-    Try to connect to the given url. True if succeeds, False if IOError
+    Try to connect to the given url. True if succeeds, False if OSError
     raised
 
     Parameters
@@ -252,7 +253,7 @@ def can_connect(url, error_classes=None):
     Returns
     -------
     connectable : bool
-        Return True if no IOError (unable to connect) or URLError (bad url) was
+        Return True if no OSError (unable to connect) or URLError (bad url) was
         raised
     """
     if error_classes is None:
diff --git a/pandas/io/common.py b/pandas/io/common.py
@@ -739,6 +739,12 @@ def get_handle(
             isinstance(ioargs.filepath_or_buffer, str) or ioargs.should_close
         )
 
+    if "r" in ioargs.mode and not hasattr(handle, "read"):
+        raise TypeError(
+            "Expected file path name or file-like object, "
+            f"got {type(ioargs.filepath_or_buffer)} type"
+        )
+
     handles.reverse()  # close the most recently added buffer first
     if ioargs.should_close:
         assert not isinstance(ioargs.filepath_or_buffer, str)
diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py
@@ -408,6 +408,10 @@ def read_excel(
 
 class BaseExcelReader(metaclass=abc.ABCMeta):
     def __init__(self, filepath_or_buffer, storage_options: StorageOptions = None):
+        # First argument can also be bytes, so create a buffer
+        if isinstance(filepath_or_buffer, bytes):
+            filepath_or_buffer = BytesIO(filepath_or_buffer)
+
         self.handles = IOHandles(
             handle=filepath_or_buffer, compression={"method": None}
         )
@@ -426,8 +430,6 @@ def __init__(self, filepath_or_buffer, storage_options: StorageOptions = None):
             except Exception:
                 self.close()
                 raise
-        elif isinstance(self.handles.handle, bytes):
-            self.book = self.load_workbook(BytesIO(self.handles.handle))
         else:
             raise ValueError(
                 "Must explicitly set engine if not passing in buffer or path for io."
@@ -1115,7 +1117,7 @@ class ExcelFile:
 
     Parameters
     ----------
-    path_or_buffer : str, path object (pathlib.Path or py._path.local.LocalPath),
+    path_or_buffer : str, bytes, path object (pathlib.Path or py._path.local.LocalPath),
         a file-like object, xlrd workbook or openpyxl workbook.
         If a string or path object, expected to be a path to a
         .xls, .xlsx, .xlsb, .xlsm, .odf, .ods, or .odt file.
@@ -1174,6 +1176,10 @@ def __init__(
         if engine is not None and engine not in self._engines:
             raise ValueError(f"Unknown engine: {engine}")
 
+        # First argument can also be bytes, so create a buffer
+        if isinstance(path_or_buffer, bytes):
+            path_or_buffer = BytesIO(path_or_buffer)
+
         # Could be a str, ExcelFile, Book, etc.
         self.io = path_or_buffer
         # Always a string
diff --git a/pandas/io/sql.py b/pandas/io/sql.py
@@ -47,7 +47,7 @@
 from pandas.util.version import Version
 
 
-class DatabaseError(IOError):
+class DatabaseError(OSError):
     pass
 
 
diff --git a/pandas/tests/io/formats/test_console.py b/pandas/tests/io/formats/test_console.py
@@ -39,7 +39,7 @@ def test_detect_console_encoding_from_stdout_stdin(monkeypatch, empty, filled):
         assert detect_console_encoding() == filled
 
 
-@pytest.mark.parametrize("encoding", [AttributeError, IOError, "ascii"])
+@pytest.mark.parametrize("encoding", [AttributeError, OSError, "ascii"])
 def test_detect_console_encoding_fallback_to_locale(monkeypatch, encoding):
     # GH 21552
     with monkeypatch.context() as context:
@@ -55,8 +55,8 @@ def test_detect_console_encoding_fallback_to_locale(monkeypatch, encoding):
         ["ascii", locale.Error],
         [AttributeError, "ascii"],
         [AttributeError, locale.Error],
-        [IOError, "ascii"],
-        [IOError, locale.Error],
+        [OSError, "ascii"],
+        [OSError, locale.Error],
     ],
 )
 def test_detect_console_encoding_fallback_to_default(monkeypatch, std, locale):
diff --git a/pandas/tests/io/parser/common/test_common_basic.py b/pandas/tests/io/parser/common/test_common_basic.py
@@ -506,6 +506,14 @@ def test_raise_on_sep_with_delim_whitespace(all_parsers):
         parser.read_csv(StringIO(data), sep=r"\s", delim_whitespace=True)
 
 
+def test_read_filepath_or_buffer(all_parsers):
+    # see gh-43366
+    parser = all_parsers
+
+    with pytest.raises(TypeError, match="Expected file path name or file-like"):
+        parser.read_csv(filepath_or_buffer=b"input")
+
+
 @xfail_pyarrow
 @pytest.mark.parametrize("delim_whitespace", [True, False])
 def test_single_char_leading_whitespace(all_parsers, delim_whitespace):
diff --git a/pandas/tests/io/parser/test_network.py b/pandas/tests/io/parser/test_network.py
@@ -204,12 +204,12 @@ def test_parse_public_s3_bucket_nrows_python(self, tips_df, s3so):
 
     def test_read_s3_fails(self, s3so):
         msg = "The specified bucket does not exist"
-        with pytest.raises(IOError, match=msg):
+        with pytest.raises(OSError, match=msg):
             read_csv("s3://nyqpug/asdf.csv", storage_options=s3so)
 
         # Receive a permission error when trying to read a private bucket.
         # It's irrelevant here that this isn't actually a table.
-        with pytest.raises(IOError, match=msg):
+        with pytest.raises(OSError, match=msg):
             read_csv("s3://cant_get_it/file.csv")
 
     @pytest.mark.xfail(reason="GH#39155 s3fs upgrade", strict=False)
diff --git a/pandas/tests/io/pytables/test_errors.py b/pandas/tests/io/pytables/test_errors.py
@@ -214,15 +214,15 @@ def test_read_hdf_errors(setup_path):
 
     with ensure_clean_path(setup_path) as path:
         msg = r"File [\S]* does not exist"
-        with pytest.raises(IOError, match=msg):
+        with pytest.raises(OSError, match=msg):
             read_hdf(path, "key")
 
         df.to_hdf(path, "df")
         store = HDFStore(path, mode="r")
         store.close()
 
         msg = "The HDFStore must be open for reading."
-        with pytest.raises(IOError, match=msg):
+        with pytest.raises(OSError, match=msg):
             read_hdf(store, "df")
 
 
diff --git a/pandas/tests/io/pytables/test_file_handling.py b/pandas/tests/io/pytables/test_file_handling.py
@@ -40,7 +40,7 @@ def check(mode):
 
             # constructor
             if mode in ["r", "r+"]:
-                with pytest.raises(IOError, match=msg):
+                with pytest.raises(OSError, match=msg):
                     HDFStore(path, mode=mode)
 
             else:
@@ -52,7 +52,7 @@ def check(mode):
 
             # context
             if mode in ["r", "r+"]:
-                with pytest.raises(IOError, match=msg):
+                with pytest.raises(OSError, match=msg):
                     with HDFStore(path, mode=mode) as store:
                         pass
             else:
@@ -63,7 +63,7 @@ def check(mode):
 
             # conv write
             if mode in ["r", "r+"]:
-                with pytest.raises(IOError, match=msg):
+                with pytest.raises(OSError, match=msg):
                     df.to_hdf(path, "df", mode=mode)
                 df.to_hdf(path, "df", mode="w")
             else:
diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py
@@ -195,7 +195,7 @@ def test_iterator(self):
             (pd.read_csv, "os", FileNotFoundError, "csv"),
             (pd.read_fwf, "os", FileNotFoundError, "txt"),
             (pd.read_excel, "xlrd", FileNotFoundError, "xlsx"),
-            (pd.read_feather, "pyarrow", IOError, "feather"),
+            (pd.read_feather, "pyarrow", OSError, "feather"),
             (pd.read_hdf, "tables", FileNotFoundError, "h5"),
             (pd.read_stata, "os", FileNotFoundError, "dta"),
             (pd.read_sas, "os", FileNotFoundError, "sas7bdat"),
@@ -234,7 +234,7 @@ def test_read_non_existent(self, reader, module, error_class, fn_ext):
             (pd.read_table, "os", FileNotFoundError, "csv"),
             (pd.read_fwf, "os", FileNotFoundError, "txt"),
             (pd.read_excel, "xlrd", FileNotFoundError, "xlsx"),
-            (pd.read_feather, "pyarrow", IOError, "feather"),
+            (pd.read_feather, "pyarrow", OSError, "feather"),
             (pd.read_hdf, "tables", FileNotFoundError, "h5"),
             (pd.read_stata, "os", FileNotFoundError, "dta"),
             (pd.read_sas, "os", FileNotFoundError, "sas7bdat"),