From 06d79b90b874974044d9857fbce42960b0daad03 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sun, 13 Dec 2020 10:28:32 -0500 Subject: [PATCH 01/15] DEPR: Adjust read excel behavior for xlrd >= 2.0 --- doc/source/whatsnew/v1.2.0.rst | 14 +- pandas/io/excel/_base.py | 219 ++++++++++++++++---------- pandas/tests/io/excel/__init__.py | 13 ++ pandas/tests/io/excel/test_readers.py | 12 ++ pandas/tests/io/excel/test_writers.py | 5 +- pandas/tests/io/excel/test_xlrd.py | 24 ++- 6 files changed, 190 insertions(+), 97 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index e2521cedb64cc..18ba49769e602 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -10,21 +10,17 @@ including other versions of pandas. .. warning:: - The packages `xlrd `_ for reading excel - files and `xlwt `_ for - writing excel files are no longer maintained. These are the only engines in pandas + The package `xlrd `_ for reading excel + files now only supports the xls format. The package + `xlwt `_ for writing excel files + is no longer maintained. These are the only engines in pandas that support the xls format. Previously, the default argument ``engine=None`` to ``pd.read_excel`` would result in using the ``xlrd`` engine in many cases. If `openpyxl `_ is installed, many of these cases will now default to using the ``openpyxl`` engine. - See the :func:`read_excel` documentation for more details. Attempting to read - ``.xls`` files or specifying ``engine="xlrd"`` to ``pd.read_excel`` will not - raise a warning. However users should be aware that ``xlrd`` is already - broken with certain package configurations, for example with Python 3.9 - when `defusedxml `_ is installed, and - is anticipated to be unusable in the future. + See the :func:`read_excel` documentation for more details. Attempting to use the the ``xlwt`` engine will raise a ``FutureWarning`` unless the option :attr:`io.excel.xls.writer` is set to ``"xlwt"``. diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index bf1011176693f..48539f29090af 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -4,8 +4,9 @@ from io import BufferedIOBase, BytesIO, RawIOBase import os from textwrap import fill -from typing import Any, Dict, Mapping, Union, cast +from typing import Any, Dict, Mapping, Optional, Union, cast import warnings +import zipfile from pandas._config import config @@ -111,22 +112,19 @@ - "pyxlsb" supports Binary Excel files. .. versionchanged:: 1.2.0 - The engine `xlrd `_ - is no longer maintained, and is not supported with - python >= 3.9. When ``engine=None``, the following logic will be - used to determine the engine. - - - If ``path_or_buffer`` is an OpenDocument format (.odf, .ods, .odt), - then `odf `_ will be used. - - Otherwise if ``path_or_buffer`` is a bytes stream, the file has the - extension ``.xls``, or is an ``xlrd`` Book instance, then ``xlrd`` will - be used. - - Otherwise if `openpyxl `_ is installed, - then ``openpyxl`` will be used. - - Otherwise ``xlrd`` will be used and a ``FutureWarning`` will be raised. - - Specifying ``engine="xlrd"`` will continue to be allowed for the - indefinite future. + + The engine `xlrd `_ only reads + xls files in version 2.0 and above. When ``engine=None``, + the following logic will be used to determine the engine. + + - If ``path_or_buffer`` is an OpenDocument format (.odf, .ods, .odt), + then `odf `_ will be used. + - Otherwise if ``path_or_buffer`` is an xls-style format, + ``xlrd`` will be used. + - Otherwise if `openpyxl `_ is installed, + then ``openpyxl`` will be used. + - Otherwise if ``xlrd >= 2.0`` is installed, a ``ValueError`` will be raised. + - Otherwise ``xlrd`` will be used and a ``FutureWarning`` will be raised. converters : dict, default None Dict of functions for converting values in certain columns. Keys can @@ -888,39 +886,93 @@ def close(self): return content -def _is_ods_stream(stream: Union[BufferedIOBase, RawIOBase]) -> bool: +XLS_SIGNATURE = b"\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1" +ZIP_SIGNATURE = b"PK\x03\x04" +PEEK_SIZE = max(len(XLS_SIGNATURE), len(ZIP_SIGNATURE)) + + +def inspect_excel_format( + path: Optional[str] = None, + content: Union[None, BufferedIOBase, RawIOBase, bytes] = None, +) -> Optional[str]: """ - Check if the stream is an OpenDocument Spreadsheet (.ods) file + Inspect the path or content of an excel file. - It uses magic values inside the stream + Adopted from xlrd: https://github.com/python-excel/xlrd. Parameters ---------- - stream : Union[BufferedIOBase, RawIOBase] - IO stream with data which might be an ODS file + path : str, optional + Path to file to inspect. May be a URL. + content : file-like object + Content of file to inspect. Returns ------- - is_ods : bool - Boolean indication that this is indeed an ODS file or not + str + Format of file. Returns the extension if path is a URL. """ - stream.seek(0) - is_ods = False - if stream.read(4) == b"PK\003\004": - stream.seek(30) - is_ods = ( - stream.read(54) == b"mimetype" - b"application/vnd.oasis.opendocument.spreadsheet" - ) - stream.seek(0) - return is_ods + if content is not None: + if isinstance(content, bytes): + peek = content[:PEEK_SIZE] + else: + buf = content.read(PEEK_SIZE) + if buf is None: + raise ValueError("File is empty") + else: + peek = buf + elif path is not None: + try: + with open(path, "rb") as f: + buf = f.read(PEEK_SIZE) + if buf is None: + raise ValueError("File is empty") + else: + peek = buf + except FileNotFoundError: + # File may be a url, return the extension + return os.path.splitext(path)[-1][1:] + else: + raise ValueError("content or path must not be ") + + if peek.startswith(XLS_SIGNATURE): + return "xls" + + if peek.startswith(ZIP_SIGNATURE): + if content: + # ZipFile requires IO[bytes] + if isinstance(content, bytes): + as_bytesio = BytesIO(content) + else: + buf = content.read() + if buf is None: + raise ValueError("File is empty") + else: + as_bytesio = BytesIO(buf) + zf = zipfile.ZipFile(as_bytesio) + else: + assert path is not None + zf = zipfile.ZipFile(path) + + # Workaround for some third party files that use forward slashes and + # lower case names. + component_names = [name.replace("\\", "/").lower() for name in zf.namelist()] + + if "xl/workbook.xml" in component_names: + return "xlsx" + if "xl/workbook.bin" in component_names: + return "xlsb" + if "content.xml" in component_names: + return "ods" + return "zip" + return None class ExcelFile: """ Class for parsing tabular excel sheets into DataFrame objects. - Uses xlrd engine by default. See read_excel for more documentation + See read_excel for more documentation. Parameters ---------- @@ -940,22 +992,18 @@ class ExcelFile: .. versionchanged:: 1.2.0 - The engine `xlrd `_ - is no longer maintained, and is not supported with - python >= 3.9. When ``engine=None``, the following logic will be - used to determine the engine. + The engine `xlrd `_ only reads + xls files in version 2.0 and above. When ``engine=None``, + the following logic will be used to determine the engine. - If ``path_or_buffer`` is an OpenDocument format (.odf, .ods, .odt), then `odf `_ will be used. - - Otherwise if ``path_or_buffer`` is a bytes stream, the file has the - extension ``.xls``, or is an ``xlrd`` Book instance, then ``xlrd`` - will be used. + - Otherwise if ``path_or_buffer`` is an xls-style format, + ``xlrd`` will be used. - Otherwise if `openpyxl `_ is installed, then ``openpyxl`` will be used. + - Otherwise if ``xlrd >= 2.0`` is installed, a ``ValueError`` will be raised. - Otherwise ``xlrd`` will be used and a ``FutureWarning`` will be raised. - - Specifying ``engine="xlrd"`` will continue to be allowed for the - indefinite future. """ from pandas.io.excel._odfreader import ODFReader @@ -973,40 +1021,46 @@ class ExcelFile: def __init__( self, path_or_buffer, engine=None, storage_options: StorageOptions = None ): - if engine is None: - # Determine ext and use odf for ods stream/file - if isinstance(path_or_buffer, (BufferedIOBase, RawIOBase)): - ext = None - if _is_ods_stream(path_or_buffer): - engine = "odf" - else: - ext = os.path.splitext(str(path_or_buffer))[-1] - if ext == ".ods": - engine = "odf" + if engine is not None and engine not in self._engines: + raise ValueError(f"Unknown engine: {engine}") - if ( - import_optional_dependency( - "xlrd", raise_on_missing=False, on_version="ignore" - ) - is not None - ): - from xlrd import Book + # Determine xlrd version if installed + if ( + import_optional_dependency( + "xlrd", raise_on_missing=False, on_version="ignore" + ) + is None + ): + xlrd_version = None + else: + import xlrd - if isinstance(path_or_buffer, Book): - engine = "xlrd" + xlrd_version = xlrd.__version__ - # GH 35029 - Prefer openpyxl except for xls files - if engine is None: - if ext is None or isinstance(path_or_buffer, bytes) or ext == ".xls": - engine = "xlrd" - elif ( + if isinstance(path_or_buffer, (BufferedIOBase, RawIOBase, bytes)): + ext = inspect_excel_format(content=path_or_buffer) + elif xlrd_version is not None and isinstance(path_or_buffer, xlrd.Book): + ext = "xls" + else: + # path_or_buffer is path-like, use stringified path + ext = inspect_excel_format(path=str(path_or_buffer)) + + if engine is None: + if ext == "ods": + engine = "odf" + elif ext == "xls": + engine = "xlrd" + else: + # GH 35029 - Prefer openpyxl except for xls files + if ( import_optional_dependency( "openpyxl", raise_on_missing=False, on_version="ignore" ) is not None ): engine = "openpyxl" - else: + elif xlrd_version is not None and xlrd_version < "2": + # If xlrd_version >= "2", we error below caller = inspect.stack()[1] if ( caller.filename.endswith("pandas/io/excel/_base.py") @@ -1016,19 +1070,26 @@ def __init__( else: stacklevel = 2 warnings.warn( - "The xlrd engine is no longer maintained and is not " - "supported when using pandas with python >= 3.9. However, " - "the engine xlrd will continue to be allowed for the " - "indefinite future. Beginning with pandas 1.2.0, the " - "openpyxl engine will be used if it is installed and the " - "engine argument is not specified. Either install openpyxl " - "or specify engine='xlrd' to silence this warning.", + f"Your version of xlrd is {xlrd_version}. In xlrd >= 2.0, " + f"only the xls format is supported. As a result, the " + f"openpyxl engine will be used if it is installed " + f"and the engine argument is not specified. Either install " + f"openpyxl or specify engine='xlrd' to silence this warning.", FutureWarning, stacklevel=stacklevel, ) engine = "xlrd" - if engine not in self._engines: - raise ValueError(f"Unknown engine: {engine}") + + if ( + engine == "xlrd" + and xlrd_version is not None + and xlrd_version >= "2" + and not ext == "xls" + ): + raise ValueError( + f"Your version of xlrd is {xlrd_version}. In xlrd >= 2.0, " + f"only the xls format is supported. " + ) self.engine = engine self.storage_options = storage_options diff --git a/pandas/tests/io/excel/__init__.py b/pandas/tests/io/excel/__init__.py index 384f1006c44df..84a4b47adceaf 100644 --- a/pandas/tests/io/excel/__init__.py +++ b/pandas/tests/io/excel/__init__.py @@ -1,5 +1,7 @@ import pytest +from pandas.compat._optional import import_optional_dependency + pytestmark = [ pytest.mark.filterwarnings( # Looks like tree.getiterator is deprecated in favor of tree.iter @@ -14,3 +16,14 @@ "ignore:As the xlwt package is no longer maintained:FutureWarning" ), ] + + +if ( + import_optional_dependency("xlrd", raise_on_missing=False, on_version="ignore") + is None +): + xlrd_version = None +else: + import xlrd + + xlrd_version = xlrd.__version__ diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index 98a55ae39bd77..69703160e6327 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -11,6 +11,7 @@ import pandas as pd from pandas import DataFrame, Index, MultiIndex, Series import pandas._testing as tm +from pandas.tests.io.excel import xlrd_version read_ext_params = [".xls", ".xlsx", ".xlsm", ".xlsb", ".ods"] engine_params = [ @@ -57,6 +58,13 @@ def _is_valid_engine_ext_pair(engine, read_ext: str) -> bool: return False if read_ext == ".xlsb" and engine != "pyxlsb": return False + if ( + engine == "xlrd" + and xlrd_version is not None + and xlrd_version >= "2" + and read_ext != ".xls" + ): + return False return True @@ -1158,6 +1166,10 @@ def test_excel_read_binary(self, engine, read_ext): actual = pd.read_excel(data, engine=engine) tm.assert_frame_equal(expected, actual) + @pytest.mark.skipif( + xlrd_version is not None and xlrd_version >= "2", + reason="xlrd no longer supports xlsx", + ) def test_excel_high_surrogate(self, engine): # GH 23809 expected = DataFrame(["\udc88"], columns=["Column1"]) diff --git a/pandas/tests/io/excel/test_writers.py b/pandas/tests/io/excel/test_writers.py index 80ebeb4c03d89..741dfedd2824e 100644 --- a/pandas/tests/io/excel/test_writers.py +++ b/pandas/tests/io/excel/test_writers.py @@ -1196,8 +1196,9 @@ def test_datetimes(self, path): write_frame = DataFrame({"A": datetimes}) write_frame.to_excel(path, "Sheet1") # GH 35029 - Default changed to openpyxl, but test is for odf/xlrd - engine = "odf" if path.endswith("ods") else "xlrd" - read_frame = pd.read_excel(path, sheet_name="Sheet1", header=0, engine=engine) + if path.endswith("xlsx") or path.endswith("xlsm"): + pytest.skip("Defaults to openpyxl and fails - Should this pass?") + read_frame = pd.read_excel(path, sheet_name="Sheet1", header=0) tm.assert_series_equal(write_frame["A"], read_frame["A"]) diff --git a/pandas/tests/io/excel/test_xlrd.py b/pandas/tests/io/excel/test_xlrd.py index f2fbcbc2e2f04..2a1114a9570f0 100644 --- a/pandas/tests/io/excel/test_xlrd.py +++ b/pandas/tests/io/excel/test_xlrd.py @@ -4,6 +4,7 @@ import pandas as pd import pandas._testing as tm +from pandas.tests.io.excel import xlrd_version from pandas.io.excel import ExcelFile @@ -17,6 +18,8 @@ def skip_ods_and_xlsb_files(read_ext): pytest.skip("Not valid for xlrd") if read_ext == ".xlsb": pytest.skip("Not valid for xlrd") + if read_ext in (".xlsx", ".xlsm") and xlrd_version >= "2": + pytest.skip("Not valid for xlrd >= 2.0") def test_read_xlrd_book(read_ext, frame): @@ -66,7 +69,7 @@ def test_excel_file_warning_with_xlsx_file(datapath): pd.read_excel(path, "Sheet1", engine=None) -def test_read_excel_warning_with_xlsx_file(tmpdir, datapath): +def test_read_excel_warning_with_xlsx_file(datapath): # GH 29375 path = datapath("io", "data", "excel", "test1.xlsx") has_openpyxl = ( @@ -76,12 +79,19 @@ def test_read_excel_warning_with_xlsx_file(tmpdir, datapath): is not None ) if not has_openpyxl: - with tm.assert_produces_warning( - FutureWarning, - raise_on_extra_warnings=False, - match="The xlrd engine is no longer maintained", - ): - pd.read_excel(path, "Sheet1", engine=None) + if xlrd_version >= "2": + with pytest.raises( + ValueError, + match="Your version of xlrd is ", + ): + pd.read_excel(path, "Sheet1", engine=None) + else: + with tm.assert_produces_warning( + FutureWarning, + raise_on_extra_warnings=False, + match="The xlrd engine is no longer maintained", + ): + pd.read_excel(path, "Sheet1", engine=None) else: with tm.assert_produces_warning(None): pd.read_excel(path, "Sheet1", engine=None) From e2c29ff14b2448b36d00dcde4a8e479bd5707107 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sat, 19 Dec 2020 06:03:35 -0500 Subject: [PATCH 02/15] Use stringified path, suppress warnings in io.test_common --- pandas/io/excel/_base.py | 18 +++++++++++------- pandas/tests/io/__init__.py | 4 ++++ 2 files changed, 15 insertions(+), 7 deletions(-) diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 48539f29090af..028a680b50db8 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -1024,6 +1024,11 @@ def __init__( if engine is not None and engine not in self._engines: raise ValueError(f"Unknown engine: {engine}") + # Could be a str, ExcelFile, Book, etc. + self.io = path_or_buffer + # Always a string + self._io = stringify_path(path_or_buffer) + # Determine xlrd version if installed if ( import_optional_dependency( @@ -1041,9 +1046,13 @@ def __init__( ext = inspect_excel_format(content=path_or_buffer) elif xlrd_version is not None and isinstance(path_or_buffer, xlrd.Book): ext = "xls" - else: + elif isinstance(self._io, (str, bytes)): # path_or_buffer is path-like, use stringified path - ext = inspect_excel_format(path=str(path_or_buffer)) + ext = inspect_excel_format(path=self._io) + else: + raise ValueError( + f"Unrecognized path_or_buffer type; got {type(path_or_buffer)}" + ) if engine is None: if ext == "ods": @@ -1094,11 +1103,6 @@ def __init__( self.engine = engine self.storage_options = storage_options - # Could be a str, ExcelFile, Book, etc. - self.io = path_or_buffer - # Always a string - self._io = stringify_path(path_or_buffer) - self._reader = self._engines[engine](self._io, storage_options=storage_options) def __fspath__(self): diff --git a/pandas/tests/io/__init__.py b/pandas/tests/io/__init__.py index c5e867f45b92d..39474dedba78c 100644 --- a/pandas/tests/io/__init__.py +++ b/pandas/tests/io/__init__.py @@ -14,4 +14,8 @@ r"Use 'tree.iter\(\)' or 'list\(tree.iter\(\)\)' instead." ":PendingDeprecationWarning" ), + # GH 26552 + pytest.mark.filterwarnings( + "ignore:As the xlwt package is no longer maintained:FutureWarning" + ), ] From f56970b8364a631fe0736ecdd215b73bda875a8e Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sat, 19 Dec 2020 08:41:09 -0500 Subject: [PATCH 03/15] Use get_handle & seek --- pandas/io/excel/_base.py | 116 +++++++++++++++++++-------------------- 1 file changed, 56 insertions(+), 60 deletions(-) diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 028a680b50db8..a27d90109acbb 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -4,7 +4,7 @@ from io import BufferedIOBase, BytesIO, RawIOBase import os from textwrap import fill -from typing import Any, Dict, Mapping, Optional, Union, cast +from typing import IO, Any, Dict, Mapping, Optional, Union, cast import warnings import zipfile @@ -14,11 +14,12 @@ from pandas._typing import Buffer, FilePathOrBuffer, StorageOptions from pandas.compat._optional import import_optional_dependency from pandas.errors import EmptyDataError -from pandas.util._decorators import Appender, deprecate_nonkeyword_arguments +from pandas.util._decorators import Appender, deprecate_nonkeyword_arguments, doc from pandas.core.dtypes.common import is_bool, is_float, is_integer, is_list_like from pandas.core.frame import DataFrame +from pandas.core.shared_docs import _shared_docs from pandas.io.common import IOHandles, get_handle, stringify_path, validate_header_arg from pandas.io.excel._util import ( @@ -891,80 +892,75 @@ def close(self): PEEK_SIZE = max(len(XLS_SIGNATURE), len(ZIP_SIGNATURE)) +@doc(storage_options=_shared_docs["storage_options"]) def inspect_excel_format( path: Optional[str] = None, content: Union[None, BufferedIOBase, RawIOBase, bytes] = None, + storage_options: StorageOptions = None, ) -> Optional[str]: """ Inspect the path or content of an excel file. + At least one of path or content must be not None. If both are not None, + content will take precedence. + Adopted from xlrd: https://github.com/python-excel/xlrd. Parameters ---------- path : str, optional Path to file to inspect. May be a URL. - content : file-like object + content : file-like object, optional Content of file to inspect. + {storage_options} Returns ------- str Format of file. Returns the extension if path is a URL. """ - if content is not None: - if isinstance(content, bytes): - peek = content[:PEEK_SIZE] - else: - buf = content.read(PEEK_SIZE) - if buf is None: - raise ValueError("File is empty") - else: - peek = buf - elif path is not None: - try: - with open(path, "rb") as f: - buf = f.read(PEEK_SIZE) - if buf is None: - raise ValueError("File is empty") - else: - peek = buf - except FileNotFoundError: - # File may be a url, return the extension - return os.path.splitext(path)[-1][1:] + content_or_path: Union[None, str, BufferedIOBase, RawIOBase, IO[bytes]] + if isinstance(content, bytes): + content_or_path = BytesIO(content) else: - raise ValueError("content or path must not be ") - - if peek.startswith(XLS_SIGNATURE): - return "xls" - - if peek.startswith(ZIP_SIGNATURE): - if content: - # ZipFile requires IO[bytes] - if isinstance(content, bytes): - as_bytesio = BytesIO(content) - else: - buf = content.read() - if buf is None: - raise ValueError("File is empty") - else: - as_bytesio = BytesIO(buf) - zf = zipfile.ZipFile(as_bytesio) + content_or_path = content or path + assert content_or_path is not None + + with get_handle( + content_or_path, "rb", storage_options=storage_options, is_text=False + ) as handle: + stream = handle.handle + stream.seek(0) + buf = stream.read(PEEK_SIZE) + if buf is None: + raise ValueError("stream is empty") else: - assert path is not None - zf = zipfile.ZipFile(path) - - # Workaround for some third party files that use forward slashes and - # lower case names. - component_names = [name.replace("\\", "/").lower() for name in zf.namelist()] - - if "xl/workbook.xml" in component_names: - return "xlsx" - if "xl/workbook.bin" in component_names: - return "xlsb" - if "content.xml" in component_names: - return "ods" - return "zip" + assert isinstance(buf, bytes) + peek = buf + stream.seek(0) + + if peek.startswith(XLS_SIGNATURE): + return "xls" + + if peek.startswith(ZIP_SIGNATURE): + # ZipFile typing is overly-strict + # https://github.com/python/typeshed/issues/4212 + zf = zipfile.ZipFile(stream) # type: ignore[arg-type] + + # Workaround for some third party files that use forward slashes and + # lower case names. + component_names = [ + name.replace("\\", "/").lower() for name in zf.namelist() + ] + + if "xl/workbook.xml" in component_names: + return "xlsx" + if "xl/workbook.bin" in component_names: + return "xlsb" + if "content.xml" in component_names: + return "ods" + return "zip" + return None @@ -1043,15 +1039,15 @@ def __init__( xlrd_version = xlrd.__version__ if isinstance(path_or_buffer, (BufferedIOBase, RawIOBase, bytes)): - ext = inspect_excel_format(content=path_or_buffer) + ext = inspect_excel_format( + content=path_or_buffer, storage_options=storage_options + ) elif xlrd_version is not None and isinstance(path_or_buffer, xlrd.Book): ext = "xls" - elif isinstance(self._io, (str, bytes)): - # path_or_buffer is path-like, use stringified path - ext = inspect_excel_format(path=self._io) else: - raise ValueError( - f"Unrecognized path_or_buffer type; got {type(path_or_buffer)}" + # path_or_buffer is path-like, use stringified path + ext = inspect_excel_format( + path=str(self._io), storage_options=storage_options ) if engine is None: From e992049985957438fe97c4f801160dc5ed14475f Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sat, 19 Dec 2020 10:22:44 -0500 Subject: [PATCH 04/15] inspection raises if not a xls/zip file; tests for missing/corrupt file --- pandas/io/excel/_base.py | 44 ++++++++++++++------------- pandas/tests/io/excel/test_readers.py | 11 +++++++ 2 files changed, 34 insertions(+), 21 deletions(-) diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index a27d90109acbb..0a32e447dcc5f 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -897,7 +897,7 @@ def inspect_excel_format( path: Optional[str] = None, content: Union[None, BufferedIOBase, RawIOBase, bytes] = None, storage_options: StorageOptions = None, -) -> Optional[str]: +) -> str: """ Inspect the path or content of an excel file. @@ -917,7 +917,14 @@ def inspect_excel_format( Returns ------- str - Format of file. Returns the extension if path is a URL. + Format of file. + + Raises + ------ + ValueError + If resulting stream is empty. + BadZipFile + If resulting stream does not have an XLS signature and is not a valid zipfile. """ content_or_path: Union[None, str, BufferedIOBase, RawIOBase, IO[bytes]] if isinstance(content, bytes): @@ -942,26 +949,21 @@ def inspect_excel_format( if peek.startswith(XLS_SIGNATURE): return "xls" - if peek.startswith(ZIP_SIGNATURE): - # ZipFile typing is overly-strict - # https://github.com/python/typeshed/issues/4212 - zf = zipfile.ZipFile(stream) # type: ignore[arg-type] + # ZipFile typing is overly-strict + # https://github.com/python/typeshed/issues/4212 + zf = zipfile.ZipFile(stream) # type: ignore[arg-type] - # Workaround for some third party files that use forward slashes and - # lower case names. - component_names = [ - name.replace("\\", "/").lower() for name in zf.namelist() - ] + # Workaround for some third party files that use forward slashes and + # lower case names. + component_names = [name.replace("\\", "/").lower() for name in zf.namelist()] - if "xl/workbook.xml" in component_names: - return "xlsx" - if "xl/workbook.bin" in component_names: - return "xlsb" - if "content.xml" in component_names: - return "ods" - return "zip" - - return None + if "xl/workbook.xml" in component_names: + return "xlsx" + if "xl/workbook.bin" in component_names: + return "xlsb" + if "content.xml" in component_names: + return "ods" + return "zip" class ExcelFile: @@ -1093,7 +1095,7 @@ def __init__( ): raise ValueError( f"Your version of xlrd is {xlrd_version}. In xlrd >= 2.0, " - f"only the xls format is supported. " + f"only the xls format is supported." ) self.engine = engine diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index 69703160e6327..2d22fd368d4ce 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -2,6 +2,7 @@ from functools import partial import os from urllib.error import URLError +from zipfile import BadZipFile import numpy as np import pytest @@ -622,6 +623,16 @@ def test_bad_engine_raises(self, read_ext): with pytest.raises(ValueError, match="Unknown engine: foo"): pd.read_excel("", engine=bad_engine) + def test_missing_file_raises(self, read_ext): + bad_file = f"foo{read_ext}" + with pytest.raises(FileNotFoundError, match="No such file or directory"): + pd.read_excel(bad_file) + + def test_corrupt_bytes_raises(self, read_ext, engine): + bad_stream = b"foo" + with pytest.raises(BadZipFile, match="File is not a zip file"): + pd.read_excel(bad_stream) + @tm.network def test_read_from_http_url(self, read_ext): url = ( From 792b53adf9452f4e4bb2e3be48ba72bef89c6978 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sun, 20 Dec 2020 08:13:05 -0500 Subject: [PATCH 05/15] Minor doc fixes --- doc/source/whatsnew/v1.2.0.rst | 2 +- pandas/io/excel/_base.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 18ba49769e602..7e91ecf33eb9b 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -22,7 +22,7 @@ including other versions of pandas. many of these cases will now default to using the ``openpyxl`` engine. See the :func:`read_excel` documentation for more details. - Attempting to use the the ``xlwt`` engine will raise a ``FutureWarning`` + Attempting to use the ``xlwt`` engine will raise a ``FutureWarning`` unless the option :attr:`io.excel.xls.writer` is set to ``"xlwt"``. While this option is now deprecated and will also raise a ``FutureWarning``, it can be globally set and the warning suppressed. Users are recommended to diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 0a32e447dcc5f..07c4a002fc61a 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -120,7 +120,7 @@ - If ``path_or_buffer`` is an OpenDocument format (.odf, .ods, .odt), then `odf `_ will be used. - - Otherwise if ``path_or_buffer`` is an xls-style format, + - Otherwise if ``path_or_buffer`` is an xls format, ``xlrd`` will be used. - Otherwise if `openpyxl `_ is installed, then ``openpyxl`` will be used. @@ -899,7 +899,7 @@ def inspect_excel_format( storage_options: StorageOptions = None, ) -> str: """ - Inspect the path or content of an excel file. + Inspect the path or content of an excel file and get its format. At least one of path or content must be not None. If both are not None, content will take precedence. @@ -996,7 +996,7 @@ class ExcelFile: - If ``path_or_buffer`` is an OpenDocument format (.odf, .ods, .odt), then `odf `_ will be used. - - Otherwise if ``path_or_buffer`` is an xls-style format, + - Otherwise if ``path_or_buffer`` is an xls format, ``xlrd`` will be used. - Otherwise if `openpyxl `_ is installed, then ``openpyxl`` will be used. From 266b18b11a376bbf2f2ba2eb6342fa49114d945b Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sun, 20 Dec 2020 09:36:20 -0500 Subject: [PATCH 06/15] Fix test to handle zh_CN.utf8 --- pandas/tests/io/excel/test_readers.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index 2d22fd368d4ce..d7c2db18988d2 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -625,7 +625,10 @@ def test_bad_engine_raises(self, read_ext): def test_missing_file_raises(self, read_ext): bad_file = f"foo{read_ext}" - with pytest.raises(FileNotFoundError, match="No such file or directory"): + # CI tests with zh_CN.utf8, translates to "No such file or directory" + with pytest.raises( + FileNotFoundError, match=r"(No such file or directory|没有那个文件或目录)" + ): pd.read_excel(bad_file) def test_corrupt_bytes_raises(self, read_ext, engine): From 56cf9560acef800cce79684bbb49d4f626e583c3 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Mon, 21 Dec 2020 09:59:26 -0500 Subject: [PATCH 07/15] Use LooseVersion --- pandas/io/excel/_base.py | 3 ++- pandas/tests/io/excel/__init__.py | 4 +++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 07c4a002fc61a..b93bba887db10 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -1,5 +1,6 @@ import abc import datetime +from distutils.version import LooseVersion import inspect from io import BufferedIOBase, BytesIO, RawIOBase import os @@ -1038,7 +1039,7 @@ def __init__( else: import xlrd - xlrd_version = xlrd.__version__ + xlrd_version = LooseVersion(xlrd.__version__) if isinstance(path_or_buffer, (BufferedIOBase, RawIOBase, bytes)): ext = inspect_excel_format( diff --git a/pandas/tests/io/excel/__init__.py b/pandas/tests/io/excel/__init__.py index 84a4b47adceaf..2f37c90d1b9bd 100644 --- a/pandas/tests/io/excel/__init__.py +++ b/pandas/tests/io/excel/__init__.py @@ -1,3 +1,5 @@ +from distutils.version import LooseVersion + import pytest from pandas.compat._optional import import_optional_dependency @@ -26,4 +28,4 @@ else: import xlrd - xlrd_version = xlrd.__version__ + xlrd_version = LooseVersion(xlrd.__version__) From 01462580ecaa852842d88dbf8b7d799f2f4a41c9 Mon Sep 17 00:00:00 2001 From: rhshadrach Date: Tue, 22 Dec 2020 17:23:37 -0500 Subject: [PATCH 08/15] DEPR: Adjust read excel behavior for xlrd >= 2.0 --- pandas/tests/io/excel/test_writers.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/tests/io/excel/test_writers.py b/pandas/tests/io/excel/test_writers.py index 741dfedd2824e..197738330efe1 100644 --- a/pandas/tests/io/excel/test_writers.py +++ b/pandas/tests/io/excel/test_writers.py @@ -1195,9 +1195,8 @@ def test_datetimes(self, path): write_frame = DataFrame({"A": datetimes}) write_frame.to_excel(path, "Sheet1") - # GH 35029 - Default changed to openpyxl, but test is for odf/xlrd if path.endswith("xlsx") or path.endswith("xlsm"): - pytest.skip("Defaults to openpyxl and fails - Should this pass?") + pytest.skip("Defaults to openpyxl and fails - GH #38644") read_frame = pd.read_excel(path, sheet_name="Sheet1", header=0) tm.assert_series_equal(write_frame["A"], read_frame["A"]) From 00bfba38cb9dd30747d44b484653b571a37cf6a8 Mon Sep 17 00:00:00 2001 From: rhshadrach Date: Tue, 22 Dec 2020 18:23:15 -0500 Subject: [PATCH 09/15] Improvements from feedback, xlrd>=2.0 for two builds --- ci/deps/azure-windows-37.yaml | 2 +- ci/deps/travis-38-slow.yaml | 2 +- pandas/io/excel/_base.py | 60 +++++++++++++++++------------------ 3 files changed, 32 insertions(+), 32 deletions(-) diff --git a/ci/deps/azure-windows-37.yaml b/ci/deps/azure-windows-37.yaml index ad72b9c8577e9..e7ac4c783b855 100644 --- a/ci/deps/azure-windows-37.yaml +++ b/ci/deps/azure-windows-37.yaml @@ -33,7 +33,7 @@ dependencies: - s3fs>=0.4.2 - scipy - sqlalchemy - - xlrd<2.0 + - xlrd>=2.0 - xlsxwriter - xlwt - pyreadstat diff --git a/ci/deps/travis-38-slow.yaml b/ci/deps/travis-38-slow.yaml index 2b4339cf12658..9651837f26114 100644 --- a/ci/deps/travis-38-slow.yaml +++ b/ci/deps/travis-38-slow.yaml @@ -30,7 +30,7 @@ dependencies: - moto>=1.3.14 - scipy - sqlalchemy - - xlrd<2.0 + - xlrd>=2.0 - xlsxwriter - xlwt - moto diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index b93bba887db10..9cd5a419d4dba 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -126,7 +126,8 @@ - Otherwise if `openpyxl `_ is installed, then ``openpyxl`` will be used. - Otherwise if ``xlrd >= 2.0`` is installed, a ``ValueError`` will be raised. - - Otherwise ``xlrd`` will be used and a ``FutureWarning`` will be raised. + - Otherwise ``xlrd`` will be used and a ``FutureWarning`` will be raised. This + case will raise a ``ValueError`` in a future version of pandas. converters : dict, default None Dict of functions for converting values in certain columns. Keys can @@ -1003,6 +1004,7 @@ class ExcelFile: then ``openpyxl`` will be used. - Otherwise if ``xlrd >= 2.0`` is installed, a ``ValueError`` will be raised. - Otherwise ``xlrd`` will be used and a ``FutureWarning`` will be raised. + This case will raise a ``ValueError`` in a future version of pandas. """ from pandas.io.excel._odfreader import ODFReader @@ -1067,37 +1069,35 @@ def __init__( is not None ): engine = "openpyxl" - elif xlrd_version is not None and xlrd_version < "2": - # If xlrd_version >= "2", we error below - caller = inspect.stack()[1] - if ( - caller.filename.endswith("pandas/io/excel/_base.py") - and caller.function == "read_excel" - ): - stacklevel = 4 - else: - stacklevel = 2 - warnings.warn( - f"Your version of xlrd is {xlrd_version}. In xlrd >= 2.0, " - f"only the xls format is supported. As a result, the " - f"openpyxl engine will be used if it is installed " - f"and the engine argument is not specified. Either install " - f"openpyxl or specify engine='xlrd' to silence this warning.", - FutureWarning, - stacklevel=stacklevel, - ) + else: engine = "xlrd" - if ( - engine == "xlrd" - and xlrd_version is not None - and xlrd_version >= "2" - and not ext == "xls" - ): - raise ValueError( - f"Your version of xlrd is {xlrd_version}. In xlrd >= 2.0, " - f"only the xls format is supported." - ) + if engine == "xlrd" and ext != "xls" and xlrd_version is not None: + if xlrd_version >= "2": + raise ValueError( + f"Your version of xlrd is {xlrd_version}. In xlrd >= 2.0, " + f"only the xls format is supported. Install openpyxl instead." + ) + else: + # If xlrd_version >= "2", we error below + caller = inspect.stack()[1] + if ( + caller.filename.endswith("pandas/io/excel/_base.py") + and caller.function == "read_excel" + ): + stacklevel = 4 + else: + stacklevel = 2 + warnings.warn( + f"Your version of xlrd is {xlrd_version}. In xlrd >= 2.0, " + f"only the xls format is supported. As a result, the " + f"openpyxl engine will be used if it is installed and the " + f"engine argument is not specified. Install " + f"openpyxl instead.", + FutureWarning, + stacklevel=stacklevel, + ) + assert engine in self._engines, f"Engine {engine} not recognized" self.engine = engine self.storage_options = storage_options From 4a8801d9fb7355940a6125394385e28cf6f54bc3 Mon Sep 17 00:00:00 2001 From: rhshadrach Date: Wed, 23 Dec 2020 08:23:36 -0500 Subject: [PATCH 10/15] Ignore FutureWarning in excel tests --- pandas/tests/io/excel/__init__.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pandas/tests/io/excel/__init__.py b/pandas/tests/io/excel/__init__.py index 2f37c90d1b9bd..b7ceb28573484 100644 --- a/pandas/tests/io/excel/__init__.py +++ b/pandas/tests/io/excel/__init__.py @@ -17,6 +17,10 @@ pytest.mark.filterwarnings( "ignore:As the xlwt package is no longer maintained:FutureWarning" ), + # GH 38571 + pytest.mark.filterwarnings( + "ignore:.*In xlrd >= 2.0, only the xls format is supported:FutureWarning" + ), ] From 59e0cb942485e6764982126c0d993d39eab5c35f Mon Sep 17 00:00:00 2001 From: rhshadrach Date: Wed, 23 Dec 2020 09:45:52 -0500 Subject: [PATCH 11/15] Added debug statements Windows --- pandas/io/excel/_base.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 9cd5a419d4dba..d9ce9134a0da1 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -1079,7 +1079,6 @@ def __init__( f"only the xls format is supported. Install openpyxl instead." ) else: - # If xlrd_version >= "2", we error below caller = inspect.stack()[1] if ( caller.filename.endswith("pandas/io/excel/_base.py") @@ -1093,7 +1092,7 @@ def __init__( f"only the xls format is supported. As a result, the " f"openpyxl engine will be used if it is installed and the " f"engine argument is not specified. Install " - f"openpyxl instead.", + f"openpyxl instead. {caller.filename} -- {caller.function}", FutureWarning, stacklevel=stacklevel, ) From 8f8b74e472a42e857e0c3ff8b46080a6916e2552 Mon Sep 17 00:00:00 2001 From: rhshadrach Date: Wed, 23 Dec 2020 11:46:35 -0500 Subject: [PATCH 12/15] Fixed stacklevel determination for Windows, added test --- pandas/io/excel/_base.py | 6 ++++-- pandas/tests/io/excel/test_readers.py | 7 +++++++ 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index d9ce9134a0da1..7f800b4a58e06 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -1081,7 +1081,9 @@ def __init__( else: caller = inspect.stack()[1] if ( - caller.filename.endswith("pandas/io/excel/_base.py") + caller.filename.endswith( + os.path.join("pandas", "io", "excel", "_base.py") + ) and caller.function == "read_excel" ): stacklevel = 4 @@ -1092,7 +1094,7 @@ def __init__( f"only the xls format is supported. As a result, the " f"openpyxl engine will be used if it is installed and the " f"engine argument is not specified. Install " - f"openpyxl instead. {caller.filename} -- {caller.function}", + f"openpyxl instead.", FutureWarning, stacklevel=stacklevel, ) diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index d7c2db18988d2..5bcb9e78cec8d 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -1180,6 +1180,13 @@ def test_excel_read_binary(self, engine, read_ext): actual = pd.read_excel(data, engine=engine) tm.assert_frame_equal(expected, actual) + def test_excel_read_binary_via_read_excel(self, read_ext, engine): + # GH 38424 + expected = pd.read_excel("test1" + read_ext, engine=engine) + with open("test1" + read_ext, "rb") as f: + result = pd.read_excel(f) + tm.assert_frame_equal(result, expected) + @pytest.mark.skipif( xlrd_version is not None and xlrd_version >= "2", reason="xlrd no longer supports xlsx", From 8c95acd714d221e2261e8e1d018a04db397ad5a2 Mon Sep 17 00:00:00 2001 From: rhshadrach Date: Wed, 23 Dec 2020 13:05:59 -0500 Subject: [PATCH 13/15] Changed error message on non-zip file --- pandas/io/excel/_base.py | 2 ++ pandas/tests/io/excel/test_readers.py | 3 +-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 30c6700d78baf..221e8b9ccfb14 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -950,6 +950,8 @@ def inspect_excel_format( if peek.startswith(XLS_SIGNATURE): return "xls" + elif not peek.startswith(ZIP_SIGNATURE): + raise ValueError("File is not a recognized excel file") # ZipFile typing is overly-strict # https://github.com/python/typeshed/issues/4212 diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index 5bcb9e78cec8d..0ed4a454a3bd0 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -2,7 +2,6 @@ from functools import partial import os from urllib.error import URLError -from zipfile import BadZipFile import numpy as np import pytest @@ -633,7 +632,7 @@ def test_missing_file_raises(self, read_ext): def test_corrupt_bytes_raises(self, read_ext, engine): bad_stream = b"foo" - with pytest.raises(BadZipFile, match="File is not a zip file"): + with pytest.raises(ValueError, match="File is not a recognized excel file"): pd.read_excel(bad_stream) @tm.network From 46193634dd18b260cb6d0518691940aacf8758da Mon Sep 17 00:00:00 2001 From: rhshadrach Date: Wed, 23 Dec 2020 15:34:10 -0500 Subject: [PATCH 14/15] Handle error with xlsb files --- pandas/tests/io/excel/test_readers.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index 0ed4a454a3bd0..bed33ee345968 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -1181,10 +1181,16 @@ def test_excel_read_binary(self, engine, read_ext): def test_excel_read_binary_via_read_excel(self, read_ext, engine): # GH 38424 - expected = pd.read_excel("test1" + read_ext, engine=engine) - with open("test1" + read_ext, "rb") as f: - result = pd.read_excel(f) - tm.assert_frame_equal(result, expected) + if read_ext == ".xlsb" and engine == "pyxlsb": + # GH 38667 - should default to pyxlsb but doesn't + with pytest.raises(IOError, match="File contains no valid workbook part"): + with open("test1" + read_ext, "rb") as f: + pd.read_excel(f) + else: + with open("test1" + read_ext, "rb") as f: + result = pd.read_excel(f) + expected = pd.read_excel("test1" + read_ext, engine=engine) + tm.assert_frame_equal(result, expected) @pytest.mark.skipif( xlrd_version is not None and xlrd_version >= "2", From 05a6f0963b0516200523ee2b05722f97301b70d0 Mon Sep 17 00:00:00 2001 From: rhshadrach Date: Wed, 23 Dec 2020 15:37:10 -0500 Subject: [PATCH 15/15] xfail instead --- pandas/tests/io/excel/test_readers.py | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index bed33ee345968..df1250cee8b00 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -1182,15 +1182,11 @@ def test_excel_read_binary(self, engine, read_ext): def test_excel_read_binary_via_read_excel(self, read_ext, engine): # GH 38424 if read_ext == ".xlsb" and engine == "pyxlsb": - # GH 38667 - should default to pyxlsb but doesn't - with pytest.raises(IOError, match="File contains no valid workbook part"): - with open("test1" + read_ext, "rb") as f: - pd.read_excel(f) - else: - with open("test1" + read_ext, "rb") as f: - result = pd.read_excel(f) - expected = pd.read_excel("test1" + read_ext, engine=engine) - tm.assert_frame_equal(result, expected) + pytest.xfail("GH 38667 - should default to pyxlsb but doesn't") + with open("test1" + read_ext, "rb") as f: + result = pd.read_excel(f) + expected = pd.read_excel("test1" + read_ext, engine=engine) + tm.assert_frame_equal(result, expected) @pytest.mark.skipif( xlrd_version is not None and xlrd_version >= "2",