Default to openpyxl not xlrd for read_excel

Debian Science Team · rebecca-palmer · commit ea08ae2c5b2a · 2020-12-07T23:06:28.000Z
xlrd 1.2 fails if defusedxml (needed for odf) is installed Bug: pandas-dev/pandas#35029 Bug-Debian: https://bugs.debian.org/976620 Origin: upstream b3a3932af6aafaa2fd41f17e9b7995643e5f92eb Author: Robert de Vries, Rebecca N. Palmer <rebecca_palmer@zoho.com> Forwarded: not-needed Gbp-Pq: Name xlrd_976620.patch
diff --git a/doc/source/whatsnew/v1.1.5.rst b/doc/source/whatsnew/v1.1.5.rst
@@ -8,6 +8,16 @@ including other versions of pandas.
 
 {{ header }}
 
+.. warning::
+
+   Previously, the default argument ``engine=None`` to ``pd.read_excel``
+   would result in using the `xlrd <https://xlrd.readthedocs.io/en/latest/>`_ engine in
+   many cases. The engine ``xlrd`` is no longer maintained, and may not work if ``defusedxml``
+   is installed.  Hence, from version 1.1.5 in Debian and 1.2.0 upstream,
+   if `openpyxl <https://pypi.org/project/openpyxl/>`_  is installed,
+   many of these  cases will now default to using the ``openpyxl`` engine. See the
+   :func:`read_excel` documentation for more details.
+
 .. ---------------------------------------------------------------------------
 
 .. _whatsnew_115.regressions:
diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py
@@ -1,13 +1,16 @@
 import abc
 import datetime
+import inspect
 from io import BufferedIOBase, BytesIO, RawIOBase
 import os
 from textwrap import fill
 from typing import Union
+import warnings
 
 from pandas._config import config
 
 from pandas._libs.parsers import STR_NA_VALUES
+from pandas.compat._optional import import_optional_dependency
 from pandas.errors import EmptyDataError
 from pandas.util._decorators import Appender, deprecate_nonkeyword_arguments
 
@@ -104,12 +107,32 @@
     of dtype conversion.
 engine : str, default None
     If io is not a buffer or path, this must be set to identify io.
-    Supported engines: "xlrd", "openpyxl", "odf", "pyxlsb", default "xlrd".
+    Supported engines: "xlrd", "openpyxl", "odf", "pyxlsb".
     Engine compatibility :
+
     - "xlrd" supports most old/new Excel file formats.
     - "openpyxl" supports newer Excel file formats.
     - "odf" supports OpenDocument file formats (.odf, .ods, .odt).
     - "pyxlsb" supports Binary Excel files.
+
+    .. versionchanged:: 1.1.5 in Debian, 1.2.0 upstream
+        The engine `xlrd <https://xlrd.readthedocs.io/en/latest/>`_
+        is no longer maintained, and is not supported with
+        python >= 3.9. When ``engine=None``, the following logic will be
+        used to determine the engine.
+
+        - If ``path_or_buffer`` is an OpenDocument format (.odf, .ods, .odt),
+          then `odf <https://pypi.org/project/odfpy/>`_ will be used.
+        - Otherwise if ``path_or_buffer`` is a bytes stream, the file has the
+          extension ``.xls``, or is an ``xlrd`` Book instance, then ``xlrd`` will
+          be used.
+        - Otherwise if `openpyxl <https://pypi.org/project/openpyxl/>`_ is installed,
+          then ``openpyxl`` will be used.
+        - Otherwise ``xlrd`` will be used and a ``FutureWarning`` will be raised.
+
+        Specifying ``engine="xlrd"`` will continue to be allowed for the
+        indefinite future, but may require uninstalling (python3-)defusedxml.
+
 converters : dict, default None
     Dict of functions for converting values in certain columns. Keys can
     either be integers or column labels, values are functions that take one
@@ -823,13 +846,32 @@ class ExcelFile:
         .xls, .xlsx, .xlsb, .xlsm, .odf, .ods, or .odt file.
     engine : str, default None
         If io is not a buffer or path, this must be set to identify io.
-        Supported engines: ``xlrd``, ``openpyxl``, ``odf``, ``pyxlsb``,
-        default ``xlrd``.
+        Supported engines: ``xlrd``, ``openpyxl``, ``odf``, ``pyxlsb``
         Engine compatibility :
+
         - ``xlrd`` supports most old/new Excel file formats.
         - ``openpyxl`` supports newer Excel file formats.
         - ``odf`` supports OpenDocument file formats (.odf, .ods, .odt).
         - ``pyxlsb`` supports Binary Excel files.
+
+        .. versionchanged:: 1.1.5 in Debian, 1.2.0 upstream
+
+           The engine `xlrd <https://xlrd.readthedocs.io/en/latest/>`_
+           is no longer maintained, and is not supported with
+           python >= 3.9. When ``engine=None``, the following logic will be
+           used to determine the engine.
+
+           - If ``path_or_buffer`` is an OpenDocument format (.odf, .ods, .odt),
+             then `odf <https://pypi.org/project/odfpy/>`_ will be used.
+           - Otherwise if ``path_or_buffer`` is a bytes stream, the file has the
+             extension ``.xls``, or is an ``xlrd`` Book instance, then ``xlrd``
+             will be used.
+           - Otherwise if `openpyxl <https://pypi.org/project/openpyxl/>`_ is installed,
+             then ``openpyxl`` will be used.
+           - Otherwise ``xlrd`` will be used and a ``FutureWarning`` will be raised.
+
+           Specifying ``engine="xlrd"`` will continue to be allowed for the
+           indefinite future, but may require uninstalling (python3-)defusedxml.
     """
 
     from pandas.io.excel._odfreader import _ODFReader
@@ -846,14 +888,59 @@ class ExcelFile:
 
     def __init__(self, path_or_buffer, engine=None):
         if engine is None:
-            engine = "xlrd"
+            # Determine ext and use odf for ods stream/file
             if isinstance(path_or_buffer, (BufferedIOBase, RawIOBase)):
+                ext = None
                 if _is_ods_stream(path_or_buffer):
                     engine = "odf"
             else:
                 ext = os.path.splitext(str(path_or_buffer))[-1]
                 if ext == ".ods":
                     engine = "odf"
+
+            if (
+                import_optional_dependency(
+                    "xlrd", raise_on_missing=False, on_version="ignore"
+                )
+                is not None
+            ):
+                from xlrd import Book
+
+                if isinstance(path_or_buffer, Book):
+                    engine = "xlrd"
+
+            # GH 35029 - Prefer openpyxl except for xls files
+            if engine is None:
+                if ext is None or isinstance(path_or_buffer, bytes) or ext == ".xls":
+                    engine = "xlrd"
+                elif (
+                    import_optional_dependency(
+                        "openpyxl", raise_on_missing=False, on_version="ignore"
+                    )
+                    is not None
+                ):
+                    engine = "openpyxl"
+                else:
+                    caller = inspect.stack()[1]
+                    if (
+                        caller.filename.endswith("pandas/io/excel/_base.py")
+                        and caller.function == "read_excel"
+                    ):
+                        stacklevel = 4
+                    else:
+                        stacklevel = 2
+                    warnings.warn(
+                        "The xlrd engine is no longer maintained and is not "
+                        "supported when using pandas with python >= 3.9. However, "
+                        "the engine xlrd will continue to be allowed for the "
+                        "indefinite future. The "
+                        "openpyxl engine will be used if it is installed and the "
+                        "engine argument is not specified. Either install openpyxl "
+                        "or specify engine='xlrd' to silence this warning.",
+                        FutureWarning,
+                        stacklevel=stacklevel,
+                    )
+                    engine = "xlrd"
         if engine not in self._engines:
             raise ValueError(f"Unknown engine: {engine}")
 
diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py
@@ -599,6 +599,10 @@ def test_date_conversion_overflow(self, read_ext):
         if pd.read_excel.keywords["engine"] == "openpyxl":
             pytest.xfail("Maybe not supported by openpyxl")
 
+        if pd.read_excel.keywords["engine"] is None:
+            # GH 35029
+            pytest.xfail("Defaults to openpyxl, maybe not supported")
+
         result = pd.read_excel("testdateoverflow" + read_ext)
         tm.assert_frame_equal(result, expected)
 
@@ -1153,12 +1157,13 @@ def test_excel_read_binary(self, engine, read_ext):
         actual = pd.read_excel(data, engine=engine)
         tm.assert_frame_equal(expected, actual)
 
+    @td.skip_if_no("xlrd")
     def test_excel_high_surrogate(self, engine):
         # GH 23809
         expected = pd.DataFrame(["\udc88"], columns=["Column1"])
 
         # should not produce a segmentation violation
-        actual = pd.read_excel("high_surrogate.xlsx")
+        actual = pd.read_excel("high_surrogate.xlsx", engine="xlrd")
         tm.assert_frame_equal(expected, actual)
 
     @pytest.mark.parametrize("filename", ["df_empty.xlsx", "df_equals.xlsx"])
diff --git a/pandas/tests/io/excel/test_writers.py b/pandas/tests/io/excel/test_writers.py
@@ -351,12 +351,15 @@ def test_excel_sheet_by_name_raise(self, path, engine):
             msg = "sheet 0 not found"
             with pytest.raises(ValueError, match=msg):
                 pd.read_excel(xl, "0")
-        else:
+        elif engine == "xlwt":
             import xlrd
 
             msg = "No sheet named <'0'>"
             with pytest.raises(xlrd.XLRDError, match=msg):
                 pd.read_excel(xl, sheet_name="0")
+        else:
+            with pytest.raises(KeyError, match="Worksheet 0 does not exist."):
+                pd.read_excel(xl, sheet_name="0")
 
     def test_excel_writer_context_manager(self, frame, path):
         with ExcelWriter(path) as writer:
@@ -1195,7 +1198,9 @@ def test_datetimes(self, path):
 
         write_frame = DataFrame({"A": datetimes})
         write_frame.to_excel(path, "Sheet1")
-        read_frame = pd.read_excel(path, sheet_name="Sheet1", header=0)
+        # GH 35029 - Default changed to openpyxl, but test is for odf/xlrd
+        engine = "odf" if path.endswith("ods") else "xlrd"
+        read_frame = pd.read_excel(path, sheet_name="Sheet1", header=0, engine=engine)
 
         tm.assert_series_equal(write_frame["A"], read_frame["A"])
 
diff --git a/pandas/tests/io/excel/test_xlrd.py b/pandas/tests/io/excel/test_xlrd.py
@@ -1,5 +1,7 @@
 import pytest
 
+from pandas.compat._optional import import_optional_dependency
+
 import pandas as pd
 import pandas._testing as tm
 
@@ -38,6 +40,48 @@ def test_read_xlrd_book(read_ext, frame):
 # TODO: test for openpyxl as well
 def test_excel_table_sheet_by_index(datapath, read_ext):
     path = datapath("io", "data", "excel", f"test1{read_ext}")
-    with pd.ExcelFile(path) as excel:
+    with pd.ExcelFile(path, engine="xlrd") as excel:
         with pytest.raises(xlrd.XLRDError):
             pd.read_excel(excel, sheet_name="asdf")
+
+
+def test_excel_file_warning_with_xlsx_file(datapath):
+    # GH 29375
+    path = datapath("io", "data", "excel", "test1.xlsx")
+    has_openpyxl = (
+        import_optional_dependency(
+            "openpyxl", raise_on_missing=False, on_version="ignore"
+        )
+        is not None
+    )
+    if not has_openpyxl:
+        with tm.assert_produces_warning(
+            FutureWarning,
+            raise_on_extra_warnings=False,
+            match="The xlrd engine is no longer maintained",
+        ):
+            ExcelFile(path, engine=None)
+    else:
+        with tm.assert_produces_warning(None):
+            pd.read_excel(path, "Sheet1", engine=None)
+
+
+def test_read_excel_warning_with_xlsx_file(tmpdir, datapath):
+    # GH 29375
+    path = datapath("io", "data", "excel", "test1.xlsx")
+    has_openpyxl = (
+        import_optional_dependency(
+            "openpyxl", raise_on_missing=False, on_version="ignore"
+        )
+        is not None
+    )
+    if not has_openpyxl:
+        with tm.assert_produces_warning(
+            FutureWarning,
+            raise_on_extra_warnings=False,
+            match="The xlrd engine is no longer maintained",
+        ):
+            pd.read_excel(path, "Sheet1", engine=None)
+    else:
+        with tm.assert_produces_warning(None):
+            pd.read_excel(path, "Sheet1", engine=None)