From 3a76a3643f293fe842bdd126fd5d436c584b0941 Mon Sep 17 00:00:00 2001 From: Mike Robinson Date: Sat, 2 Nov 2019 18:16:11 +0000 Subject: [PATCH 01/14] Deprecate using `xlrd` engine and change default engine to read excel files to openpyxl --- doc/source/whatsnew/v1.2.0.rst | 2 ++ pandas/io/excel/_base.py | 18 ++++++++++--- pandas/io/excel/_openpyxl.py | 7 ++++- pandas/tests/io/excel/test_readers.py | 37 ++++++++++++++++++--------- pandas/tests/io/excel/test_writers.py | 18 +++++++++++-- pandas/tests/io/excel/test_xlrd.py | 28 +++++++++++++++++++- 6 files changed, 90 insertions(+), 20 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 55570341cf4e8..47a8cd40fe7fe 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -144,6 +144,8 @@ Deprecations ~~~~~~~~~~~~ - Deprecated parameter ``inplace`` in :meth:`MultiIndex.set_codes` and :meth:`MultiIndex.set_levels` (:issue:`35626`) - Deprecated parameter ``dtype`` in :~meth:`Index.copy` on method all index classes. Use the :meth:`Index.astype` method instead for changing dtype(:issue:`35853`) +- :func:`read_excel` default engine "xlrd" is replaced by "openpyxl" because "xlrd" is deprecated (:issue:`28547`). +- - .. --------------------------------------------------------------------------- diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 3cd0d721bbdc6..436d86d02a8d2 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -4,6 +4,7 @@ import os from textwrap import fill from typing import Any, Mapping, Union +import warnings from pandas._config import config @@ -825,8 +826,7 @@ def _is_ods_stream(stream: Union[BufferedIOBase, RawIOBase]) -> bool: class ExcelFile: """ Class for parsing tabular excel sheets into DataFrame objects. - - Uses xlrd engine by default. See read_excel for more documentation + Uses xlrd, openpyxl or odf. 
See read_excel for more documentation Parameters ---------- @@ -837,7 +837,7 @@ class ExcelFile: engine : str, default None If io is not a buffer or path, this must be set to identify io. Supported engines: ``xlrd``, ``openpyxl``, ``odf``, ``pyxlsb``, - default ``xlrd``. + default ``openpyxl``, ``xlrd`` for .xls files, ``odf`` for .ods files. Engine compatibility : - ``xlrd`` supports most old/new Excel file formats. - ``openpyxl`` supports newer Excel file formats. @@ -861,7 +861,7 @@ def __init__( self, path_or_buffer, engine=None, storage_options: StorageOptions = None ): if engine is None: - engine = "xlrd" + engine = "openpyxl" if isinstance(path_or_buffer, (BufferedIOBase, RawIOBase)): if _is_ods_stream(path_or_buffer): engine = "odf" @@ -869,6 +869,16 @@ def __init__( ext = os.path.splitext(str(path_or_buffer))[-1] if ext == ".ods": engine = "odf" + elif ext == ".xls": + engine = "xlrd" + + elif engine == "xlrd": + warnings.warn( + 'The Excel reader engine "xlrd" is deprecated, use "openpyxl" instead. 
' + 'Specify engine="openpyxl" to suppress this warning.', + FutureWarning, + stacklevel=2, + ) if engine not in self._engines: raise ValueError(f"Unknown engine: {engine}") diff --git a/pandas/io/excel/_openpyxl.py b/pandas/io/excel/_openpyxl.py index c2730536af8a3..94f15efe8b0e5 100644 --- a/pandas/io/excel/_openpyxl.py +++ b/pandas/io/excel/_openpyxl.py @@ -1,3 +1,4 @@ +from datetime import datetime from typing import List import numpy as np @@ -517,7 +518,11 @@ def _convert_cell(self, cell, convert_float: bool) -> Scalar: # TODO: replace with openpyxl constants if cell.is_date: - return cell.value + try: + # workaround for inaccurate timestamp notation in excel + return datetime.fromtimestamp(round(cell.value.timestamp())) + except (AttributeError, OSError): + return cell.value elif cell.data_type == "e": return np.nan elif cell.data_type == "b": diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index 431a50477fccc..858b41dae4020 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -22,6 +22,9 @@ marks=[ td.skip_if_no("xlrd"), pytest.mark.filterwarnings("ignore:.*(tree\\.iter|html argument)"), + pytest.mark.filterwarnings( + 'ignore:The Excel reader engine "xlrd" is deprecated,' + ), ], ), pytest.param( @@ -34,8 +37,8 @@ pytest.param( None, marks=[ - td.skip_if_no("xlrd"), - pytest.mark.filterwarnings("ignore:.*(tree\\.iter|html argument)"), + td.skip_if_no("openpyxl"), + pytest.mark.filterwarnings("ignore:.*html argument"), ], ), pytest.param("pyxlsb", marks=td.skip_if_no("pyxlsb")), @@ -51,6 +54,8 @@ def _is_valid_engine_ext_pair(engine, read_ext: str) -> bool: engine = engine.values[0] if engine == "openpyxl" and read_ext == ".xls": return False + if engine is None and read_ext == ".xls": + return False if engine == "odf" and read_ext != ".ods": return False if read_ext == ".ods" and engine != "odf": @@ -559,7 +564,7 @@ def test_date_conversion_overflow(self, read_ext): 
columns=["DateColWithBigInt", "StringCol"], ) - if pd.read_excel.keywords["engine"] == "openpyxl": + if pd.read_excel.keywords["engine"] in ["openpyxl", None]: pytest.xfail("Maybe not supported by openpyxl") result = pd.read_excel("testdateoverflow" + read_ext) @@ -942,7 +947,10 @@ def test_read_excel_squeeze(self, read_ext): expected = pd.Series([1, 2, 3], name="a") tm.assert_series_equal(actual, expected) - def test_deprecated_kwargs(self, read_ext): + def test_deprecated_kwargs(self, engine, read_ext): + if engine == "xlrd": + pytest.skip("Use of xlrd engine produces a FutureWarning as well") + with tm.assert_produces_warning(FutureWarning, raise_on_extra_warnings=False): pd.read_excel("test1" + read_ext, "Sheet1", 0) @@ -961,6 +969,19 @@ def test_no_header_with_list_index_col(self, read_ext): ) tm.assert_frame_equal(expected, result) + def test_excel_high_surrogate(self, engine, read_ext): + # GH 23809 + if read_ext != ".xlsx": + pytest.skip("Test is only applicable to .xlsx file") + if engine in ["openpyxl", None]: + pytest.skip("Test does not work for openpyxl") + + expected = pd.DataFrame(["\udc88"], columns=["Column1"]) + + # should not produce a segmentation violation + actual = pd.read_excel("high_surrogate.xlsx") + tm.assert_frame_equal(expected, actual) + class TestExcelFileRead: @pytest.fixture(autouse=True) @@ -1116,14 +1137,6 @@ def test_excel_read_binary(self, engine, read_ext): actual = pd.read_excel(data, engine=engine) tm.assert_frame_equal(expected, actual) - def test_excel_high_surrogate(self, engine): - # GH 23809 - expected = pd.DataFrame(["\udc88"], columns=["Column1"]) - - # should not produce a segmentation violation - actual = pd.read_excel("high_surrogate.xlsx") - tm.assert_frame_equal(expected, actual) - @pytest.mark.parametrize("filename", ["df_empty.xlsx", "df_equals.xlsx"]) def test_header_with_index_col(self, engine, filename): # GH 33476 diff --git a/pandas/tests/io/excel/test_writers.py b/pandas/tests/io/excel/test_writers.py 
index e3ee53b63e102..1063834b8367c 100644 --- a/pandas/tests/io/excel/test_writers.py +++ b/pandas/tests/io/excel/test_writers.py @@ -351,12 +351,16 @@ def test_excel_sheet_by_name_raise(self, path, engine): msg = "sheet 0 not found" with pytest.raises(ValueError, match=msg): pd.read_excel(xl, "0") - else: + elif engine == "xlwt": import xlrd msg = "No sheet named <'0'>" with pytest.raises(xlrd.XLRDError, match=msg): pd.read_excel(xl, sheet_name="0") + else: # openpyxl + msg = "Worksheet 0 does not exist." + with pytest.raises(KeyError, match=msg): + pd.read_excel(xl, sheet_name="0") def test_excel_writer_context_manager(self, frame, path): with ExcelWriter(path) as writer: @@ -1199,6 +1203,9 @@ def test_datetimes(self, path): tm.assert_series_equal(write_frame["A"], read_frame["A"]) + @pytest.mark.filterwarnings( + 'ignore:The Excel reader engine "xlrd" is deprecated:FutureWarning' + ) def test_bytes_io(self, engine): # see gh-7074 bio = BytesIO() @@ -1209,8 +1216,15 @@ def test_bytes_io(self, engine): df.to_excel(writer) writer.save() + if engine == "xlwt": + read_engine = "xlrd" + elif engine == "xlsxwriter": + read_engine = "openpyxl" + else: + read_engine = engine + bio.seek(0) - reread_df = pd.read_excel(bio, index_col=0) + reread_df = pd.read_excel(bio, index_col=0, engine=read_engine) tm.assert_frame_equal(df, reread_df) def test_write_lists_dict(self, path): diff --git a/pandas/tests/io/excel/test_xlrd.py b/pandas/tests/io/excel/test_xlrd.py index 1c9c514b20f46..ec9386f367245 100644 --- a/pandas/tests/io/excel/test_xlrd.py +++ b/pandas/tests/io/excel/test_xlrd.py @@ -17,6 +17,9 @@ def skip_ods_and_xlsb_files(read_ext): pytest.skip("Not valid for xlrd") +@pytest.mark.filterwarnings( + 'ignore:The Excel reader engine "xlrd" is deprecated:FutureWarning' +) def test_read_xlrd_book(read_ext, frame): df = frame @@ -36,8 +39,31 @@ def test_read_xlrd_book(read_ext, frame): # TODO: test for openpyxl as well +@pytest.mark.filterwarnings( + 'ignore:The Excel reader 
engine "xlrd" is deprecated:FutureWarning' +) def test_excel_table_sheet_by_index(datapath, read_ext): path = datapath("io", "data", "excel", f"test1{read_ext}") - with pd.ExcelFile(path) as excel: + with pd.ExcelFile(path, engine="xlrd") as excel: with pytest.raises(xlrd.XLRDError): pd.read_excel(excel, sheet_name="asdf") + + +def test_excel_file_warning_with_xlsx_file(datapath): + # GH 29375 + path = datapath("io", "data", "excel", "test1.xlsx") + with tm.assert_produces_warning( + FutureWarning, check_stacklevel=True, raise_on_extra_warnings=False + ) as w: + pd.ExcelFile(path, engine="xlrd") + assert '"xlrd" is deprecated, use "openpyxl" instead.' in str(w[0].message) + + +def test_read_excel_warning_with_xlsx_file(tmpdir, datapath): + # GH 29375 + path = datapath("io", "data", "excel", "test1.xlsx") + with tm.assert_produces_warning( + FutureWarning, check_stacklevel=False, raise_on_extra_warnings=False + ) as w: + pd.read_excel(path, "Sheet1", engine="xlrd") + assert '"xlrd" is deprecated, use "openpyxl" instead.' in str(w[0].message) From 101aa9766daabf57dca57882dbd0232b29450a38 Mon Sep 17 00:00:00 2001 From: Robert de Vries Date: Sun, 23 Aug 2020 17:18:17 +0200 Subject: [PATCH 02/14] Revert all changes related to switching to openpyxl as the default --- doc/source/whatsnew/v1.2.0.rst | 2 +- pandas/io/excel/_base.py | 18 +++++++++-------- pandas/tests/io/excel/test_readers.py | 29 ++++++++++----------------- pandas/tests/io/excel/test_writers.py | 15 ++------------ 4 files changed, 24 insertions(+), 40 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 47a8cd40fe7fe..b12ac3c139dea 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -144,7 +144,7 @@ Deprecations ~~~~~~~~~~~~ - Deprecated parameter ``inplace`` in :meth:`MultiIndex.set_codes` and :meth:`MultiIndex.set_levels` (:issue:`35626`) - Deprecated parameter ``dtype`` in :~meth:`Index.copy` on method all index classes. 
Use the :meth:`Index.astype` method instead for changing dtype(:issue:`35853`) -- :func:`read_excel` default engine "xlrd" is replaced by "openpyxl" because "xlrd" is deprecated (:issue:`28547`). +- :func:`read_excel` "xlrd" engine is deprecated for all file types that can be handled by "openpyxl" because "xlrd" is no longer maintained (:issue:`28547`). - - diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 436d86d02a8d2..2c6e50e8ecd79 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -826,7 +826,8 @@ def _is_ods_stream(stream: Union[BufferedIOBase, RawIOBase]) -> bool: class ExcelFile: """ Class for parsing tabular excel sheets into DataFrame objects. - Uses xlrd, openpyxl or odf. See read_excel for more documentation + + Uses xlrd engine by default. See read_excel for more documentation Parameters ---------- @@ -837,7 +838,7 @@ class ExcelFile: engine : str, default None If io is not a buffer or path, this must be set to identify io. Supported engines: ``xlrd``, ``openpyxl``, ``odf``, ``pyxlsb``, - default ``openpyxl``, ``xlrd`` for .xls files, ``odf`` for .ods files. + default ``xlrd`` for .xls* files, ``odf`` for .ods files. Engine compatibility : - ``xlrd`` supports most old/new Excel file formats. - ``openpyxl`` supports newer Excel file formats. 
@@ -860,19 +861,20 @@ class ExcelFile: def __init__( self, path_or_buffer, engine=None, storage_options: StorageOptions = None ): + ext = None + if not isinstance(path_or_buffer, (BufferedIOBase, RawIOBase)): + ext = os.path.splitext(str(path_or_buffer))[-1][1:] + if engine is None: - engine = "openpyxl" + engine = "xlrd" if isinstance(path_or_buffer, (BufferedIOBase, RawIOBase)): if _is_ods_stream(path_or_buffer): engine = "odf" else: - ext = os.path.splitext(str(path_or_buffer))[-1] - if ext == ".ods": + if ext == "ods": engine = "odf" - elif ext == ".xls": - engine = "xlrd" - elif engine == "xlrd": + elif engine == "xlrd" and ext in ("xlsx", "xlsm"): warnings.warn( 'The Excel reader engine "xlrd" is deprecated, use "openpyxl" instead. ' 'Specify engine="openpyxl" to suppress this warning.', diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index 858b41dae4020..8bbdcbe12f192 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -37,8 +37,8 @@ pytest.param( None, marks=[ - td.skip_if_no("openpyxl"), - pytest.mark.filterwarnings("ignore:.*html argument"), + td.skip_if_no("xlrd"), + pytest.mark.filterwarnings("ignore:.*(tree\\.iter|html argument)"), ], ), pytest.param("pyxlsb", marks=td.skip_if_no("pyxlsb")), @@ -54,8 +54,6 @@ def _is_valid_engine_ext_pair(engine, read_ext: str) -> bool: engine = engine.values[0] if engine == "openpyxl" and read_ext == ".xls": return False - if engine is None and read_ext == ".xls": - return False if engine == "odf" and read_ext != ".ods": return False if read_ext == ".ods" and engine != "odf": @@ -564,7 +562,7 @@ def test_date_conversion_overflow(self, read_ext): columns=["DateColWithBigInt", "StringCol"], ) - if pd.read_excel.keywords["engine"] in ["openpyxl", None]: + if pd.read_excel.keywords["engine"] == "openpyxl": pytest.xfail("Maybe not supported by openpyxl") result = pd.read_excel("testdateoverflow" + read_ext) @@ -969,19 +967,6 @@ def 
test_no_header_with_list_index_col(self, read_ext): ) tm.assert_frame_equal(expected, result) - def test_excel_high_surrogate(self, engine, read_ext): - # GH 23809 - if read_ext != ".xlsx": - pytest.skip("Test is only applicable to .xlsx file") - if engine in ["openpyxl", None]: - pytest.skip("Test does not work for openpyxl") - - expected = pd.DataFrame(["\udc88"], columns=["Column1"]) - - # should not produce a segmentation violation - actual = pd.read_excel("high_surrogate.xlsx") - tm.assert_frame_equal(expected, actual) - class TestExcelFileRead: @pytest.fixture(autouse=True) @@ -1137,6 +1122,14 @@ def test_excel_read_binary(self, engine, read_ext): actual = pd.read_excel(data, engine=engine) tm.assert_frame_equal(expected, actual) + def test_excel_high_surrogate(self, engine): + # GH 23809 + expected = pd.DataFrame(["\udc88"], columns=["Column1"]) + + # should not produce a segmentation violation + actual = pd.read_excel("high_surrogate.xlsx") + tm.assert_frame_equal(expected, actual) + @pytest.mark.parametrize("filename", ["df_empty.xlsx", "df_equals.xlsx"]) def test_header_with_index_col(self, engine, filename): # GH 33476 diff --git a/pandas/tests/io/excel/test_writers.py b/pandas/tests/io/excel/test_writers.py index 1063834b8367c..461ddbbbc77e3 100644 --- a/pandas/tests/io/excel/test_writers.py +++ b/pandas/tests/io/excel/test_writers.py @@ -351,16 +351,12 @@ def test_excel_sheet_by_name_raise(self, path, engine): msg = "sheet 0 not found" with pytest.raises(ValueError, match=msg): pd.read_excel(xl, "0") - elif engine == "xlwt": + else: import xlrd msg = "No sheet named <'0'>" with pytest.raises(xlrd.XLRDError, match=msg): pd.read_excel(xl, sheet_name="0") - else: # openpyxl - msg = "Worksheet 0 does not exist." 
- with pytest.raises(KeyError, match=msg): - pd.read_excel(xl, sheet_name="0") def test_excel_writer_context_manager(self, frame, path): with ExcelWriter(path) as writer: @@ -1216,15 +1212,8 @@ def test_bytes_io(self, engine): df.to_excel(writer) writer.save() - if engine == "xlwt": - read_engine = "xlrd" - elif engine == "xlsxwriter": - read_engine = "openpyxl" - else: - read_engine = engine - bio.seek(0) - reread_df = pd.read_excel(bio, index_col=0, engine=read_engine) + reread_df = pd.read_excel(bio, index_col=0) tm.assert_frame_equal(df, reread_df) def test_write_lists_dict(self, path): From 081ecf839117119916ceef79da6b3754c286a480 Mon Sep 17 00:00:00 2001 From: Robert de Vries Date: Wed, 26 Aug 2020 10:59:39 +0200 Subject: [PATCH 03/14] Reword whatsnew message for the benefit of end users. --- doc/source/whatsnew/v1.2.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index b12ac3c139dea..7ccb1e84c2d59 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -144,7 +144,7 @@ Deprecations ~~~~~~~~~~~~ - Deprecated parameter ``inplace`` in :meth:`MultiIndex.set_codes` and :meth:`MultiIndex.set_levels` (:issue:`35626`) - Deprecated parameter ``dtype`` in :~meth:`Index.copy` on method all index classes. Use the :meth:`Index.astype` method instead for changing dtype(:issue:`35853`) -- :func:`read_excel` "xlrd" engine is deprecated for all file types that can be handled by "openpyxl" because "xlrd" is no longer maintained (:issue:`28547`). +- :func:`read_excel` "xlrd" engine is deprecated. The recommended engine is "openpyxl" for "xlsx" and "xlsm" files, because "xlrd" is no longer maintained (:issue:`28547`). 
- - From 32333817a8c4ff0ac644bd6f65f3b6f69a46396d Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Wed, 25 Nov 2020 18:56:59 -0500 Subject: [PATCH 04/14] Fixed FutureWarning emitting logic, reverted openpyxl workaround --- doc/source/whatsnew/v1.2.0.rst | 2 +- pandas/io/excel/_base.py | 35 +++++++++++++++------------ pandas/io/excel/_openpyxl.py | 7 +----- pandas/tests/io/excel/test_readers.py | 9 +------ pandas/tests/io/excel/test_writers.py | 3 --- pandas/tests/io/excel/test_xlrd.py | 24 ++++++++---------- 6 files changed, 33 insertions(+), 47 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 7f3c0bc916650..1a48a4de223de 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -484,7 +484,7 @@ Deprecations - Deprecated :meth:`Index.asi8` for :class:`Index` subclasses other than :class:`DatetimeIndex`, :class:`TimedeltaIndex`, and :class:`PeriodIndex` (:issue:`37877`) - The ``inplace`` parameter of :meth:`Categorical.remove_unused_categories` is deprecated and will be removed in a future version (:issue:`37643`) - The ``null_counts`` parameter of :meth:`DataFrame.info` is deprecated and replaced by ``show_counts``. It will be removed in a future version (:issue:`37999`) -- :func:`read_excel` "xlrd" engine is deprecated. The recommended engine is "openpyxl" for "xlsx" and "xlsm" files, because "xlrd" is no longer maintained (:issue:`28547`). +- Deprecated the default argument ``engine=None`` of the function :func:`read_excel`, which uses the no longer maintained xlrd engine. Not specifying the engine will raise a ``FutureWarning``. This argument will default to ``"openpyxl"`` in a future version, which is now the recommended engine for xlsx and xlsm files (:issue:`28547`) .. 
--------------------------------------------------------------------------- diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 5f9c973a98a8b..cc2e5937d1bbc 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -102,10 +102,15 @@ If io is not a buffer or path, this must be set to identify io. Supported engines: "xlrd", "openpyxl", "odf", "pyxlsb", default "xlrd". Engine compatibility : - - "xlrd" supports most old/new Excel file formats. + - "xlrd" supports most old/new Excel file formats but is no longer maintained. - "openpyxl" supports newer Excel file formats. - "odf" supports OpenDocument file formats (.odf, .ods, .odt). - "pyxlsb" supports Binary Excel files. + + .. deprecated:: 1.2.0 + The default value ``None`` is deprecated and will be changed to ``"openpyxl"`` + in a future version. Not specifying an engine will raise a FutureWarning. + converters : dict, default None Dict of functions for converting values in certain columns. Keys can either be integers or column labels, values are functions that take one @@ -881,10 +886,15 @@ class ExcelFile: Supported engines: ``xlrd``, ``openpyxl``, ``odf``, ``pyxlsb``, default ``xlrd`` for .xls* files, ``odf`` for .ods files. Engine compatibility : - - ``xlrd`` supports most old/new Excel file formats. + - ``xlrd`` supports most old/new Excel file formats but is no longer maintained. - ``openpyxl`` supports newer Excel file formats. - ``odf`` supports OpenDocument file formats (.odf, .ods, .odt). - ``pyxlsb`` supports Binary Excel files. + + .. deprecated:: 1.2.0 + The default value ``None`` is deprecated and will be changed to + ``"openpyxl"`` in a future version. Not specifying an engine will + raise a FutureWarning. 
""" from pandas.io.excel._odfreader import ODFReader @@ -902,26 +912,21 @@ class ExcelFile: def __init__( self, path_or_buffer, engine=None, storage_options: StorageOptions = None ): - ext = None - if not isinstance(path_or_buffer, (BufferedIOBase, RawIOBase)): - ext = os.path.splitext(str(path_or_buffer))[-1][1:] - if engine is None: + warnings.warn( + "The default argument engine=None is deprecated. " + "Specify the engine argument to suppress this warning.", + FutureWarning, + stacklevel=4, + ) engine = "xlrd" if isinstance(path_or_buffer, (BufferedIOBase, RawIOBase)): if _is_ods_stream(path_or_buffer): engine = "odf" else: - if ext == "ods": + ext = os.path.splitext(str(path_or_buffer))[-1] + if ext == ".ods": engine = "odf" - - elif engine == "xlrd" and ext in ("xlsx", "xlsm"): - warnings.warn( - 'The Excel reader engine "xlrd" is deprecated, use "openpyxl" instead. ' - 'Specify engine="openpyxl" to suppress this warning.', - FutureWarning, - stacklevel=2, - ) if engine not in self._engines: raise ValueError(f"Unknown engine: {engine}") diff --git a/pandas/io/excel/_openpyxl.py b/pandas/io/excel/_openpyxl.py index 7678ea970b902..7de958df206d5 100644 --- a/pandas/io/excel/_openpyxl.py +++ b/pandas/io/excel/_openpyxl.py @@ -1,4 +1,3 @@ -from datetime import datetime from typing import TYPE_CHECKING, Dict, List, Optional import numpy as np @@ -503,11 +502,7 @@ def _convert_cell(self, cell, convert_float: bool) -> Scalar: from openpyxl.cell.cell import TYPE_BOOL, TYPE_ERROR, TYPE_NUMERIC if cell.is_date: - try: - # workaround for inaccurate timestamp notation in excel - return datetime.fromtimestamp(round(cell.value.timestamp())) - except (AttributeError, OSError): - return cell.value + return cell.value elif cell.data_type == TYPE_ERROR: return np.nan elif cell.data_type == TYPE_BOOL: diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index b53e330df055a..c582a0fa23577 100644 --- a/pandas/tests/io/excel/test_readers.py +++ 
b/pandas/tests/io/excel/test_readers.py @@ -21,10 +21,6 @@ "xlrd", marks=[ td.skip_if_no("xlrd"), - pytest.mark.filterwarnings("ignore:.*(tree\\.iter|html argument)"), - pytest.mark.filterwarnings( - 'ignore:The Excel reader engine "xlrd" is deprecated,' - ), ], ), pytest.param( @@ -984,10 +980,7 @@ def test_read_excel_squeeze(self, read_ext): expected = Series([1, 2, 3], name="a") tm.assert_series_equal(actual, expected) - def test_deprecated_kwargs(self, engine, read_ext): - if engine == "xlrd": - pytest.skip("Use of xlrd engine produces a FutureWarning as well") - + def test_deprecated_kwargs(self, read_ext): with tm.assert_produces_warning(FutureWarning, raise_on_extra_warnings=False): pd.read_excel("test1" + read_ext, "Sheet1", 0) diff --git a/pandas/tests/io/excel/test_writers.py b/pandas/tests/io/excel/test_writers.py index 91c484378145e..8da9c79160e91 100644 --- a/pandas/tests/io/excel/test_writers.py +++ b/pandas/tests/io/excel/test_writers.py @@ -1197,9 +1197,6 @@ def test_datetimes(self, path): tm.assert_series_equal(write_frame["A"], read_frame["A"]) - @pytest.mark.filterwarnings( - 'ignore:The Excel reader engine "xlrd" is deprecated:FutureWarning' - ) def test_bytes_io(self, engine): # see gh-7074 bio = BytesIO() diff --git a/pandas/tests/io/excel/test_xlrd.py b/pandas/tests/io/excel/test_xlrd.py index fc48ddaf89805..1524dea25aa62 100644 --- a/pandas/tests/io/excel/test_xlrd.py +++ b/pandas/tests/io/excel/test_xlrd.py @@ -17,9 +17,6 @@ def skip_ods_and_xlsb_files(read_ext): pytest.skip("Not valid for xlrd") -@pytest.mark.filterwarnings( - 'ignore:The Excel reader engine "xlrd" is deprecated:FutureWarning' -) def test_read_xlrd_book(read_ext, frame): df = frame @@ -39,9 +36,6 @@ def test_read_xlrd_book(read_ext, frame): # TODO: test for openpyxl as well -@pytest.mark.filterwarnings( - 'ignore:The Excel reader engine "xlrd" is deprecated:FutureWarning' -) def test_excel_table_sheet_by_index(datapath, read_ext): path = datapath("io", "data", "excel", 
f"test1{read_ext}") with ExcelFile(path, engine="xlrd") as excel: @@ -52,18 +46,20 @@ def test_excel_table_sheet_by_index(datapath, read_ext): def test_excel_file_warning_with_xlsx_file(datapath): # GH 29375 path = datapath("io", "data", "excel", "test1.xlsx") + # DeprecationWarning: "This method will be removed in future versions. + # Use 'tree.iter()' or 'list(tree.iter())' instead." with tm.assert_produces_warning( - FutureWarning, check_stacklevel=True, raise_on_extra_warnings=False - ) as w: - pd.ExcelFile(path, engine="xlrd") - assert '"xlrd" is deprecated, use "openpyxl" instead.' in str(w[0].message) + FutureWarning, check_stacklevel=False, raise_on_extra_warnings=False + ): + ExcelFile(path, engine=None) def test_read_excel_warning_with_xlsx_file(tmpdir, datapath): # GH 29375 path = datapath("io", "data", "excel", "test1.xlsx") + # DeprecationWarning: "This method will be removed in future versions. + # Use 'tree.iter()' or 'list(tree.iter())' instead." with tm.assert_produces_warning( - FutureWarning, check_stacklevel=False, raise_on_extra_warnings=False - ) as w: - pd.read_excel(path, "Sheet1", engine="xlrd") - assert '"xlrd" is deprecated, use "openpyxl" instead.' in str(w[0].message) + FutureWarning, check_stacklevel=True, raise_on_extra_warnings=False + ): + pd.read_excel(path, "Sheet1", engine=None) From 0f4c8a1987fb532aa8afe8531aca2eeeae7b7b7b Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Wed, 25 Nov 2020 19:27:03 -0500 Subject: [PATCH 05/14] Revert change to stacklevel --- pandas/io/excel/_base.py | 2 +- pandas/tests/io/excel/test_readers.py | 4 +++- pandas/tests/io/excel/test_xlrd.py | 4 ++-- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index cc2e5937d1bbc..0cdc769047be0 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -917,7 +917,7 @@ def __init__( "The default argument engine=None is deprecated. 
" "Specify the engine argument to suppress this warning.", FutureWarning, - stacklevel=4, + stacklevel=2, ) engine = "xlrd" if isinstance(path_or_buffer, (BufferedIOBase, RawIOBase)): diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index c582a0fa23577..af911c1deffb6 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -981,7 +981,9 @@ def test_read_excel_squeeze(self, read_ext): tm.assert_series_equal(actual, expected) def test_deprecated_kwargs(self, read_ext): - with tm.assert_produces_warning(FutureWarning, raise_on_extra_warnings=False): + with tm.assert_produces_warning( + FutureWarning, check_stacklevel=False, raise_on_extra_warnings=False + ): pd.read_excel("test1" + read_ext, "Sheet1", 0) pd.read_excel("test1" + read_ext) diff --git a/pandas/tests/io/excel/test_xlrd.py b/pandas/tests/io/excel/test_xlrd.py index 1524dea25aa62..108c55cf1948c 100644 --- a/pandas/tests/io/excel/test_xlrd.py +++ b/pandas/tests/io/excel/test_xlrd.py @@ -49,7 +49,7 @@ def test_excel_file_warning_with_xlsx_file(datapath): # DeprecationWarning: "This method will be removed in future versions. # Use 'tree.iter()' or 'list(tree.iter())' instead." with tm.assert_produces_warning( - FutureWarning, check_stacklevel=False, raise_on_extra_warnings=False + FutureWarning, check_stacklevel=True, raise_on_extra_warnings=False ): ExcelFile(path, engine=None) @@ -60,6 +60,6 @@ def test_read_excel_warning_with_xlsx_file(tmpdir, datapath): # DeprecationWarning: "This method will be removed in future versions. # Use 'tree.iter()' or 'list(tree.iter())' instead." 
with tm.assert_produces_warning( - FutureWarning, check_stacklevel=True, raise_on_extra_warnings=False + FutureWarning, check_stacklevel=False, raise_on_extra_warnings=False ): pd.read_excel(path, "Sheet1", engine=None) From 499f9a0cfaa81720c29bd37fe0f3da722f2879ac Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Fri, 27 Nov 2020 10:33:53 -0500 Subject: [PATCH 06/14] - --- .../intro_tutorials/02_read_write.rst | 2 +- pandas/io/excel/_base.py | 34 +++++++++++++++---- 2 files changed, 28 insertions(+), 8 deletions(-) diff --git a/doc/source/getting_started/intro_tutorials/02_read_write.rst b/doc/source/getting_started/intro_tutorials/02_read_write.rst index c9b6a12904311..59c25a8313eba 100644 --- a/doc/source/getting_started/intro_tutorials/02_read_write.rst +++ b/doc/source/getting_started/intro_tutorials/02_read_write.rst @@ -156,7 +156,7 @@ The equivalent read function :meth:`~DataFrame.read_excel` will reload the data .. ipython:: python - titanic = pd.read_excel("titanic.xlsx", sheet_name="passengers") + titanic = pd.read_excel("titanic.xlsx", sheet_name="passengers", engine="openpyxl") .. ipython:: python diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 0cdc769047be0..ea81155c02bee 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -27,6 +27,8 @@ ) from pandas.io.parsers import TextParser +from pandas.compat import PY39 + _read_excel_doc = ( """ Read an Excel file into a pandas DataFrame. @@ -102,6 +104,7 @@ If io is not a buffer or path, this must be set to identify io. Supported engines: "xlrd", "openpyxl", "odf", "pyxlsb", default "xlrd". Engine compatibility : + - "xlrd" supports most old/new Excel file formats but is no longer maintained. - "openpyxl" supports newer Excel file formats. - "odf" supports OpenDocument file formats (.odf, .ods, .odt). 
@@ -913,20 +916,37 @@ def __init__( self, path_or_buffer, engine=None, storage_options: StorageOptions = None ): if engine is None: - warnings.warn( - "The default argument engine=None is deprecated. " - "Specify the engine argument to suppress this warning.", - FutureWarning, - stacklevel=2, - ) - engine = "xlrd" + # xlrd doesn't support py39 + engine = "openpyxl" if PY39 else "xlrd" if isinstance(path_or_buffer, (BufferedIOBase, RawIOBase)): + ext = None if _is_ods_stream(path_or_buffer): engine = "odf" else: ext = os.path.splitext(str(path_or_buffer))[-1] if ext == ".ods": engine = "odf" + + if engine == "xlrd" and ext != ".xls" and not PY39: + import inspect + caller = inspect.stack()[1] + if ( + caller.filename.endswith("pandas/io/excel/_base.py") + and caller.function == "read_excel" + ): + stacklevel = 4 + else: + stacklevel = 2 + warnings.warn( + "The default argument engine=None is deprecated. Using None " + "defaults to the xlrd engine which is no longer maintained. " + "The default value will be 'openpyxl' in a future version of " + "pandas, although xlrd will continue to be allowed for the " + "indefinite future. 
Either install openpyxl and specify it as " + "the engine or specify 'xlrd' to suppress this warning.", + DeprecationWarning, + stacklevel=stacklevel, + ) if engine not in self._engines: raise ValueError(f"Unknown engine: {engine}") From 88093f60fcf888b761d269e7a2b70c9fbf426bd3 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sat, 28 Nov 2020 18:07:40 -0500 Subject: [PATCH 07/14] Changes from review - Added engine= throughout the docs - Warning suppressed for xls file or xlrd is not installed - Changed FutureWarning -> DeprecationWarning - Added "not supported for Python >= 3.9" note to docstrings and warning --- doc/source/user_guide/10min.rst | 4 ++- doc/source/user_guide/io.rst | 8 +++--- doc/source/whatsnew/v0.17.0.rst | 2 +- doc/source/whatsnew/v1.2.0.rst | 2 +- pandas/io/excel/_base.py | 39 +++++++++++++++++++-------- pandas/tests/io/excel/test_readers.py | 4 +-- pandas/tests/io/excel/test_xlrd.py | 4 +-- 7 files changed, 41 insertions(+), 22 deletions(-) diff --git a/doc/source/user_guide/10min.rst b/doc/source/user_guide/10min.rst index cf548ba5d1133..7a61e3e7c5e06 100644 --- a/doc/source/user_guide/10min.rst +++ b/doc/source/user_guide/10min.rst @@ -808,7 +808,9 @@ Reading from an excel file. .. ipython:: python - pd.read_excel("foo.xlsx", "Sheet1", index_col=None, na_values=["NA"]) + pd.read_excel( + "foo.xlsx", "Sheet1", index_col=None, na_values=["NA"], engine="openpyxl" + ) .. 
ipython:: python :suppress: diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 1bd35131622ab..582750861da94 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -2982,7 +2982,7 @@ For example, to read in a ``MultiIndex`` index without names: index=pd.MultiIndex.from_product([["a", "b"], ["c", "d"]]), ) df.to_excel("path_to_file.xlsx") - df = pd.read_excel("path_to_file.xlsx", index_col=[0, 1]) + df = pd.read_excel("path_to_file.xlsx", index_col=[0, 1], engine="openpyxl") df If the index has level names, they will parsed as well, using the same @@ -2992,7 +2992,7 @@ parameters. df.index = df.index.set_names(["lvl1", "lvl2"]) df.to_excel("path_to_file.xlsx") - df = pd.read_excel("path_to_file.xlsx", index_col=[0, 1]) + df = pd.read_excel("path_to_file.xlsx", index_col=[0, 1], engine="openpyxl") df @@ -3003,7 +3003,9 @@ should be passed to ``index_col`` and ``header``: df.columns = pd.MultiIndex.from_product([["a"], ["b", "d"]], names=["c1", "c2"]) df.to_excel("path_to_file.xlsx") - df = pd.read_excel("path_to_file.xlsx", index_col=[0, 1], header=[0, 1]) + df = pd.read_excel( + "path_to_file.xlsx", index_col=[0, 1], header=[0, 1], engine="openpyxl" + ) df .. ipython:: python diff --git a/doc/source/whatsnew/v0.17.0.rst b/doc/source/whatsnew/v0.17.0.rst index d8f39a7d6e3c0..a9492c2e1d226 100644 --- a/doc/source/whatsnew/v0.17.0.rst +++ b/doc/source/whatsnew/v0.17.0.rst @@ -308,7 +308,7 @@ See the :ref:`documentation ` for more details. df df.to_excel("test.xlsx") - df = pd.read_excel("test.xlsx", header=[0, 1], index_col=[0, 1]) + df = pd.read_excel("test.xlsx", header=[0, 1], index_col=[0, 1], engine="openpyxl") df .. 
ipython:: python diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 03caafa5cd77d..7d38184e1a946 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -487,7 +487,7 @@ Deprecations - Deprecated :meth:`Index.asi8` for :class:`Index` subclasses other than :class:`.DatetimeIndex`, :class:`.TimedeltaIndex`, and :class:`PeriodIndex` (:issue:`37877`) - The ``inplace`` parameter of :meth:`Categorical.remove_unused_categories` is deprecated and will be removed in a future version (:issue:`37643`) - The ``null_counts`` parameter of :meth:`DataFrame.info` is deprecated and replaced by ``show_counts``. It will be removed in a future version (:issue:`37999`) -- Deprecated the default argument ``engine=None`` of the function :func:`read_excel`, which uses the no longer maintained xlrd engine. Not specifying the engine will raise a ``FutureWarning``. This argument will default to ``"openpyxl"`` in a future version, which is now the recommended engine for xlsx and xlsm files (:issue:`28547`) +- Deprecated the default argument ``engine=None`` of the function :func:`read_excel`, which uses the no longer maintained xlrd engine. Not specifying the engine will raise a ``DeprecationWarning``. This argument will default to ``"openpyxl"`` in a future version, which is now the recommended engine for xlsx and xlsm files (:issue:`28547`) .. 
--------------------------------------------------------------------------- diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index ea81155c02bee..0b3050d58efcc 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -10,6 +10,7 @@ from pandas._libs.parsers import STR_NA_VALUES from pandas._typing import Buffer, FilePathOrBuffer, StorageOptions +from pandas.compat._optional import import_optional_dependency from pandas.errors import EmptyDataError from pandas.util._decorators import Appender, deprecate_nonkeyword_arguments @@ -27,8 +28,6 @@ ) from pandas.io.parsers import TextParser -from pandas.compat import PY39 - _read_excel_doc = ( """ Read an Excel file into a pandas DataFrame. @@ -104,15 +103,16 @@ If io is not a buffer or path, this must be set to identify io. Supported engines: "xlrd", "openpyxl", "odf", "pyxlsb", default "xlrd". Engine compatibility : - - - "xlrd" supports most old/new Excel file formats but is no longer maintained. + + - "xlrd" supports most old/new Excel file formats but is no longer maintained + and not supported with Python >= 3.9. - "openpyxl" supports newer Excel file formats. - "odf" supports OpenDocument file formats (.odf, .ods, .odt). - "pyxlsb" supports Binary Excel files. .. deprecated:: 1.2.0 The default value ``None`` is deprecated and will be changed to ``"openpyxl"`` - in a future version. Not specifying an engine will raise a FutureWarning. + in a future version. Not specifying an engine will raise a DeprecationWarning. converters : dict, default None Dict of functions for converting values in certain columns. Keys can @@ -889,7 +889,9 @@ class ExcelFile: Supported engines: ``xlrd``, ``openpyxl``, ``odf``, ``pyxlsb``, default ``xlrd`` for .xls* files, ``odf`` for .ods files. Engine compatibility : - - ``xlrd`` supports most old/new Excel file formats but is no longer maintained. 
+ + - ``xlrd`` supports most old/new Excel file formats but is no longer maintained + and is not supported with Python >= 3.9. - ``openpyxl`` supports newer Excel file formats. - ``odf`` supports OpenDocument file formats (.odf, .ods, .odt). - ``pyxlsb`` supports Binary Excel files. @@ -897,7 +899,7 @@ class ExcelFile: .. deprecated:: 1.2.0 The default value ``None`` is deprecated and will be changed to ``"openpyxl"`` in a future version. Not specifying an engine will - raise a FutureWarning. + raise a DeprecationWarning. """ from pandas.io.excel._odfreader import ODFReader @@ -916,8 +918,9 @@ def __init__( self, path_or_buffer, engine=None, storage_options: StorageOptions = None ): if engine is None: - # xlrd doesn't support py39 - engine = "openpyxl" if PY39 else "xlrd" + engine = "xlrd" + + # Determine ext and use odf for ods stream/file if isinstance(path_or_buffer, (BufferedIOBase, RawIOBase)): ext = None if _is_ods_stream(path_or_buffer): @@ -927,8 +930,21 @@ def __init__( if ext == ".ods": engine = "odf" - if engine == "xlrd" and ext != ".xls" and not PY39: + # GH 35029 - Default to openpyxl if xlrd is not installed for non-xls + if ( + engine == "xlrd" + and ext != ".xls" + and import_optional_dependency( + "xlrd", raise_on_missing=False, on_version="ignore" + ) + is None + ): + engine = "openpyxl" + + # GH 35029 - Don't warn with xls files as only xlrd can read them + if engine == "xlrd" and ext != ".xls": import inspect + caller = inspect.stack()[1] if ( caller.filename.endswith("pandas/io/excel/_base.py") @@ -939,7 +955,8 @@ def __init__( stacklevel = 2 warnings.warn( "The default argument engine=None is deprecated. Using None " - "defaults to the xlrd engine which is no longer maintained. " + "defaults to the xlrd engine which is no longer maintained, " + "and is not supported when using pandas with python >= 3.9. 
" "The default value will be 'openpyxl' in a future version of " "pandas, although xlrd will continue to be allowed for the " "indefinite future. Either install openpyxl and specify it as " diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index af911c1deffb6..c582a0fa23577 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -981,9 +981,7 @@ def test_read_excel_squeeze(self, read_ext): tm.assert_series_equal(actual, expected) def test_deprecated_kwargs(self, read_ext): - with tm.assert_produces_warning( - FutureWarning, check_stacklevel=False, raise_on_extra_warnings=False - ): + with tm.assert_produces_warning(FutureWarning, raise_on_extra_warnings=False): pd.read_excel("test1" + read_ext, "Sheet1", 0) pd.read_excel("test1" + read_ext) diff --git a/pandas/tests/io/excel/test_xlrd.py b/pandas/tests/io/excel/test_xlrd.py index 108c55cf1948c..678214c9cabfa 100644 --- a/pandas/tests/io/excel/test_xlrd.py +++ b/pandas/tests/io/excel/test_xlrd.py @@ -49,7 +49,7 @@ def test_excel_file_warning_with_xlsx_file(datapath): # DeprecationWarning: "This method will be removed in future versions. # Use 'tree.iter()' or 'list(tree.iter())' instead." with tm.assert_produces_warning( - FutureWarning, check_stacklevel=True, raise_on_extra_warnings=False + DeprecationWarning, check_stacklevel=False, raise_on_extra_warnings=False ): ExcelFile(path, engine=None) @@ -60,6 +60,6 @@ def test_read_excel_warning_with_xlsx_file(tmpdir, datapath): # DeprecationWarning: "This method will be removed in future versions. # Use 'tree.iter()' or 'list(tree.iter())' instead." 
with tm.assert_produces_warning( - FutureWarning, check_stacklevel=False, raise_on_extra_warnings=False + DeprecationWarning, check_stacklevel=False, raise_on_extra_warnings=False ): pd.read_excel(path, "Sheet1", engine=None) From 44f157b08c2bdc822dc816522a60d30004d23360 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sun, 29 Nov 2020 11:38:37 -0500 Subject: [PATCH 08/14] DeprecationWarning -> FutureWarning; added warning to io.rst/whatsnew --- doc/source/user_guide/io.rst | 12 ++++++++++++ doc/source/whatsnew/v1.2.0.rst | 13 ++++++++++++- pandas/io/excel/_base.py | 6 +++--- pandas/tests/io/excel/test_xlrd.py | 8 ++------ 4 files changed, 29 insertions(+), 10 deletions(-) diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 582750861da94..c3217426f3e40 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -2832,6 +2832,18 @@ See the :ref:`cookbook` for some advanced strategies. Reading Excel files ''''''''''''''''''' +.. warning:: + + .. versionchanged:: 1.2.0 + + The default argument ``engine=None`` to ``pd.read_excel`` is + deprecated. Using None defaults to the xlrd engine which is no + longer maintained, and is not supported when using pandas with + python >= 3.9. The default value will be ``'openpyxl'`` in a future + version of pandas, although xlrd will continue to be allowed for the + indefinite future. Either install openpyxl and specify it as + the engine or specify ``'xlrd'`` to suppress this warning. + In the most basic use-case, ``read_excel`` takes a path to an Excel file, and the ``sheet_name`` indicating which sheet to parse. diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 7d38184e1a946..837582624ec19 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -8,6 +8,16 @@ including other versions of pandas. {{ header }} +.. warning:: + + The default argument ``engine=None`` to ``pd.read_excel`` is + deprecated. 
Using None defaults to the xlrd engine which is no + longer maintained, and is not supported when using pandas with + python >= 3.9. The default value will be ``'openpyxl'`` in a future + version of pandas, although xlrd will continue to be allowed for the + indefinite future. Either install openpyxl and specify it as + the engine or specify ``'xlrd'`` to suppress this warning. + .. --------------------------------------------------------------------------- Enhancements @@ -461,6 +471,7 @@ Other API changes Deprecations ~~~~~~~~~~~~ + - Deprecated parameter ``inplace`` in :meth:`MultiIndex.set_codes` and :meth:`MultiIndex.set_levels` (:issue:`35626`) - Deprecated parameter ``dtype`` of method :meth:`~Index.copy` for all :class:`Index` subclasses. Use the :meth:`~Index.astype` method instead for changing dtype (:issue:`35853`) - Deprecated parameters ``levels`` and ``codes`` in :meth:`MultiIndex.copy`. Use the :meth:`~MultiIndex.set_levels` and :meth:`~MultiIndex.set_codes` methods instead (:issue:`36685`) @@ -487,7 +498,7 @@ Deprecations - Deprecated :meth:`Index.asi8` for :class:`Index` subclasses other than :class:`.DatetimeIndex`, :class:`.TimedeltaIndex`, and :class:`PeriodIndex` (:issue:`37877`) - The ``inplace`` parameter of :meth:`Categorical.remove_unused_categories` is deprecated and will be removed in a future version (:issue:`37643`) - The ``null_counts`` parameter of :meth:`DataFrame.info` is deprecated and replaced by ``show_counts``. It will be removed in a future version (:issue:`37999`) -- Deprecated the default argument ``engine=None`` of the function :func:`read_excel`, which uses the no longer maintained xlrd engine. Not specifying the engine will raise a ``DeprecationWarning``. 
This argument will default to ``"openpyxl"`` in a future version, which is now the recommended engine for xlsx and xlsm files (:issue:`28547`) +- Deprecated the default argument ``engine=None`` of the function :func:`read_excel`, which uses the no longer maintained xlrd engine. Not specifying the engine will raise a ``FutureWarning``. This argument will default to ``"openpyxl"`` in a future version, which is now the recommended engine for xlsx and xlsm files (:issue:`28547`) .. --------------------------------------------------------------------------- diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 0b3050d58efcc..365526d699eac 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -112,7 +112,7 @@ .. deprecated:: 1.2.0 The default value ``None`` is deprecated and will be changed to ``"openpyxl"`` - in a future version. Not specifying an engine will raise a DeprecationWarning. + in a future version. Not specifying an engine will raise a FutureWarning. converters : dict, default None Dict of functions for converting values in certain columns. Keys can @@ -899,7 +899,7 @@ class ExcelFile: .. deprecated:: 1.2.0 The default value ``None`` is deprecated and will be changed to ``"openpyxl"`` in a future version. Not specifying an engine will - raise a DeprecationWarning. + raise a FutureWarning. """ from pandas.io.excel._odfreader import ODFReader @@ -961,7 +961,7 @@ def __init__( "pandas, although xlrd will continue to be allowed for the " "indefinite future. 
Either install openpyxl and specify it as " "the engine or specify 'xlrd' to suppress this warning.", - DeprecationWarning, + FutureWarning, stacklevel=stacklevel, ) if engine not in self._engines: diff --git a/pandas/tests/io/excel/test_xlrd.py b/pandas/tests/io/excel/test_xlrd.py index 678214c9cabfa..7e2e770a2926a 100644 --- a/pandas/tests/io/excel/test_xlrd.py +++ b/pandas/tests/io/excel/test_xlrd.py @@ -48,9 +48,7 @@ def test_excel_file_warning_with_xlsx_file(datapath): path = datapath("io", "data", "excel", "test1.xlsx") # DeprecationWarning: "This method will be removed in future versions. # Use 'tree.iter()' or 'list(tree.iter())' instead." - with tm.assert_produces_warning( - DeprecationWarning, check_stacklevel=False, raise_on_extra_warnings=False - ): + with tm.assert_produces_warning(FutureWarning, raise_on_extra_warnings=False): ExcelFile(path, engine=None) @@ -59,7 +57,5 @@ def test_read_excel_warning_with_xlsx_file(tmpdir, datapath): path = datapath("io", "data", "excel", "test1.xlsx") # DeprecationWarning: "This method will be removed in future versions. # Use 'tree.iter()' or 'list(tree.iter())' instead." - with tm.assert_produces_warning( - DeprecationWarning, check_stacklevel=False, raise_on_extra_warnings=False - ): + with tm.assert_produces_warning(FutureWarning, raise_on_extra_warnings=False): pd.read_excel(path, "Sheet1", engine=None) From fffbacbc8bc0733dd1e453aea7386c7f14fbff2d Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sun, 29 Nov 2020 11:52:32 -0500 Subject: [PATCH 09/14] "to suppress this warning." -> "to avoid raising a FutureWarning." --- doc/source/user_guide/io.rst | 2 +- doc/source/whatsnew/v1.2.0.rst | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index c3217426f3e40..600f943d103e1 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -2842,7 +2842,7 @@ Reading Excel files python >= 3.9. 
The default value will be ``'openpyxl'`` in a future version of pandas, although xlrd will continue to be allowed for the indefinite future. Either install openpyxl and specify it as - the engine or specify ``'xlrd'`` to suppress this warning. + the engine or specify ``'xlrd'`` to avoid raising a ``FutureWarning``. In the most basic use-case, ``read_excel`` takes a path to an Excel file, and the ``sheet_name`` indicating which sheet to parse. diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 837582624ec19..d02dfa0462649 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -16,7 +16,7 @@ including other versions of pandas. python >= 3.9. The default value will be ``'openpyxl'`` in a future version of pandas, although xlrd will continue to be allowed for the indefinite future. Either install openpyxl and specify it as - the engine or specify ``'xlrd'`` to suppress this warning. + the engine or specify ``'xlrd'`` to avoid raising a ``FutureWarning``. .. 
--------------------------------------------------------------------------- From bb5372538b6c0552f77aa2fd69c75a6623286d57 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sun, 29 Nov 2020 21:38:21 -0500 Subject: [PATCH 10/14] Changed engine=None to mostly using openpyxl --- .../intro_tutorials/02_read_write.rst | 2 +- doc/source/user_guide/10min.rst | 4 +- doc/source/user_guide/io.rst | 20 +--- doc/source/whatsnew/v0.17.0.rst | 2 +- doc/source/whatsnew/v1.2.0.rst | 20 ++-- pandas/io/excel/_base.py | 110 +++++++++++------- pandas/tests/io/excel/test_readers.py | 6 +- pandas/tests/io/excel/test_writers.py | 11 +- pandas/tests/io/excel/test_xlrd.py | 18 --- 9 files changed, 98 insertions(+), 95 deletions(-) diff --git a/doc/source/getting_started/intro_tutorials/02_read_write.rst b/doc/source/getting_started/intro_tutorials/02_read_write.rst index 59c25a8313eba..c9b6a12904311 100644 --- a/doc/source/getting_started/intro_tutorials/02_read_write.rst +++ b/doc/source/getting_started/intro_tutorials/02_read_write.rst @@ -156,7 +156,7 @@ The equivalent read function :meth:`~DataFrame.read_excel` will reload the data .. ipython:: python - titanic = pd.read_excel("titanic.xlsx", sheet_name="passengers", engine="openpyxl") + titanic = pd.read_excel("titanic.xlsx", sheet_name="passengers") .. ipython:: python diff --git a/doc/source/user_guide/10min.rst b/doc/source/user_guide/10min.rst index 7a61e3e7c5e06..cf548ba5d1133 100644 --- a/doc/source/user_guide/10min.rst +++ b/doc/source/user_guide/10min.rst @@ -808,9 +808,7 @@ Reading from an excel file. .. ipython:: python - pd.read_excel( - "foo.xlsx", "Sheet1", index_col=None, na_values=["NA"], engine="openpyxl" - ) + pd.read_excel("foo.xlsx", "Sheet1", index_col=None, na_values=["NA"]) .. 
ipython:: python :suppress: diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 600f943d103e1..1bd35131622ab 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -2832,18 +2832,6 @@ See the :ref:`cookbook` for some advanced strategies. Reading Excel files ''''''''''''''''''' -.. warning:: - - .. versionchanged:: 1.2.0 - - The default argument ``engine=None`` to ``pd.read_excel`` is - deprecated. Using None defaults to the xlrd engine which is no - longer maintained, and is not supported when using pandas with - python >= 3.9. The default value will be ``'openpyxl'`` in a future - version of pandas, although xlrd will continue to be allowed for the - indefinite future. Either install openpyxl and specify it as - the engine or specify ``'xlrd'`` to avoid raising a ``FutureWarning``. - In the most basic use-case, ``read_excel`` takes a path to an Excel file, and the ``sheet_name`` indicating which sheet to parse. @@ -2994,7 +2982,7 @@ For example, to read in a ``MultiIndex`` index without names: index=pd.MultiIndex.from_product([["a", "b"], ["c", "d"]]), ) df.to_excel("path_to_file.xlsx") - df = pd.read_excel("path_to_file.xlsx", index_col=[0, 1], engine="openpyxl") + df = pd.read_excel("path_to_file.xlsx", index_col=[0, 1]) df If the index has level names, they will parsed as well, using the same @@ -3004,7 +2992,7 @@ parameters. df.index = df.index.set_names(["lvl1", "lvl2"]) df.to_excel("path_to_file.xlsx") - df = pd.read_excel("path_to_file.xlsx", index_col=[0, 1], engine="openpyxl") + df = pd.read_excel("path_to_file.xlsx", index_col=[0, 1]) df @@ -3015,9 +3003,7 @@ should be passed to ``index_col`` and ``header``: df.columns = pd.MultiIndex.from_product([["a"], ["b", "d"]], names=["c1", "c2"]) df.to_excel("path_to_file.xlsx") - df = pd.read_excel( - "path_to_file.xlsx", index_col=[0, 1], header=[0, 1], engine="openpyxl" - ) + df = pd.read_excel("path_to_file.xlsx", index_col=[0, 1], header=[0, 1]) df .. 
ipython:: python diff --git a/doc/source/whatsnew/v0.17.0.rst b/doc/source/whatsnew/v0.17.0.rst index a9492c2e1d226..d8f39a7d6e3c0 100644 --- a/doc/source/whatsnew/v0.17.0.rst +++ b/doc/source/whatsnew/v0.17.0.rst @@ -308,7 +308,7 @@ See the :ref:`documentation ` for more details. df df.to_excel("test.xlsx") - df = pd.read_excel("test.xlsx", header=[0, 1], index_col=[0, 1], engine="openpyxl") + df = pd.read_excel("test.xlsx", header=[0, 1], index_col=[0, 1]) df .. ipython:: python diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index d02dfa0462649..b3e2c8693f7ed 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -10,13 +10,17 @@ including other versions of pandas. .. warning:: - The default argument ``engine=None`` to ``pd.read_excel`` is - deprecated. Using None defaults to the xlrd engine which is no - longer maintained, and is not supported when using pandas with - python >= 3.9. The default value will be ``'openpyxl'`` in a future - version of pandas, although xlrd will continue to be allowed for the - indefinite future. Either install openpyxl and specify it as - the engine or specify ``'xlrd'`` to avoid raising a ``FutureWarning``. + Previously, the default argument ``engine=None`` to ``pd.read_excel`` + would result in using the xlrd engine in many cases. The engine xlrd is no longer + maintained, and is not supported with python >= 3.9. When ``engine=None``, the + following logic will be used to determine the engine. + + - If ``path_or_buffer`` is an OpenDocument format (.odf, .ods, .odt), then odf will be used. + - Otherwise if ``path_or_buffer`` is a bytes stream, the file has the extension `.xls`, or is an xlrd Book instance, then xlrd will be used. + - Otherwise if openpyxl is installed, then openpyxl will be used. + - Otherwise xlrd will be used and a FutureWarning will be raised. + + Specifying ``engine="xlrd"`` will continue to be allowed for the indefinite future. .. 
--------------------------------------------------------------------------- @@ -471,7 +475,6 @@ Other API changes Deprecations ~~~~~~~~~~~~ - - Deprecated parameter ``inplace`` in :meth:`MultiIndex.set_codes` and :meth:`MultiIndex.set_levels` (:issue:`35626`) - Deprecated parameter ``dtype`` of method :meth:`~Index.copy` for all :class:`Index` subclasses. Use the :meth:`~Index.astype` method instead for changing dtype (:issue:`35853`) - Deprecated parameters ``levels`` and ``codes`` in :meth:`MultiIndex.copy`. Use the :meth:`~MultiIndex.set_levels` and :meth:`~MultiIndex.set_codes` methods instead (:issue:`36685`) @@ -498,7 +501,6 @@ Deprecations - Deprecated :meth:`Index.asi8` for :class:`Index` subclasses other than :class:`.DatetimeIndex`, :class:`.TimedeltaIndex`, and :class:`PeriodIndex` (:issue:`37877`) - The ``inplace`` parameter of :meth:`Categorical.remove_unused_categories` is deprecated and will be removed in a future version (:issue:`37643`) - The ``null_counts`` parameter of :meth:`DataFrame.info` is deprecated and replaced by ``show_counts``. It will be removed in a future version (:issue:`37999`) -- Deprecated the default argument ``engine=None`` of the function :func:`read_excel`, which uses the no longer maintained xlrd engine. Not specifying the engine will raise a ``FutureWarning``. This argument will default to ``"openpyxl"`` in a future version, which is now the recommended engine for xlsx and xlsm files (:issue:`28547`) .. --------------------------------------------------------------------------- diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 365526d699eac..e321075898e81 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -1,5 +1,6 @@ import abc import datetime +import inspect from io import BufferedIOBase, BytesIO, RawIOBase import os from textwrap import fill @@ -104,15 +105,25 @@ Supported engines: "xlrd", "openpyxl", "odf", "pyxlsb", default "xlrd". 
Engine compatibility : - - "xlrd" supports most old/new Excel file formats but is no longer maintained - and not supported with Python >= 3.9. + - "xlrd" supports most old/new Excel file formats. - "openpyxl" supports newer Excel file formats. - "odf" supports OpenDocument file formats (.odf, .ods, .odt). - "pyxlsb" supports Binary Excel files. - .. deprecated:: 1.2.0 - The default value ``None`` is deprecated and will be changed to ``"openpyxl"`` - in a future version. Not specifying an engine will raise a FutureWarning. + .. versionchanged:: 1.2.0 + The engine xlrd is no longer maintained, and is not supported with + python >= 3.9. When ``engine=None``, the following logic will be + used to determine the engine. + + - If path_or_buffer is an OpenDocument format (.odf, .ods, .odt), + then odf will be used. + - Otherwise if path_or_buffer is a bytes stream, the file has the + extension `.xls`, or is an xlrd Book instance, then xlrd will be used. + - Otherwise if openpyxl is installed, then openpyxl will be used. + - Otherwise xlrd will be used and a FutureWarning will be raised. + + Specifying ``engine="xlrd"`` will continue to be allowed for the + indefinite future. converters : dict, default None Dict of functions for converting values in certain columns. Keys can @@ -887,19 +898,29 @@ class ExcelFile: engine : str, default None If io is not a buffer or path, this must be set to identify io. Supported engines: ``xlrd``, ``openpyxl``, ``odf``, ``pyxlsb``, - default ``xlrd`` for .xls* files, ``odf`` for .ods files. + ``xlrd``, ``odf`` for .ods files. Engine compatibility : - - ``xlrd`` supports most old/new Excel file formats but is no longer maintained - and is not supported with Python >= 3.9. + - ``xlrd`` supports most old/new Excel file formats. - ``openpyxl`` supports newer Excel file formats. - ``odf`` supports OpenDocument file formats (.odf, .ods, .odt). - ``pyxlsb`` supports Binary Excel files. - ..
deprecated:: 1.2.0 - The default value ``None`` is deprecated and will be changed to - ``"openpyxl"`` in a future version. Not specifying an engine will - raise a FutureWarning. + .. versionchanged:: 1.2.0 + + The engine xlrd is no longer maintained, and is not supported with + python >= 3.9. When ``engine=None``, the following logic will be + used to determine the engine. + + - If path_or_buffer is an OpenDocument format (.odf, .ods, .odt), + then odf will be used. + - Otherwise if path_or_buffer is a bytes stream, the file has the + extension `.xls`, or is an xlrd Book instance, then xlrd will be used. + - Otherwise if openpyxl is installed, then openpyxl will be used. + - Otherwise xlrd will be used and a FutureWarning will be raised. + + Specifying ``engine="xlrd"`` will continue to be allowed for the + indefinite future. """ from pandas.io.excel._odfreader import ODFReader @@ -918,8 +939,6 @@ def __init__( self, path_or_buffer, engine=None, storage_options: StorageOptions = None ): if engine is None: - engine = "xlrd" - # Determine ext and use odf for ods stream/file if isinstance(path_or_buffer, (BufferedIOBase, RawIOBase)): ext = None @@ -930,40 +949,47 @@ def __init__( if ext == ".ods": engine = "odf" - # GH 35029 - Default to openpyxl if xlrd is not installed for non-xls if ( - engine == "xlrd" - and ext != ".xls" - and import_optional_dependency( + import_optional_dependency( "xlrd", raise_on_missing=False, on_version="ignore" ) - is None + is not None ): - engine = "openpyxl" - - # GH 35029 - Don't warn with xls files as only xlrd can read them - if engine == "xlrd" and ext != ".xls": - import inspect - - caller = inspect.stack()[1] - if ( - caller.filename.endswith("pandas/io/excel/_base.py") - and caller.function == "read_excel" + from xlrd import Book + + if isinstance(path_or_buffer, Book): + engine = "xlrd" + + # GH 35029 - Prefer openpyxl except for xls files + if engine is None: + if ext is None or isinstance(path_or_buffer, bytes) or ext ==
".xls": + engine = "xlrd" + elif ( + import_optional_dependency( + "openpyxl", raise_on_missing=False, on_version="ignore" + ) + is not None ): - stacklevel = 4 + engine = "openpyxl" else: - stacklevel = 2 - warnings.warn( - "The default argument engine=None is deprecated. Using None " - "defaults to the xlrd engine which is no longer maintained, " - "and is not supported when using pandas with python >= 3.9. " - "The default value will be 'openpyxl' in a future version of " - "pandas, although xlrd will continue to be allowed for the " - "indefinite future. Either install openpyxl and specify it as " - "the engine or specify 'xlrd' to suppress this warning.", - FutureWarning, - stacklevel=stacklevel, - ) + caller = inspect.stack()[1] + if ( + caller.filename.endswith("pandas/io/excel/_base.py") + and caller.function == "read_excel" + ): + stacklevel = 4 + else: + stacklevel = 2 + warnings.warn( + "The xlrd engine is no longer maintained and is not " + "supported when using pandas with python >= 3.9. However, " + "the engine xlrd will continue to be allowed for the " + "indefinite future. 
Either install openpyxl or specify " + "engine='xlrd' to silence this warning.", + FutureWarning, + stacklevel=stacklevel, + ) + engine = "xlrd" if engine not in self._engines: raise ValueError(f"Unknown engine: {engine}") diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index c582a0fa23577..98a55ae39bd77 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -577,6 +577,10 @@ def test_date_conversion_overflow(self, read_ext): if pd.read_excel.keywords["engine"] == "openpyxl": pytest.xfail("Maybe not supported by openpyxl") + if pd.read_excel.keywords["engine"] is None: + # GH 35029 + pytest.xfail("Defaults to openpyxl, maybe not supported") + result = pd.read_excel("testdateoverflow" + read_ext) tm.assert_frame_equal(result, expected) @@ -1159,7 +1163,7 @@ def test_excel_high_surrogate(self, engine): expected = DataFrame(["\udc88"], columns=["Column1"]) # should not produce a segmentation violation - actual = pd.read_excel("high_surrogate.xlsx") + actual = pd.read_excel("high_surrogate.xlsx", engine="xlrd") tm.assert_frame_equal(expected, actual) @pytest.mark.parametrize("filename", ["df_empty.xlsx", "df_equals.xlsx"]) diff --git a/pandas/tests/io/excel/test_writers.py b/pandas/tests/io/excel/test_writers.py index 8da9c79160e91..6267cbdf8808a 100644 --- a/pandas/tests/io/excel/test_writers.py +++ b/pandas/tests/io/excel/test_writers.py @@ -351,12 +351,15 @@ def test_excel_sheet_by_name_raise(self, path, engine): msg = "sheet 0 not found" with pytest.raises(ValueError, match=msg): pd.read_excel(xl, "0") - else: + elif engine == "xlwt": import xlrd msg = "No sheet named <'0'>" with pytest.raises(xlrd.XLRDError, match=msg): pd.read_excel(xl, sheet_name="0") + else: + with pytest.raises(KeyError, match="Worksheet 0 does not exist."): + pd.read_excel(xl, sheet_name="0") def test_excel_writer_context_manager(self, frame, path): with ExcelWriter(path) as writer: @@ -1174,7 +1177,7 @@ def 
test_comment_empty_line(self, path): result = pd.read_excel(path, comment="#") tm.assert_frame_equal(result, expected) - def test_datetimes(self, path): + def test_datetimes(self, path, engine): # Test writing and reading datetimes. For issue #9139. (xref #9185) datetimes = [ @@ -1193,7 +1196,9 @@ def test_datetimes(self, path): write_frame = DataFrame({"A": datetimes}) write_frame.to_excel(path, "Sheet1") - read_frame = pd.read_excel(path, sheet_name="Sheet1", header=0) + # GH 35029 - Default changed to openpyxl, but test is for odf/xlrd + engine = "odf" if path.endswith("ods") else "xlrd" + read_frame = pd.read_excel(path, sheet_name="Sheet1", header=0, engine=engine) tm.assert_series_equal(write_frame["A"], read_frame["A"]) diff --git a/pandas/tests/io/excel/test_xlrd.py b/pandas/tests/io/excel/test_xlrd.py index 7e2e770a2926a..be345dcf7e169 100644 --- a/pandas/tests/io/excel/test_xlrd.py +++ b/pandas/tests/io/excel/test_xlrd.py @@ -41,21 +41,3 @@ def test_excel_table_sheet_by_index(datapath, read_ext): with ExcelFile(path, engine="xlrd") as excel: with pytest.raises(xlrd.XLRDError): pd.read_excel(excel, sheet_name="asdf") - - -def test_excel_file_warning_with_xlsx_file(datapath): - # GH 29375 - path = datapath("io", "data", "excel", "test1.xlsx") - # DeprecationWarning: "This method will be removed in future versions. - # Use 'tree.iter()' or 'list(tree.iter())' instead." - with tm.assert_produces_warning(FutureWarning, raise_on_extra_warnings=False): - ExcelFile(path, engine=None) - - -def test_read_excel_warning_with_xlsx_file(tmpdir, datapath): - # GH 29375 - path = datapath("io", "data", "excel", "test1.xlsx") - # DeprecationWarning: "This method will be removed in future versions. - # Use 'tree.iter()' or 'list(tree.iter())' instead." 
- with tm.assert_produces_warning(FutureWarning, raise_on_extra_warnings=False): - pd.read_excel(path, "Sheet1", engine=None) From d8dcb04132871b2da3cc3fa550dfb33b2dbc166f Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sun, 29 Nov 2020 21:52:12 -0500 Subject: [PATCH 11/14] Minor doc touchups --- doc/source/whatsnew/v1.2.0.rst | 2 +- pandas/io/excel/_base.py | 9 ++++----- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index b3e2c8693f7ed..599053d52911b 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -16,7 +16,7 @@ including other versions of pandas. following logic will be used to determine the engine. - If ``path_or_buffer`` is an OpenDocument format (.odf, .ods, .odt), then odf will be used. - - Otherwise if ``path_or_buffer`` is a bytes stream, the file has the extension `.xls`, or is an xlrd Book instance, then xlrd will be used. + - Otherwise if ``path_or_buffer`` is a bytes stream, the file has the extension ``.xls``, or is an xlrd Book instance, then xlrd will be used. - Otherwise if openpyxl is installed, then openpyxl will be used. - Otherwise xlrd will be used and a FutureWarning will be raised. diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index e321075898e81..dc9192a2dee93 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -102,7 +102,7 @@ of dtype conversion. engine : str, default None If io is not a buffer or path, this must be set to identify io. - Supported engines: "xlrd", "openpyxl", "odf", "pyxlsb", default "xlrd". + Supported engines: "xlrd", "openpyxl", "odf", "pyxlsb". Engine compatibility : - "xlrd" supports most old/new Excel file formats. @@ -118,7 +118,7 @@ - If path_or_buffer is an OpenDocument format (.odf, .ods, .odt), then odf will be used. - Otherwise if path_or_buffer is a bytes stream, the file has the - extension `.xls`, or is an xlrd Book instance, then xlrd will be used. 
+ extension ``.xls``, or is an xlrd Book instance, then xlrd will be used. - Otherwise if openpyxl is installed, then openpyxl will be used. - Otherwise xlrd will be used and a FutureWarning will be raised. @@ -897,8 +897,7 @@ class ExcelFile: .xls, .xlsx, .xlsb, .xlsm, .odf, .ods, or .odt file. engine : str, default None If io is not a buffer or path, this must be set to identify io. - Supported engines: ``xlrd``, ``openpyxl``, ``odf``, ``pyxlsb``, - ``xlrd``, ``odf`` for .ods files. + Supported engines: ``xlrd``, ``openpyxl``, ``odf``, ``pyxlsb`` Engine compatibility : - ``xlrd`` supports most old/new Excel file formats. @@ -915,7 +914,7 @@ class ExcelFile: - If path_or_buffer is an OpenDocument format (.odf, .ods, .odt), then odf will be used. - Otherwise if path_or_buffer is a bytes stream, the file has the - extension `.xls`, or is an xlrd Book instance, then xlrd will be used. + extension ``.xls``, or is an xlrd Book instance, then xlrd will be used. - Otherwise if openpyxl is installed, then openpyxl will be used. - Otherwise xlrd will be used and a FutureWarning will be raised. From f9876ddb8f60fce84723dc899cc681172c6ab56e Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Mon, 30 Nov 2020 16:22:23 -0500 Subject: [PATCH 12/14] Re-added tests, minor doc touchups --- doc/source/whatsnew/v1.2.0.rst | 4 +-- pandas/io/excel/_base.py | 22 +++++++++------- pandas/tests/io/excel/test_writers.py | 2 +- pandas/tests/io/excel/test_xlrd.py | 38 +++++++++++++++++++++++++++ 4 files changed, 53 insertions(+), 13 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 599053d52911b..a7e6a40f4d7c8 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -13,12 +13,12 @@ including other versions of pandas. Previously, the default argument ``engine=None`` to ``pd.read_excel`` would result in using the xlrd engine in many cases. 
The engine xlrd is no longer maintained, and is not supported with python >= 3.9. When ``engine=None``, the - following logic will be used to determine the engine. + following logic is now used to determine the engine. - If ``path_or_buffer`` is an OpenDocument format (.odf, .ods, .odt), then odf will be used. - Otherwise if ``path_or_buffer`` is a bytes stream, the file has the extension ``.xls``, or is an xlrd Book instance, then xlrd will be used. - Otherwise if openpyxl is installed, then openpyxl will be used. - - Otherwise xlrd will be used and a FutureWarning will be raised. + - Otherwise xlrd will be used and a ``FutureWarning`` will be raised. Specifying ``engine="xlrd"`` will continue to be allowed for the indefinite future. diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index dc9192a2dee93..4a2975e656c60 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -112,15 +112,15 @@ .. versionchanged:: 1.2.0 The engine xlrd is no longer maintained, and is not supported with - python >= 3.9. When ``engine="None"``, the following logic will be + python >= 3.9. When ``engine=None``, the following logic will be used to determine the engine. - - If path_or_buffer is an OpenDocument format (.odf, .ods, .odt), + - If ``path_or_buffer`` is an OpenDocument format (.odf, .ods, .odt), then odf will be used. - - Otherwise if path_or_buffer is a bytes stream, the file has the + - Otherwise if ``path_or_buffer`` is a bytes stream, the file has the extension ``.xls``, or is an xlrd Book instance, then xlrd will be used. - Otherwise if openpyxl is installed, then openpyxl will be used. - - Otherwise xlrd will be used and a FutureWarning will be raised. + - Otherwise xlrd will be used and a ``FutureWarning`` will be raised. Specifying ``engine="xlrd"`` will continue to be allowed for the indefinite future. @@ -908,15 +908,15 @@ class ExcelFile: .. 
versionchanged:: 1.2.0 The engine xlrd is no longer maintained, and is not supported with - python >= 3.9. When ``engine="None"``, the following logic will be + python >= 3.9. When ``engine=None``, the following logic will be used to determine the engine. - - If path_or_buffer is an OpenDocument format (.odf, .ods, .odt), + - If ``path_or_buffer`` is an OpenDocument format (.odf, .ods, .odt), then odf will be used. - - Otherwise if path_or_buffer is a bytes stream, the file has the + - Otherwise if ``path_or_buffer`` is a bytes stream, the file has the extension ``.xls``, or is an xlrd Book instance, then xlrd will be used. - Otherwise if openpyxl is installed, then openpyxl will be used. - - Otherwise xlrd will be used and a FutureWarning will be raised. + - Otherwise xlrd will be used and a ``FutureWarning`` will be raised. Specifying ``engine="xlrd"`` will continue to be allowed for the indefinite future. @@ -983,8 +983,10 @@ def __init__( "The xlrd engine is no longer maintained and is not " "supported when using pandas with python >= 3.9. However, " "the engine xlrd will continue to be allowed for the " - "indefinite future. Either install openpyxl or specify " - "engine='xlrd' to silence this warning.", + "indefinite future. Beginning with pandas 1.2.0, the " + "openpyxl engine will be used if it is installed and the " + "engine argument is not specified. Either install openpyxl " + "or specify engine='xlrd' to silence this warning.", FutureWarning, stacklevel=stacklevel, ) diff --git a/pandas/tests/io/excel/test_writers.py b/pandas/tests/io/excel/test_writers.py index 6267cbdf8808a..0aaa8be616342 100644 --- a/pandas/tests/io/excel/test_writers.py +++ b/pandas/tests/io/excel/test_writers.py @@ -1177,7 +1177,7 @@ def test_comment_empty_line(self, path): result = pd.read_excel(path, comment="#") tm.assert_frame_equal(result, expected) - def test_datetimes(self, path, engine): + def test_datetimes(self, path): # Test writing and reading datetimes. 
For issue #9139. (xref #9185) datetimes = [ diff --git a/pandas/tests/io/excel/test_xlrd.py b/pandas/tests/io/excel/test_xlrd.py index be345dcf7e169..f2b4e23614413 100644 --- a/pandas/tests/io/excel/test_xlrd.py +++ b/pandas/tests/io/excel/test_xlrd.py @@ -1,5 +1,7 @@ import pytest +from pandas.compat._optional import import_optional_dependency + import pandas as pd import pandas._testing as tm @@ -41,3 +43,39 @@ def test_excel_table_sheet_by_index(datapath, read_ext): with ExcelFile(path, engine="xlrd") as excel: with pytest.raises(xlrd.XLRDError): pd.read_excel(excel, sheet_name="asdf") + + +def test_excel_file_warning_with_xlsx_file(datapath): + # GH 29375 + path = datapath("io", "data", "excel", "test1.xlsx") + has_openpyxl = ( + import_optional_dependency( + "openpyxl", raise_on_missing=False, on_version="ignore" + ) + is not None + ) + if not has_openpyxl: + with tm.assert_produces_warning( + FutureWarning, + raise_on_extra_warnings=False, + match="The xlrd engine is no longer maintained", + ): + ExcelFile(path, engine=None) + + +def test_read_excel_warning_with_xlsx_file(tmpdir, datapath): + # GH 29375 + path = datapath("io", "data", "excel", "test1.xlsx") + has_openpyxl = ( + import_optional_dependency( + "openpyxl", raise_on_missing=False, on_version="ignore" + ) + is not None + ) + if not has_openpyxl: + with tm.assert_produces_warning( + FutureWarning, + raise_on_extra_warnings=False, + match="The xlrd engine is no longer maintained", + ): + pd.read_excel(path, "Sheet1", engine=None) From bc3ec473cb05cc49de7809091f674d411c7accfb Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Mon, 30 Nov 2020 16:37:28 -0500 Subject: [PATCH 13/14] Test for no warning as well --- pandas/tests/io/excel/test_xlrd.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pandas/tests/io/excel/test_xlrd.py b/pandas/tests/io/excel/test_xlrd.py index f2b4e23614413..f2fbcbc2e2f04 100644 --- a/pandas/tests/io/excel/test_xlrd.py +++ b/pandas/tests/io/excel/test_xlrd.py @@ 
-61,6 +61,9 @@ def test_excel_file_warning_with_xlsx_file(datapath): match="The xlrd engine is no longer maintained", ): ExcelFile(path, engine=None) + else: + with tm.assert_produces_warning(None): + pd.read_excel(path, "Sheet1", engine=None) def test_read_excel_warning_with_xlsx_file(tmpdir, datapath): @@ -79,3 +82,6 @@ def test_read_excel_warning_with_xlsx_file(tmpdir, datapath): match="The xlrd engine is no longer maintained", ): pd.read_excel(path, "Sheet1", engine=None) + else: + with tm.assert_produces_warning(None): + pd.read_excel(path, "Sheet1", engine=None) From fe10a898b3a0f3d224e9f5635b62dc1e641e5152 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Tue, 1 Dec 2020 17:05:11 -0500 Subject: [PATCH 14/14] Doc tweaks --- doc/source/whatsnew/v1.2.0.rst | 15 +++++---------- pandas/io/excel/_base.py | 26 ++++++++++++++++---------- 2 files changed, 21 insertions(+), 20 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index a7e6a40f4d7c8..b1257fe893804 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -11,16 +11,11 @@ including other versions of pandas. .. warning:: Previously, the default argument ``engine=None`` to ``pd.read_excel`` - would result in using the xlrd engine in many cases. The engine xlrd is no longer - maintained, and is not supported with python >= 3.9. When ``engine=None``, the - following logic is now used to determine the engine. - - - If ``path_or_buffer`` is an OpenDocument format (.odf, .ods, .odt), then odf will be used. - - Otherwise if ``path_or_buffer`` is a bytes stream, the file has the extension ``.xls``, or is an xlrd Book instance, then xlrd will be used. - - Otherwise if openpyxl is installed, then openpyxl will be used. - - Otherwise xlrd will be used and a ``FutureWarning`` will be raised. - - Specifying ``engine="xlrd"`` will continue to be allowed for the indefinite future. + would result in using the `xlrd <https://xlrd.readthedocs.io/en/latest/>`_ engine in + many cases. 
The engine ``xlrd`` is no longer maintained, and is not supported with + python >= 3.9. If `openpyxl <https://pypi.org/project/openpyxl/>`_ is installed, + many of these cases will now default to using the ``openpyxl`` engine. See the + :func:`read_excel` documentation for more details. .. --------------------------------------------------------------------------- diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 4a2975e656c60..0235d6a3f6384 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -111,16 +111,19 @@ - "pyxlsb" supports Binary Excel files. .. versionchanged:: 1.2.0 - The engine xlrd is no longer maintained, and is not supported with + The engine `xlrd <https://xlrd.readthedocs.io/en/latest/>`_ + is no longer maintained, and is not supported with python >= 3.9. When ``engine=None``, the following logic will be used to determine the engine. - If ``path_or_buffer`` is an OpenDocument format (.odf, .ods, .odt), - then odf will be used. + then `odf <https://pypi.org/project/odfpy/>`_ will be used. - Otherwise if ``path_or_buffer`` is a bytes stream, the file has the - extension ``.xls``, or is an xlrd Book instance, then xlrd will be used. - - Otherwise if openpyxl is installed, then openpyxl will be used. - - Otherwise xlrd will be used and a ``FutureWarning`` will be raised. + extension ``.xls``, or is an ``xlrd`` Book instance, then ``xlrd`` will + be used. + - Otherwise if `openpyxl <https://pypi.org/project/openpyxl/>`_ is installed, + then ``openpyxl`` will be used. + - Otherwise ``xlrd`` will be used and a ``FutureWarning`` will be raised. Specifying ``engine="xlrd"`` will continue to be allowed for the indefinite future. @@ -907,16 +910,19 @@ class ExcelFile: .. versionchanged:: 1.2.0 - The engine xlrd is no longer maintained, and is not supported with + The engine `xlrd <https://xlrd.readthedocs.io/en/latest/>`_ + is no longer maintained, and is not supported with python >= 3.9. When ``engine=None``, the following logic will be used to determine the engine. - If ``path_or_buffer`` is an OpenDocument format (.odf, .ods, .odt), - then odf will be used. + then `odf <https://pypi.org/project/odfpy/>`_ will be used. 
- Otherwise if ``path_or_buffer`` is a bytes stream, the file has the - extension ``.xls``, or is an xlrd Book instance, then xlrd will be used. - - Otherwise if openpyxl is installed, then openpyxl will be used. - - Otherwise xlrd will be used and a ``FutureWarning`` will be raised. + extension ``.xls``, or is an ``xlrd`` Book instance, then ``xlrd`` + will be used. + - Otherwise if `openpyxl <https://pypi.org/project/openpyxl/>`_ is installed, + then ``openpyxl`` will be used. + - Otherwise ``xlrd`` will be used and a ``FutureWarning`` will be raised. Specifying ``engine="xlrd"`` will continue to be allowed for the indefinite future.