From 3f52447b80a03d58b7de1ba9569aef7f7eb1eee1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Torsten=20W=C3=B6rtwein?= Date: Thu, 3 Jun 2021 15:39:19 -0400 Subject: [PATCH] REGR: close corrupt files in ExcelFile --- doc/source/whatsnew/v1.2.5.rst | 1 + pandas/io/excel/_base.py | 12 +++++++++--- pandas/io/excel/_openpyxl.py | 6 ------ pandas/tests/io/excel/test_readers.py | 19 +++++++++++++++++++ 4 files changed, 29 insertions(+), 9 deletions(-) diff --git a/doc/source/whatsnew/v1.2.5.rst b/doc/source/whatsnew/v1.2.5.rst index 42c71acdee7e0..011422ff14270 100644 --- a/doc/source/whatsnew/v1.2.5.rst +++ b/doc/source/whatsnew/v1.2.5.rst @@ -18,6 +18,7 @@ Fixed regressions - Fixed regression in :meth:`DataFrame.sum` and :meth:`DataFrame.prod` when ``min_count`` and ``numeric_only`` are both given (:issue:`41074`) - Regression in :func:`read_csv` when using ``memory_map=True`` with an non-UTF8 encoding (:issue:`40986`) - Regression in :meth:`DataFrame.replace` and :meth:`Series.replace` when the values to replace is a NumPy float array (:issue:`40371`) +- Regression in :func:`ExcelFile` when a corrupt file is opened but not closed (:issue:`41778`) .. --------------------------------------------------------------------------- diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 42ca68376452d..f7a4bf7c5ede5 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -422,7 +422,11 @@ def __init__(self, filepath_or_buffer, storage_options: StorageOptions = None): elif hasattr(self.handles.handle, "read"): # N.B. xlrd.Book has a read attribute too self.handles.handle.seek(0) - self.book = self.load_workbook(self.handles.handle) + try: + self.book = self.load_workbook(self.handles.handle) + except Exception: + self.close() + raise elif isinstance(self.handles.handle, bytes): self.book = self.load_workbook(BytesIO(self.handles.handle)) else: @@ -440,8 +444,10 @@ def load_workbook(self, filepath_or_buffer): pass def close(self): - if hasattr(self.book, "close"): - # pyxlsb opens a TemporaryFile + if hasattr(self, "book") and hasattr(self.book, "close"): + # pyxlsb: opens a TemporaryFile + # openpyxl: https://stackoverflow.com/questions/31416842/ + # openpyxl-does-not-close-excel-workbook-in-read-only-mode self.book.close() self.handles.close() diff --git a/pandas/io/excel/_openpyxl.py b/pandas/io/excel/_openpyxl.py index 2071076d04a24..bc067e216760c 100644 --- a/pandas/io/excel/_openpyxl.py +++ b/pandas/io/excel/_openpyxl.py @@ -528,12 +528,6 @@ def load_workbook(self, filepath_or_buffer: FilePathOrBuffer): filepath_or_buffer, read_only=True, data_only=True, keep_links=False ) - def close(self): - # https://stackoverflow.com/questions/31416842/ - # openpyxl-does-not-close-excel-workbook-in-read-only-mode - self.book.close() - super().close() - @property def sheet_names(self) -> list[str]: return self.book.sheetnames diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index a9e4f52ce0c28..d40fb3ce4a135 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -4,6 +4,7 @@ ) from functools import partial import os +from pathlib import Path from urllib.error import URLError from zipfile import BadZipFile @@ -1499,3 +1500,21 @@ def test_engine_invalid_option(self, read_ext): with pytest.raises(ValueError, match="Value must be one of *"): with pd.option_context(f"io.excel{read_ext}.reader", "abc"): pass + + def test_corrupt_files_closed(self, request, engine, read_ext): + # GH41778 + errors = (BadZipFile,) + if engine is None: + pytest.skip() + elif engine == "xlrd": + import xlrd + + errors = (BadZipFile, xlrd.biffh.XLRDError) + + with tm.ensure_clean(f"corrupt{read_ext}") as file: + Path(file).write_text("corrupt") + with tm.assert_produces_warning(False): + try: + pd.ExcelFile(file, engine=engine) + except errors: + pass