From 8885ee6b2feedbb95c6af118fd98025519897d57 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Torsten=20W=C3=B6rtwein?= Date: Thu, 3 Jun 2021 15:39:19 -0400 Subject: [PATCH 1/2] REGR: close corrupt files in ExcelFile --- doc/source/whatsnew/v1.2.5.rst | 1 + pandas/io/excel/_base.py | 11 ++++++++++- pandas/io/excel/_openpyxl.py | 6 ------ pandas/tests/io/excel/test_readers.py | 19 +++++++++++++++++++ 4 files changed, 30 insertions(+), 7 deletions(-) diff --git a/doc/source/whatsnew/v1.2.5.rst b/doc/source/whatsnew/v1.2.5.rst index 500030e1304c6..48ebff5574bd7 100644 --- a/doc/source/whatsnew/v1.2.5.rst +++ b/doc/source/whatsnew/v1.2.5.rst @@ -18,6 +18,7 @@ Fixed regressions - Fixed regression in :meth:`DataFrame.sum` and :meth:`DataFrame.prod` when ``min_count`` and ``numeric_only`` are both given (:issue:`41074`) - Regression in :func:`read_csv` when using ``memory_map=True`` with an non-UTF8 encoding (:issue:`40986`) - Regression in :meth:`DataFrame.replace` and :meth:`Series.replace` when the values to replace is a NumPy float array (:issue:`40371`) +- Regression in :func:`ExcelFile` when a corrupt file is opened but not closed (:issue:`41778`) .. --------------------------------------------------------------------------- diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 850570fc743b7..11dee69651dad 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -388,7 +388,11 @@ def __init__(self, filepath_or_buffer, storage_options: StorageOptions = None): elif hasattr(self.handles.handle, "read"): # N.B. 
xlrd.Book has a read attribute too self.handles.handle.seek(0) - self.book = self.load_workbook(self.handles.handle) + try: + self.book = self.load_workbook(self.handles.handle) + except Exception: + self.close() + raise elif isinstance(self.handles.handle, bytes): self.book = self.load_workbook(BytesIO(self.handles.handle)) else: @@ -406,6 +410,11 @@ def load_workbook(self, filepath_or_buffer): pass def close(self): + if hasattr(self, "book") and hasattr(self.book, "close"): + # pyxlsb: opens a TemporaryFile + # openpyxl: https://stackoverflow.com/questions/31416842/ + # openpyxl-does-not-close-excel-workbook-in-read-only-mode + self.book.close() self.handles.close() @property diff --git a/pandas/io/excel/_openpyxl.py b/pandas/io/excel/_openpyxl.py index be1587dbc010c..cac6d8d1b8113 100644 --- a/pandas/io/excel/_openpyxl.py +++ b/pandas/io/excel/_openpyxl.py @@ -487,12 +487,6 @@ def load_workbook(self, filepath_or_buffer: FilePathOrBuffer): filepath_or_buffer, read_only=True, data_only=True, keep_links=False ) - def close(self): - # https://stackoverflow.com/questions/31416842/ - # openpyxl-does-not-close-excel-workbook-in-read-only-mode - self.book.close() - super().close() - @property def sheet_names(self) -> List[str]: return self.book.sheetnames diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index bd3bfa207c4b0..53eb9167c1096 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -1,6 +1,7 @@ from datetime import datetime, time from functools import partial import os +from pathlib import Path from urllib.error import URLError from zipfile import BadZipFile @@ -1273,3 +1274,21 @@ def test_read_datetime_multiindex(self, engine, read_ext): expected = DataFrame([], columns=expected_column_index) tm.assert_frame_equal(expected, actual) + + def test_corrupt_files_closed(self, request, engine, read_ext): + # GH41778 + errors = (BadZipFile,) + if engine is None: + pytest.skip() + 
elif engine == "xlrd": + import xlrd + + errors = (BadZipFile, xlrd.biffh.XLRDError) + + with tm.ensure_clean(f"corrupt{read_ext}") as file: + Path(file).write_text("corrupt") + with tm.assert_produces_warning(False): + try: + pd.ExcelFile(file, engine=engine) + except errors: + pass From 1da6a5d2fd03424956442d4abb9ddb7edd7d84a4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Torsten=20W=C3=B6rtwein?= Date: Sun, 6 Jun 2021 00:01:38 -0400 Subject: [PATCH 2/2] inspect_excel_format throws ValueError (azure:macos with xlrd-engine) --- pandas/tests/io/excel/test_readers.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index 53eb9167c1096..e4006d9068f2a 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -1277,13 +1277,13 @@ def test_read_datetime_multiindex(self, engine, read_ext): def test_corrupt_files_closed(self, request, engine, read_ext): # GH41778 - errors = (BadZipFile,) + errors = (BadZipFile, ValueError) if engine is None: pytest.skip() elif engine == "xlrd": import xlrd - errors = (BadZipFile, xlrd.biffh.XLRDError) + errors = (BadZipFile, ValueError, xlrd.biffh.XLRDError) with tm.ensure_clean(f"corrupt{read_ext}") as file: Path(file).write_text("corrupt")