From 30da9a4c9e8e9f73dd12e2257a8e3049dc696cb3 Mon Sep 17 00:00:00 2001 From: Kostya Farber Date: Thu, 5 Jan 2023 08:39:15 +0000 Subject: [PATCH 01/18] ENH: add calamite excel reader and modify test to include engine --- doc/source/whatsnew/v2.0.0.rst | 2 +- environment.yml | 1 + pandas/io/excel/_base.py | 2 + pandas/io/excel/_calamitereader.py | 69 +++++++++++++++++++++++++++ pandas/tests/io/excel/test_readers.py | 1 + requirements-dev.txt | 1 + 6 files changed, 75 insertions(+), 1 deletion(-) create mode 100644 pandas/io/excel/_calamitereader.py diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 13b60f7f05352..0da227d3e7299 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -104,7 +104,7 @@ Other enhancements - Improved error message in :func:`to_datetime` for non-ISO8601 formats, informing users about the position of the first error (:issue:`50361`) - Improved error message when trying to align :class:`DataFrame` objects (for example, in :func:`DataFrame.compare`) to clarify that "identically labelled" refers to both index and columns (:issue:`50083`) - Performance improvement in :func:`to_datetime` when format is given or can be inferred (:issue:`50465`) -- +- Added ``calamite`` as an engine to ``read_excel`` (:issue: ``50395``) .. --------------------------------------------------------------------------- .. _whatsnew_200.notable_bug_fixes: diff --git a/environment.yml b/environment.yml index b6b8f7d6af1ba..27e469020e695 100644 --- a/environment.yml +++ b/environment.yml @@ -119,3 +119,4 @@ dependencies: - pip: - sphinx-toggleprompt + - python-calamite diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 6f706a4554855..493b057dbc7e7 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -1435,6 +1435,7 @@ class ExcelFile: This is not supported, switch to using ``openpyxl`` instead. 
""" + from pandas.io.excel._calamitereader import CalamineExcelReader from pandas.io.excel._odfreader import ODFReader from pandas.io.excel._openpyxl import OpenpyxlReader from pandas.io.excel._pyxlsb import PyxlsbReader @@ -1445,6 +1446,7 @@ class ExcelFile: "openpyxl": OpenpyxlReader, "odf": ODFReader, "pyxlsb": PyxlsbReader, + "calamite": CalamineExcelReader, } def __init__( diff --git a/pandas/io/excel/_calamitereader.py b/pandas/io/excel/_calamitereader.py new file mode 100644 index 0000000000000..97d33776db2dd --- /dev/null +++ b/pandas/io/excel/_calamitereader.py @@ -0,0 +1,69 @@ +from __future__ import annotations + +from io import ( + BufferedReader, + BytesIO, +) +from pathlib import PurePath +from tempfile import NamedTemporaryFile +from typing import Any + +from python_calamine import ( + get_sheet_data, + get_sheet_names, +) + +from pandas.io.excel._base import ( + BaseExcelReader, + inspect_excel_format, +) + + +class __calamine__: + pass + + +class CalamineExcelReader(BaseExcelReader): + book: str + _sheet_names: list[str] | None = None + + @property + def _workbook_class(self) -> type[__calamine__]: + return __calamine__ + + def load_workbook( + self, filepath_or_buffer: str | PurePath | BufferedReader | BytesIO + ) -> str: + if isinstance(filepath_or_buffer, BufferedReader): + filepath_or_buffer = filepath_or_buffer.name + + elif isinstance(filepath_or_buffer, BytesIO): + ext = inspect_excel_format(filepath_or_buffer) + with NamedTemporaryFile(suffix=f".{ext}", delete=False) as tmp_file: + tmp_file.write(filepath_or_buffer.getvalue()) + filepath_or_buffer = tmp_file.name + + elif isinstance(filepath_or_buffer, PurePath): + filepath_or_buffer = filepath_or_buffer.as_posix() + + assert isinstance(filepath_or_buffer, str) + + self._sheet_names = get_sheet_names(filepath_or_buffer) + return filepath_or_buffer + + @property + def sheet_names(self) -> list[str]: + if self._sheet_names is None: + self._sheet_names = get_sheet_names(self.book) + return 
self._sheet_names + + def get_sheet_by_name(self, name: str) -> int: + self.raise_if_bad_sheet_by_name(name) + return self.sheet_names.index(name) + + def get_sheet_by_index(self, index: int) -> int: + self.raise_if_bad_sheet_by_index(index) + return index + + def get_sheet_data(self, sheet: int, convert_float: bool) -> list[list[Any]]: + return get_sheet_data(self.book, sheet) diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index ad8f1ac7d7d52..194e5b9fd858e 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -52,6 +52,7 @@ ), pytest.param("pyxlsb", marks=td.skip_if_no("pyxlsb")), pytest.param("odf", marks=td.skip_if_no("odf")), + pytest.param("calamite"), ] diff --git a/requirements-dev.txt b/requirements-dev.txt index 4f2a80d932fd0..df9168d1343b3 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -88,4 +88,5 @@ feedparser pyyaml requests sphinx-toggleprompt +python-calamite setuptools>=61.0.0 From fd06ad9765d97826a91a43032670a6f4f86f8d9f Mon Sep 17 00:00:00 2001 From: Kostya Farber Date: Thu, 5 Jan 2023 14:14:28 +0000 Subject: [PATCH 02/18] fix deps for python-calamine --- ci/deps/actions-310.yaml | 1 + ci/deps/actions-38-downstream_compat.yaml | 1 + ci/deps/actions-38.yaml | 1 + ci/deps/actions-39.yaml | 1 + environment.yml | 2 +- requirements-dev.txt | 2 +- 6 files changed, 6 insertions(+), 2 deletions(-) diff --git a/ci/deps/actions-310.yaml b/ci/deps/actions-310.yaml index d787571d9d112..1b798d9f15307 100644 --- a/ci/deps/actions-310.yaml +++ b/ci/deps/actions-310.yaml @@ -44,6 +44,7 @@ dependencies: - pytables - pyarrow - pyreadstat + - python-calamine - python-snappy - pyxlsb - s3fs>=2021.08.0 diff --git a/ci/deps/actions-38-downstream_compat.yaml b/ci/deps/actions-38-downstream_compat.yaml index 95ec98d72ebcc..53d0418775ca3 100644 --- a/ci/deps/actions-38-downstream_compat.yaml +++ b/ci/deps/actions-38-downstream_compat.yaml @@ -44,6 +44,7 @@ 
dependencies: - pymysql - pyreadstat - pytables + - python-calamine - python-snappy - pyxlsb - s3fs>=2021.08.0 diff --git a/ci/deps/actions-38.yaml b/ci/deps/actions-38.yaml index f7de8bbee7d8a..d80730d16f9f7 100644 --- a/ci/deps/actions-38.yaml +++ b/ci/deps/actions-38.yaml @@ -44,6 +44,7 @@ dependencies: - pymysql - pyreadstat - pytables + - python-calamine - python-snappy - pyxlsb - s3fs>=2021.08.0 diff --git a/ci/deps/actions-39.yaml b/ci/deps/actions-39.yaml index 821ec9c5d4234..5a0df449e7bc6 100644 --- a/ci/deps/actions-39.yaml +++ b/ci/deps/actions-39.yaml @@ -44,6 +44,7 @@ dependencies: - pyarrow - pyreadstat - pytables + - python-calamine - python-snappy - pyxlsb - s3fs>=2021.08.0 diff --git a/environment.yml b/environment.yml index 27e469020e695..7c425529c3aa0 100644 --- a/environment.yml +++ b/environment.yml @@ -119,4 +119,4 @@ dependencies: - pip: - sphinx-toggleprompt - - python-calamite + - python-calamine diff --git a/requirements-dev.txt b/requirements-dev.txt index df9168d1343b3..5baa54e7ee63d 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -88,5 +88,5 @@ feedparser pyyaml requests sphinx-toggleprompt -python-calamite +python-calamine setuptools>=61.0.0 From 6a8d82293adee92d8addc6e2f915d312c2f20c8d Mon Sep 17 00:00:00 2001 From: Kostya Farber Date: Thu, 5 Jan 2023 14:33:56 +0000 Subject: [PATCH 03/18] fix deps for python-calamine, add as pip package --- ci/deps/actions-310.yaml | 4 +++- ci/deps/actions-38-downstream_compat.yaml | 4 +++- ci/deps/actions-38-minimum_versions.yaml | 1 + ci/deps/actions-38.yaml | 3 +++ ci/deps/actions-39.yaml | 4 +++- ci/deps/circle-38-arm64.yaml | 3 +++ pandas/compat/_optional.py | 1 + pyproject.toml | 1 + 8 files changed, 18 insertions(+), 3 deletions(-) diff --git a/ci/deps/actions-310.yaml b/ci/deps/actions-310.yaml index 1b798d9f15307..b6d7b0a97238c 100644 --- a/ci/deps/actions-310.yaml +++ b/ci/deps/actions-310.yaml @@ -44,7 +44,6 @@ dependencies: - pytables - pyarrow - pyreadstat - - 
python-calamine - python-snappy - pyxlsb - s3fs>=2021.08.0 @@ -56,3 +55,6 @@ dependencies: - xlrd - xlsxwriter - zstandard + + - pip: + - python-calamine diff --git a/ci/deps/actions-38-downstream_compat.yaml b/ci/deps/actions-38-downstream_compat.yaml index 53d0418775ca3..3345d02aa6443 100644 --- a/ci/deps/actions-38-downstream_compat.yaml +++ b/ci/deps/actions-38-downstream_compat.yaml @@ -44,7 +44,6 @@ dependencies: - pymysql - pyreadstat - pytables - - python-calamine - python-snappy - pyxlsb - s3fs>=2021.08.0 @@ -70,3 +69,6 @@ dependencies: - pandas-gbq - pyyaml - py + + - pip: + - python-calamine diff --git a/ci/deps/actions-38-minimum_versions.yaml b/ci/deps/actions-38-minimum_versions.yaml index de7e793c46d19..9ecd92ea44ac6 100644 --- a/ci/deps/actions-38-minimum_versions.yaml +++ b/ci/deps/actions-38-minimum_versions.yaml @@ -61,3 +61,4 @@ dependencies: - pip: - pyqt5==5.15.1 + - python-calamine==0.0.5 diff --git a/ci/deps/actions-38.yaml b/ci/deps/actions-38.yaml index d80730d16f9f7..ab86aa2d8ab8c 100644 --- a/ci/deps/actions-38.yaml +++ b/ci/deps/actions-38.yaml @@ -55,3 +55,6 @@ dependencies: - xlrd - xlsxwriter - zstandard + + - pip: + - python-calamine diff --git a/ci/deps/actions-39.yaml b/ci/deps/actions-39.yaml index 5a0df449e7bc6..f7f867fbc3839 100644 --- a/ci/deps/actions-39.yaml +++ b/ci/deps/actions-39.yaml @@ -44,7 +44,6 @@ dependencies: - pyarrow - pyreadstat - pytables - - python-calamine - python-snappy - pyxlsb - s3fs>=2021.08.0 @@ -56,3 +55,6 @@ dependencies: - xlrd - xlsxwriter - zstandard + + - pip: + - python-calamine diff --git a/ci/deps/circle-38-arm64.yaml b/ci/deps/circle-38-arm64.yaml index c94ce79ea2ff8..9f29a42a40ad0 100644 --- a/ci/deps/circle-38-arm64.yaml +++ b/ci/deps/circle-38-arm64.yaml @@ -55,3 +55,6 @@ dependencies: - xlrd - xlsxwriter - zstandard + + - pip: + - python-calamine diff --git a/pandas/compat/_optional.py b/pandas/compat/_optional.py index 9bd4b384fadb0..6554a316b1150 100644 --- a/pandas/compat/_optional.py 
+++ b/pandas/compat/_optional.py @@ -48,6 +48,7 @@ "tzdata": "2022.1", "qtpy": "2.2.0", "pyqt5": "5.15.1", + "python-calamine": "0.0.5", } # A mapping from import name to package name (on PyPI) for packages where diff --git a/pyproject.toml b/pyproject.toml index 385c1beb08121..7c6a6ba70a60b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -104,6 +104,7 @@ all = ['beautifulsoup4>=4.9.3', 'pytest>=6.0', 'pytest-xdist>=1.31', 'pytest-asyncio>=0.17.0', + 'python-calamine>=0.0.5', 'python-snappy>=0.6.0', 'pyxlsb>=1.0.8', 'qtpy>=2.2.0', From efcb2fcead8e3985e1664f903ee614d681ff1756 Mon Sep 17 00:00:00 2001 From: Kostya Farber Date: Sat, 7 Jan 2023 15:04:51 +0000 Subject: [PATCH 04/18] ENH: fix typo in engine declaration, add import_optional_dependency, fix actions-38.yaml --- ci/deps/actions-38.yaml | 1 - pandas/io/excel/_base.py | 2 +- pandas/io/excel/_calamitereader.py | 16 ++++++++-------- 3 files changed, 9 insertions(+), 10 deletions(-) diff --git a/ci/deps/actions-38.yaml b/ci/deps/actions-38.yaml index ab86aa2d8ab8c..f76ea07ae8f2e 100644 --- a/ci/deps/actions-38.yaml +++ b/ci/deps/actions-38.yaml @@ -44,7 +44,6 @@ dependencies: - pymysql - pyreadstat - pytables - - python-calamine - python-snappy - pyxlsb - s3fs>=2021.08.0 diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 493b057dbc7e7..e366a95728b81 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -1446,7 +1446,7 @@ class ExcelFile: "openpyxl": OpenpyxlReader, "odf": ODFReader, "pyxlsb": PyxlsbReader, - "calamite": CalamineExcelReader, + "calamine": CalamineExcelReader, } def __init__( diff --git a/pandas/io/excel/_calamitereader.py b/pandas/io/excel/_calamitereader.py index 97d33776db2dd..d279fb9709737 100644 --- a/pandas/io/excel/_calamitereader.py +++ b/pandas/io/excel/_calamitereader.py @@ -8,10 +8,7 @@ from tempfile import NamedTemporaryFile from typing import Any -from python_calamine import ( - get_sheet_data, - get_sheet_names, -) +from 
pandas.compat._optional import import_optional_dependency from pandas.io.excel._base import ( BaseExcelReader, @@ -26,6 +23,7 @@ class __calamine__: class CalamineExcelReader(BaseExcelReader): book: str _sheet_names: list[str] | None = None + import_optional_dependency("python_calamine") @property def _workbook_class(self) -> type[__calamine__]: @@ -48,11 +46,15 @@ def load_workbook( assert isinstance(filepath_or_buffer, str) + from python_calamine import get_sheet_names + self._sheet_names = get_sheet_names(filepath_or_buffer) return filepath_or_buffer @property def sheet_names(self) -> list[str]: + from python_calamine import get_sheet_names + if self._sheet_names is None: self._sheet_names = get_sheet_names(self.book) return self._sheet_names @@ -61,9 +63,7 @@ def get_sheet_by_name(self, name: str) -> int: self.raise_if_bad_sheet_by_name(name) return self.sheet_names.index(name) - def get_sheet_by_index(self, index: int) -> int: - self.raise_if_bad_sheet_by_index(index) - return index - def get_sheet_data(self, sheet: int, convert_float: bool) -> list[list[Any]]: + from python_calamine import get_sheet_data + return get_sheet_data(self.book, sheet) From 6b50e0c1999493fb9062b2ac252de87a07c21500 Mon Sep 17 00:00:00 2001 From: Dmitriy Date: Wed, 11 Jan 2023 15:36:56 +0600 Subject: [PATCH 05/18] calamite -> calamine, updated some tests for calamine --- ci/deps/actions-38-minimum_versions.yaml | 2 +- doc/source/whatsnew/v2.0.0.rst | 2 +- pandas/compat/_optional.py | 3 ++- pandas/io/excel/_base.py | 2 +- ...{_calamitereader.py => _calaminereader.py} | 12 ++++++++++- pandas/tests/io/excel/test_readers.py | 21 +++++++++++++------ pyproject.toml | 2 +- 7 files changed, 32 insertions(+), 12 deletions(-) rename pandas/io/excel/{_calamitereader.py => _calaminereader.py} (82%) diff --git a/ci/deps/actions-38-minimum_versions.yaml b/ci/deps/actions-38-minimum_versions.yaml index 9ecd92ea44ac6..dfa45a9668e2c 100644 --- a/ci/deps/actions-38-minimum_versions.yaml +++ 
b/ci/deps/actions-38-minimum_versions.yaml @@ -61,4 +61,4 @@ dependencies: - pip: - pyqt5==5.15.1 - - python-calamine==0.0.5 + - python-calamine==0.0.6 diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index e3eb5acd43608..8d147f35bf8bf 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -151,7 +151,7 @@ Other enhancements - Improved error message in :func:`to_datetime` for non-ISO8601 formats, informing users about the position of the first error (:issue:`50361`) - Improved error message when trying to align :class:`DataFrame` objects (for example, in :func:`DataFrame.compare`) to clarify that "identically labelled" refers to both index and columns (:issue:`50083`) - Performance improvement in :func:`to_datetime` when format is given or can be inferred (:issue:`50465`) -- Added ``calamite`` as an engine to ``read_excel`` (:issue: ``50395``) +- Added ``calamine`` as an engine to ``read_excel`` (:issue: ``50395``) - Added :meth:`DatetimeIndex.as_unit` and :meth:`TimedeltaIndex.as_unit` to convert to different resolutions; supported resolutions are "s", "ms", "us", and "ns" (:issue:`50616`) .. 
--------------------------------------------------------------------------- diff --git a/pandas/compat/_optional.py b/pandas/compat/_optional.py index 6554a316b1150..408d6ef8c599a 100644 --- a/pandas/compat/_optional.py +++ b/pandas/compat/_optional.py @@ -34,6 +34,7 @@ "pyarrow": "6.0.0", "pyreadstat": "1.1.2", "pytest": "6.0", + "python-calamine": "0.0.6", "pyxlsb": "1.0.8", "s3fs": "2021.08.0", "scipy": "1.7.1", @@ -48,7 +49,6 @@ "tzdata": "2022.1", "qtpy": "2.2.0", "pyqt5": "5.15.1", - "python-calamine": "0.0.5", } # A mapping from import name to package name (on PyPI) for packages where @@ -62,6 +62,7 @@ "lxml.etree": "lxml", "odf": "odfpy", "pandas_gbq": "pandas-gbq", + "python_calamine": "python-calamine", "snappy": "python-snappy", "sqlalchemy": "SQLAlchemy", "tables": "pytables", diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index e366a95728b81..2c8386bea9ebf 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -1435,7 +1435,7 @@ class ExcelFile: This is not supported, switch to using ``openpyxl`` instead. 
""" - from pandas.io.excel._calamitereader import CalamineExcelReader + from pandas.io.excel._calaminereader import CalamineExcelReader from pandas.io.excel._odfreader import ODFReader from pandas.io.excel._openpyxl import OpenpyxlReader from pandas.io.excel._pyxlsb import PyxlsbReader diff --git a/pandas/io/excel/_calamitereader.py b/pandas/io/excel/_calaminereader.py similarity index 82% rename from pandas/io/excel/_calamitereader.py rename to pandas/io/excel/_calaminereader.py index d279fb9709737..06f6db0b7fd2c 100644 --- a/pandas/io/excel/_calamitereader.py +++ b/pandas/io/excel/_calaminereader.py @@ -8,6 +8,7 @@ from tempfile import NamedTemporaryFile from typing import Any +from pandas._typing import StorageOptions from pandas.compat._optional import import_optional_dependency from pandas.io.excel._base import ( @@ -23,7 +24,12 @@ class __calamine__: class CalamineExcelReader(BaseExcelReader): book: str _sheet_names: list[str] | None = None - import_optional_dependency("python_calamine") + + def __init__( + self, filepath_or_buffer, storage_options: StorageOptions = None + ) -> None: + import_optional_dependency("python_calamine") + super().__init__(filepath_or_buffer, storage_options=storage_options) @property def _workbook_class(self) -> type[__calamine__]: @@ -63,6 +69,10 @@ def get_sheet_by_name(self, name: str) -> int: self.raise_if_bad_sheet_by_name(name) return self.sheet_names.index(name) + def get_sheet_by_index(self, index: int) -> int: + self.raise_if_bad_sheet_by_index(index) + return index + def get_sheet_data(self, sheet: int, convert_float: bool) -> list[list[Any]]: from python_calamine import get_sheet_data diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index 5baf9b563e6f6..68a641c3c81eb 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -53,7 +53,7 @@ ), pytest.param("pyxlsb", marks=td.skip_if_no("pyxlsb")), pytest.param("odf", 
marks=td.skip_if_no("odf")), - pytest.param("calamite"), + pytest.param("calamine", marks=td.skip_if_no("python_calamine")), ] @@ -67,11 +67,11 @@ def _is_valid_engine_ext_pair(engine, read_ext: str) -> bool: return False if engine == "odf" and read_ext != ".ods": return False - if read_ext == ".ods" and engine != "odf": + if read_ext == ".ods" and engine not in {"odf", "calamine"}: return False if engine == "pyxlsb" and read_ext != ".xlsb": return False - if read_ext == ".xlsb" and engine != "pyxlsb": + if read_ext == ".xlsb" and engine not in {"pyxlsb", "calamine"}: return False if engine == "xlrd" and read_ext != ".xls": return False @@ -835,6 +835,11 @@ def test_corrupt_bytes_raises(self, engine): "Unsupported format, or corrupt file: Expected BOF " "record; found b'foo'" ) + elif engine == "calamine": + import python_calamine + + error = python_calamine._python_calamine.CalamineError + msg = "Cannot detect file format" else: error = BadZipFile msg = "File is not a zip file" @@ -1375,7 +1380,7 @@ def test_trailing_blanks(self, read_ext): def test_ignore_chartsheets_by_str(self, request, engine, read_ext): # GH 41448 - if engine == "odf": + if read_ext == ".ods": pytest.skip("chartsheets do not exist in the ODF format") if engine == "pyxlsb": request.node.add_marker( @@ -1388,7 +1393,7 @@ def test_ignore_chartsheets_by_str(self, request, engine, read_ext): def test_ignore_chartsheets_by_int(self, request, engine, read_ext): # GH 41448 - if engine == "odf": + if read_ext == ".ods": pytest.skip("chartsheets do not exist in the ODF format") if engine == "pyxlsb": request.node.add_marker( @@ -1657,7 +1662,7 @@ def test_engine_invalid_option(self, read_ext): def test_ignore_chartsheets(self, request, engine, read_ext): # GH 41448 - if engine == "odf": + if read_ext == ".ods": pytest.skip("chartsheets do not exist in the ODF format") if engine == "pyxlsb": request.node.add_marker( @@ -1677,6 +1682,10 @@ def test_corrupt_files_closed(self, engine, read_ext): import 
xlrd errors = (BadZipFile, xlrd.biffh.XLRDError) + elif engine == "calamine": + import python_calamine + + errors = (python_calamine._python_calamine.CalamineError,) with tm.ensure_clean(f"corrupt{read_ext}") as file: Path(file).write_text("corrupt") diff --git a/pyproject.toml b/pyproject.toml index 0368e40f938f5..198a5ced0d288 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -104,7 +104,7 @@ all = ['beautifulsoup4>=4.9.3', 'pytest>=6.0', 'pytest-xdist>=1.31', 'pytest-asyncio>=0.17.0', - 'python-calamine>=0.0.5', + 'python-calamine>=0.0.6', 'python-snappy>=0.6.0', 'pyxlsb>=1.0.8', 'qtpy>=2.2.0', From 0784733b2f4ebc72f9aeb09fac839ebec5d8ac59 Mon Sep 17 00:00:00 2001 From: Dmitriy Date: Thu, 12 Jan 2023 11:02:52 +0600 Subject: [PATCH 06/18] calamine excel engine: skip tests with datetime --- pandas/tests/io/excel/test_readers.py | 114 +++++++++++++------------- 1 file changed, 57 insertions(+), 57 deletions(-) diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index 68a641c3c81eb..c17e6db26c26d 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -168,11 +168,11 @@ def test_usecols_int(self, read_ext): usecols=3, ) - def test_usecols_list(self, request, read_ext, df_ref): - if read_ext == ".xlsb": + def test_usecols_list(self, request, engine, read_ext, df_ref): + if engine in {"calamine", "pyxlsb"}: request.node.add_marker( pytest.mark.xfail( - reason="Sheets containing datetimes not supported by pyxlsb" + reason=f"Sheets containing datetimes not supported by {engine}" ) ) @@ -192,11 +192,11 @@ def test_usecols_list(self, request, read_ext, df_ref): tm.assert_frame_equal(df1, df_ref, check_names=False) tm.assert_frame_equal(df2, df_ref, check_names=False) - def test_usecols_str(self, request, read_ext, df_ref): - if read_ext == ".xlsb": + def test_usecols_str(self, request, engine, read_ext, df_ref): + if engine in {"calamine", "pyxlsb"}: request.node.add_marker( 
pytest.mark.xfail( - reason="Sheets containing datetimes not supported by pyxlsb" + reason=f"Sheets containing datetimes not supported by {engine}" ) ) @@ -249,12 +249,12 @@ def test_usecols_str(self, request, read_ext, df_ref): "usecols", [[0, 1, 3], [0, 3, 1], [1, 0, 3], [1, 3, 0], [3, 0, 1], [3, 1, 0]] ) def test_usecols_diff_positional_int_columns_order( - self, request, read_ext, usecols, df_ref + self, request, engine, read_ext, usecols, df_ref ): - if read_ext == ".xlsb": + if engine in {"calamine", "pyxlsb"}: request.node.add_marker( pytest.mark.xfail( - reason="Sheets containing datetimes not supported by pyxlsb" + reason=f"Sheets containing datetimes not supported by {engine}" ) ) @@ -272,11 +272,11 @@ def test_usecols_diff_positional_str_columns_order(self, read_ext, usecols, df_r result = pd.read_excel("test1" + read_ext, sheet_name="Sheet1", usecols=usecols) tm.assert_frame_equal(result, expected, check_names=False) - def test_read_excel_without_slicing(self, request, read_ext, df_ref): - if read_ext == ".xlsb": + def test_read_excel_without_slicing(self, request, engine, read_ext, df_ref): + if engine in {"calamine", "pyxlsb"}: request.node.add_marker( pytest.mark.xfail( - reason="Sheets containing datetimes not supported by pyxlsb" + reason=f"Sheets containing datetimes not supported by {engine}" ) ) @@ -284,11 +284,11 @@ def test_read_excel_without_slicing(self, request, read_ext, df_ref): result = pd.read_excel("test1" + read_ext, sheet_name="Sheet1", index_col=0) tm.assert_frame_equal(result, expected, check_names=False) - def test_usecols_excel_range_str(self, request, read_ext, df_ref): - if read_ext == ".xlsb": + def test_usecols_excel_range_str(self, request, engine, read_ext, df_ref): + if engine in {"calamine", "pyxlsb"}: request.node.add_marker( pytest.mark.xfail( - reason="Sheets containing datetimes not supported by pyxlsb" + reason=f"Sheets containing datetimes not supported by {engine}" ) ) @@ -364,11 +364,11 @@ def 
test_excel_stop_iterator(self, read_ext): expected = DataFrame([["aaaa", "bbbbb"]], columns=["Test", "Test1"]) tm.assert_frame_equal(parsed, expected) - def test_excel_cell_error_na(self, request, read_ext): - if read_ext == ".xlsb": + def test_excel_cell_error_na(self, request, engine, read_ext): + if engine in {"calamine", "pyxlsb"}: request.node.add_marker( pytest.mark.xfail( - reason="Sheets containing datetimes not supported by pyxlsb" + reason=f"Sheets containing datetimes not supported by {engine}" ) ) @@ -376,11 +376,11 @@ def test_excel_cell_error_na(self, request, read_ext): expected = DataFrame([[np.nan]], columns=["Test"]) tm.assert_frame_equal(parsed, expected) - def test_excel_table(self, request, read_ext, df_ref): - if read_ext == ".xlsb": + def test_excel_table(self, request, engine, read_ext, df_ref): + if engine in {"calamine", "pyxlsb"}: request.node.add_marker( pytest.mark.xfail( - reason="Sheets containing datetimes not supported by pyxlsb" + reason=f"Sheets containing datetimes not supported by {engine}" ) ) @@ -397,11 +397,11 @@ def test_excel_table(self, request, read_ext, df_ref): ) tm.assert_frame_equal(df3, df1.iloc[:-1]) - def test_reader_special_dtypes(self, request, read_ext): - if read_ext == ".xlsb": + def test_reader_special_dtypes(self, request, engine, read_ext): + if engine in {"calamine", "pyxlsb"}: request.node.add_marker( pytest.mark.xfail( - reason="Sheets containing datetimes not supported by pyxlsb" + reason=f"Sheets containing datetimes not supported by {engine}" ) ) @@ -741,10 +741,10 @@ def test_exception_message_includes_sheet_name(self, read_ext): @pytest.mark.filterwarnings("ignore:Cell A4 is marked:UserWarning:openpyxl") def test_date_conversion_overflow(self, request, engine, read_ext): # GH 10001 : pandas.ExcelFile ignore parse_dates=False - if engine == "pyxlsb": + if engine in {"calamine", "pyxlsb"}: request.node.add_marker( pytest.mark.xfail( - reason="Sheets containing datetimes not supported by pyxlsb" + 
reason=f"Sheets containing datetimes not supported by {engine}" ) ) @@ -771,11 +771,11 @@ def test_date_conversion_overflow(self, request, engine, read_ext): result = pd.read_excel("testdateoverflow" + read_ext) tm.assert_frame_equal(result, expected) - def test_sheet_name(self, request, read_ext, df_ref): - if read_ext == ".xlsb": + def test_sheet_name(self, request, read_ext, engine, df_ref): + if engine in {"calamine", "pyxlsb"}: request.node.add_marker( pytest.mark.xfail( - reason="Sheets containing datetimes not supported by pyxlsb" + reason=f"Sheets containing datetimes not supported by {engine}" ) ) filename = "test1" @@ -946,10 +946,10 @@ def test_close_from_py_localpath(self, read_ext): f.read() def test_reader_seconds(self, request, engine, read_ext): - if engine == "pyxlsb": + if engine in {"calamine", "pyxlsb"}: request.node.add_marker( pytest.mark.xfail( - reason="Sheets containing datetimes not supported by pyxlsb" + reason=f"Sheets containing datetimes not supported by {engine}" ) ) @@ -978,12 +978,12 @@ def test_reader_seconds(self, request, engine, read_ext): actual = pd.read_excel("times_1904" + read_ext, sheet_name="Sheet1") tm.assert_frame_equal(actual, expected) - def test_read_excel_multiindex(self, request, read_ext): + def test_read_excel_multiindex(self, request, engine, read_ext): # see gh-4679 - if read_ext == ".xlsb": + if engine in {"calamine", "pyxlsb"}: request.node.add_marker( pytest.mark.xfail( - reason="Sheets containing datetimes not supported by pyxlsb" + reason=f"Sheets containing datetimes not supported by {engine}" ) ) @@ -1072,13 +1072,13 @@ def test_read_excel_multiindex(self, request, read_ext): ], ) def test_read_excel_multiindex_blank_after_name( - self, request, read_ext, sheet_name, idx_lvl2 + self, request, engine, read_ext, sheet_name, idx_lvl2 ): # GH34673 - if read_ext == ".xlsb": + if engine in {"calamine", "pyxlsb"}: request.node.add_marker( pytest.mark.xfail( - reason="Sheets containing datetimes not supported by 
pyxlsb (GH4679" + reason=f"Sheets containing datetimes not supported by {engine} (GH4679)" ) ) @@ -1195,12 +1195,12 @@ def test_read_excel_bool_header_arg(self, read_ext): with pytest.raises(TypeError, match=msg): pd.read_excel("test1" + read_ext, header=arg) - def test_read_excel_skiprows(self, request, read_ext): + def test_read_excel_skiprows(self, request, engine, read_ext): # GH 4903 - if read_ext == ".xlsb": + if engine in {"calamine", "pyxlsb"}: request.node.add_marker( pytest.mark.xfail( - reason="Sheets containing datetimes not supported by pyxlsb" + reason=f"Sheets containing datetimes not supported by {engine}" ) ) @@ -1250,12 +1250,12 @@ def test_read_excel_skiprows(self, request, read_ext): ) tm.assert_frame_equal(actual, expected) - def test_read_excel_skiprows_callable_not_in(self, request, read_ext): + def test_read_excel_skiprows_callable_not_in(self, request, engine, read_ext): # GH 4903 - if read_ext == ".xlsb": + if engine in {"calamine", "pyxlsb"}: request.node.add_marker( pytest.mark.xfail( - reason="Sheets containing datetimes not supported by pyxlsb" + reason=f"Sheets containing datetimes not supported by {engine}" ) ) @@ -1382,10 +1382,10 @@ def test_ignore_chartsheets_by_str(self, request, engine, read_ext): # GH 41448 if read_ext == ".ods": pytest.skip("chartsheets do not exist in the ODF format") - if engine == "pyxlsb": + if engine in {"pyxlsb", "calamine"}: request.node.add_marker( pytest.mark.xfail( - reason="pyxlsb can't distinguish chartsheets from worksheets" + reason=f"{engine} can't distinguish chartsheets from worksheets" ) ) with pytest.raises(ValueError, match="Worksheet named 'Chart1' not found"): @@ -1395,10 +1395,10 @@ def test_ignore_chartsheets_by_int(self, request, engine, read_ext): # GH 41448 if read_ext == ".ods": pytest.skip("chartsheets do not exist in the ODF format") - if engine == "pyxlsb": + if engine in {"pyxlsb", "calamine"}: request.node.add_marker( pytest.mark.xfail( - reason="pyxlsb can't distinguish 
chartsheets from worksheets" + reason=f"{engine} can't distinguish chartsheets from worksheets" ) ) with pytest.raises( @@ -1511,11 +1511,11 @@ def test_excel_passes_na_filter(self, read_ext, na_filter): expected = DataFrame(expected, columns=["Test"]) tm.assert_frame_equal(parsed, expected) - def test_excel_table_sheet_by_index(self, request, read_ext, df_ref): - if read_ext == ".xlsb": + def test_excel_table_sheet_by_index(self, request, engine, read_ext, df_ref): + if engine in {"calamine", "pyxlsb"}: request.node.add_marker( pytest.mark.xfail( - reason="Sheets containing datetimes not supported by pyxlsb" + reason=f"Sheets containing datetimes not supported by {engine}" ) ) @@ -1540,11 +1540,11 @@ def test_excel_table_sheet_by_index(self, request, read_ext, df_ref): tm.assert_frame_equal(df3, df1.iloc[:-1]) - def test_sheet_name(self, request, read_ext, df_ref): - if read_ext == ".xlsb": + def test_sheet_name(self, request, engine, read_ext, df_ref): + if engine in {"calamine", "pyxlsb"}: request.node.add_marker( pytest.mark.xfail( - reason="Sheets containing datetimes not supported by pyxlsb" + reason=f"Sheets containing datetimes not supported by {engine}" ) ) @@ -1633,10 +1633,10 @@ def test_header_with_index_col(self, filename): def test_read_datetime_multiindex(self, request, engine, read_ext): # GH 34748 - if engine == "pyxlsb": + if engine in {"calamine", "pyxlsb"}: request.node.add_marker( pytest.mark.xfail( - reason="Sheets containing datetimes not supported by pyxlsb" + reason=f"Sheets containing datetimes not supported by {engine}" ) ) @@ -1664,10 +1664,10 @@ def test_ignore_chartsheets(self, request, engine, read_ext): # GH 41448 if read_ext == ".ods": pytest.skip("chartsheets do not exist in the ODF format") - if engine == "pyxlsb": + if engine in {"pyxlsb", "calamine"}: request.node.add_marker( pytest.mark.xfail( - reason="pyxlsb can't distinguish chartsheets from worksheets" + reason=f"{engine} can't distinguish chartsheets from worksheets" ) ) 
with pd.ExcelFile("chartsheet" + read_ext) as excel: From 038133e41a4ba87bc83dff539725a067f9578e82 Mon Sep 17 00:00:00 2001 From: Kostya Farber Date: Sun, 22 Jan 2023 10:30:44 +0000 Subject: [PATCH 07/18] ENH: change reader filename match library, fix typo in engine name in tests --- pandas/io/excel/_base.py | 2 +- pandas/io/excel/{_calamitereader.py => _calaminereader.py} | 0 pandas/tests/io/excel/test_readers.py | 2 +- 3 files changed, 2 insertions(+), 2 deletions(-) rename pandas/io/excel/{_calamitereader.py => _calaminereader.py} (100%) diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index e366a95728b81..2c8386bea9ebf 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -1435,7 +1435,7 @@ class ExcelFile: This is not supported, switch to using ``openpyxl`` instead. """ - from pandas.io.excel._calamitereader import CalamineExcelReader + from pandas.io.excel._calaminereader import CalamineExcelReader from pandas.io.excel._odfreader import ODFReader from pandas.io.excel._openpyxl import OpenpyxlReader from pandas.io.excel._pyxlsb import PyxlsbReader diff --git a/pandas/io/excel/_calamitereader.py b/pandas/io/excel/_calaminereader.py similarity index 100% rename from pandas/io/excel/_calamitereader.py rename to pandas/io/excel/_calaminereader.py diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index 5f32883524a31..e7508870fac24 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -52,7 +52,7 @@ ), pytest.param("pyxlsb", marks=td.skip_if_no("pyxlsb")), pytest.param("odf", marks=td.skip_if_no("odf")), - pytest.param("calamite"), + pytest.param("calamine", marks=td.skip_if_no("calamine")), ] From a6140897abef0bfd890c8f0b214e0beea51b1052 Mon Sep 17 00:00:00 2001 From: Kostya Farber Date: Sun, 22 Jan 2023 15:52:34 +0000 Subject: [PATCH 08/18] ENH: add back get_sheet_by_index --- pandas/io/excel/_calaminereader.py | 5 +++++ 1 file changed, 5 
insertions(+) diff --git a/pandas/io/excel/_calaminereader.py b/pandas/io/excel/_calaminereader.py index 7eeec89f091df..789e41e324643 100644 --- a/pandas/io/excel/_calaminereader.py +++ b/pandas/io/excel/_calaminereader.py @@ -9,6 +9,7 @@ from typing import Any from pandas._typing import StorageOptions +from pandas.compat._optional import import_optional_dependency from pandas.io.excel._base import ( BaseExcelReader, @@ -68,6 +69,10 @@ def get_sheet_by_name(self, name: str) -> int: self.raise_if_bad_sheet_by_name(name) return self.sheet_names.index(name) + def get_sheet_by_index(self, index: int) -> int: + self.raise_if_bad_sheet_by_index(index) + return index + def get_sheet_data(self, sheet: int, convert_float: bool) -> list[list[Any]]: from python_calamine import get_sheet_data From 9fc22092809136bd71887496519e20b82dee045f Mon Sep 17 00:00:00 2001 From: Kostya Farber Date: Tue, 24 Jan 2023 21:13:40 +0000 Subject: [PATCH 09/18] ENH: fix mypy and trailing whitespace --- pandas/io/excel/_calaminereader.py | 8 +++++--- pandas/tests/io/excel/test_readers.py | 5 +++-- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/pandas/io/excel/_calaminereader.py b/pandas/io/excel/_calaminereader.py index 789e41e324643..d107c1d89cc39 100644 --- a/pandas/io/excel/_calaminereader.py +++ b/pandas/io/excel/_calaminereader.py @@ -6,9 +6,11 @@ ) from pathlib import PurePath from tempfile import NamedTemporaryFile -from typing import Any -from pandas._typing import StorageOptions +from pandas._typing import ( + Scalar, + StorageOptions, +) from pandas.compat._optional import import_optional_dependency from pandas.io.excel._base import ( @@ -73,7 +75,7 @@ def get_sheet_by_index(self, index: int) -> int: self.raise_if_bad_sheet_by_index(index) return index - def get_sheet_data(self, sheet: int, convert_float: bool) -> list[list[Any]]: + def get_sheet_data(self, sheet: int, convert_float: bool) -> list[list[Scalar]]: from python_calamine import get_sheet_data return 
get_sheet_data(self.book, sheet) diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index 606edd2303c2c..c43fa0d95ca5a 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -836,7 +836,7 @@ def test_corrupt_bytes_raises(self, engine): ) elif engine == "calamine": import python_calamine - + error = python_calamine._python_calamine.CalamineError msg = "Cannot detect file format" else: @@ -1077,7 +1077,8 @@ def test_read_excel_multiindex_blank_after_name( if engine in {"calamine", "pyxlsb"}: request.node.add_marker( pytest.mark.xfail( - reason=f"Sheets containing datetimes not supported by {engine} (GH4679)" + reason="Sheets containing datetimes" + f"not supported by {engine} (GH4679)" ) ) From 9019904bdf7a300d1c52408c2d5a8b8d81cc3ef6 Mon Sep 17 00:00:00 2001 From: Dmitriy <3132181+dimastbk@users.noreply.github.com> Date: Thu, 26 Jan 2023 13:03:06 +0600 Subject: [PATCH 10/18] added conversion date/time/float, support file_rows_needed, fixed support s3object?, more accuracy xfail in tests --- pandas/io/excel/_calaminereader.py | 65 +++++++--- pandas/tests/io/excel/test_readers.py | 180 +++++++++++++++++++++----- 2 files changed, 192 insertions(+), 53 deletions(-) diff --git a/pandas/io/excel/_calaminereader.py b/pandas/io/excel/_calaminereader.py index d107c1d89cc39..af7d2016648c6 100644 --- a/pandas/io/excel/_calaminereader.py +++ b/pandas/io/excel/_calaminereader.py @@ -1,23 +1,31 @@ from __future__ import annotations -from io import ( - BufferedReader, - BytesIO, +from datetime import ( + date, + datetime, + time, ) -from pathlib import PurePath from tempfile import NamedTemporaryFile +from typing import Union from pandas._typing import ( + FilePath, + ReadBuffer, Scalar, StorageOptions, ) from pandas.compat._optional import import_optional_dependency +import pandas as pd + +from pandas.io.common import stringify_path from pandas.io.excel._base import ( BaseExcelReader, 
inspect_excel_format, ) +ValueT = Union[int, float, str, bool, time, date, datetime] + class __calamine__: pass @@ -28,7 +36,9 @@ class CalamineExcelReader(BaseExcelReader): _sheet_names: list[str] | None = None def __init__( - self, filepath_or_buffer, storage_options: StorageOptions = None + self, + filepath_or_buffer: FilePath | ReadBuffer[bytes], + storage_options: StorageOptions = None, ) -> None: import_optional_dependency("python_calamine") super().__init__(filepath_or_buffer, storage_options=storage_options) @@ -37,20 +47,15 @@ def __init__( def _workbook_class(self) -> type[__calamine__]: return __calamine__ - def load_workbook( - self, filepath_or_buffer: str | PurePath | BufferedReader | BytesIO - ) -> str: - if isinstance(filepath_or_buffer, BufferedReader): - filepath_or_buffer = filepath_or_buffer.name - - elif isinstance(filepath_or_buffer, BytesIO): + def load_workbook(self, filepath_or_buffer) -> str: + if hasattr(filepath_or_buffer, "read") and hasattr(filepath_or_buffer, "seek"): ext = inspect_excel_format(filepath_or_buffer) with NamedTemporaryFile(suffix=f".{ext}", delete=False) as tmp_file: - tmp_file.write(filepath_or_buffer.getvalue()) + filepath_or_buffer.seek(0) + tmp_file.write(filepath_or_buffer.read()) filepath_or_buffer = tmp_file.name - - elif isinstance(filepath_or_buffer, PurePath): - filepath_or_buffer = filepath_or_buffer.as_posix() + else: + filepath_or_buffer = stringify_path(filepath_or_buffer) assert isinstance(filepath_or_buffer, str) @@ -75,7 +80,31 @@ def get_sheet_by_index(self, index: int) -> int: self.raise_if_bad_sheet_by_index(index) return index - def get_sheet_data(self, sheet: int, convert_float: bool) -> list[list[Scalar]]: + def get_sheet_data( + self, sheet: int, file_rows_needed: int | None = None + ) -> list[list[Scalar]]: + def _convert_cell(value: ValueT) -> Scalar: + if isinstance(value, float): + val = int(value) + if val == value: + return val + else: + return value + elif isinstance(value, date): + return 
pd.Timestamp(value) + elif isinstance(value, time): + return value.isoformat() + + return value + from python_calamine import get_sheet_data - return get_sheet_data(self.book, sheet) + rows = get_sheet_data(self.book, sheet) + data: list[list[Scalar]] = [] + + for row in rows: + data.append([_convert_cell(cell) for cell in row]) + if file_rows_needed is not None and len(data) >= file_rows_needed: + break + + return data diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index 87191f451fb29..4e408a72db7c6 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -168,10 +168,16 @@ def test_usecols_int(self, read_ext): ) def test_usecols_list(self, request, engine, read_ext, df_ref): - if engine in {"calamine", "pyxlsb"}: + if engine == "pyxlsb": request.node.add_marker( pytest.mark.xfail( - reason=f"Sheets containing datetimes not supported by {engine}" + reason="Sheets containing datetimes not supported by pyxlsb" + ) + ) + if engine == "calamine" and read_ext in {".xls", ".xlsb", ".ods"}: + request.node.add_marker( + pytest.mark.xfail( + reason="Calamine support parsing datetime only in xlsx" ) ) @@ -192,10 +198,16 @@ def test_usecols_list(self, request, engine, read_ext, df_ref): tm.assert_frame_equal(df2, df_ref, check_names=False) def test_usecols_str(self, request, engine, read_ext, df_ref): - if engine in {"calamine", "pyxlsb"}: + if engine == "pyxlsb": + request.node.add_marker( + pytest.mark.xfail( + reason="Sheets containing datetimes not supported by pyxlsb" + ) + ) + if engine == "calamine" and read_ext in {".xls", ".xlsb", ".ods"}: request.node.add_marker( pytest.mark.xfail( - reason=f"Sheets containing datetimes not supported by {engine}" + reason="Calamine support parsing datetime only in xlsx" ) ) @@ -250,10 +262,16 @@ def test_usecols_str(self, request, engine, read_ext, df_ref): def test_usecols_diff_positional_int_columns_order( self, request, engine, read_ext, 
usecols, df_ref ): - if engine in {"calamine", "pyxlsb"}: + if engine == "pyxlsb": request.node.add_marker( pytest.mark.xfail( - reason=f"Sheets containing datetimes not supported by {engine}" + reason="Sheets containing datetimes not supported by pyxlsb" + ) + ) + if engine == "calamine" and read_ext in {".xls", ".xlsb", ".ods"}: + request.node.add_marker( + pytest.mark.xfail( + reason="Calamine support parsing datetime only in xlsx" ) ) @@ -272,10 +290,16 @@ def test_usecols_diff_positional_str_columns_order(self, read_ext, usecols, df_r tm.assert_frame_equal(result, expected, check_names=False) def test_read_excel_without_slicing(self, request, engine, read_ext, df_ref): - if engine in {"calamine", "pyxlsb"}: + if engine == "pyxlsb": request.node.add_marker( pytest.mark.xfail( - reason=f"Sheets containing datetimes not supported by {engine}" + reason="Sheets containing datetimes not supported by pyxlsb" + ) + ) + if engine == "calamine" and read_ext in {".xls", ".xlsb", ".ods"}: + request.node.add_marker( + pytest.mark.xfail( + reason="Calamine support parsing datetime only in xlsx" ) ) @@ -284,10 +308,16 @@ def test_read_excel_without_slicing(self, request, engine, read_ext, df_ref): tm.assert_frame_equal(result, expected, check_names=False) def test_usecols_excel_range_str(self, request, engine, read_ext, df_ref): - if engine in {"calamine", "pyxlsb"}: + if engine == "pyxlsb": request.node.add_marker( pytest.mark.xfail( - reason=f"Sheets containing datetimes not supported by {engine}" + reason="Sheets containing datetimes not supported by pyxlsb" + ) + ) + if engine == "calamine" and read_ext in {".xls", ".xlsb", ".ods"}: + request.node.add_marker( + pytest.mark.xfail( + reason="Calamine support parsing datetime only in xlsx" ) ) @@ -364,10 +394,16 @@ def test_excel_stop_iterator(self, read_ext): tm.assert_frame_equal(parsed, expected) def test_excel_cell_error_na(self, request, engine, read_ext): - if engine in {"calamine", "pyxlsb"}: + if engine == "pyxlsb": 
+ request.node.add_marker( + pytest.mark.xfail( + reason="Sheets containing datetimes not supported by pyxlsb" + ) + ) + if engine == "calamine" and read_ext in {".ods"}: request.node.add_marker( pytest.mark.xfail( - reason=f"Sheets containing datetimes not supported by {engine}" + reason="Calamine support parsing datetime only in xlsx" ) ) @@ -376,10 +412,16 @@ def test_excel_cell_error_na(self, request, engine, read_ext): tm.assert_frame_equal(parsed, expected) def test_excel_table(self, request, engine, read_ext, df_ref): - if engine in {"calamine", "pyxlsb"}: + if engine == "pyxlsb": + request.node.add_marker( + pytest.mark.xfail( + reason="Sheets containing datetimes not supported by pyxlsb" + ) + ) + if engine == "calamine" and read_ext in {".xls", ".xlsb", ".ods"}: request.node.add_marker( pytest.mark.xfail( - reason=f"Sheets containing datetimes not supported by {engine}" + reason="Calamine support parsing datetime only in xlsx" ) ) @@ -397,10 +439,16 @@ def test_excel_table(self, request, engine, read_ext, df_ref): tm.assert_frame_equal(df3, df1.iloc[:-1]) def test_reader_special_dtypes(self, request, engine, read_ext): - if engine in {"calamine", "pyxlsb"}: + if engine == "pyxlsb": request.node.add_marker( pytest.mark.xfail( - reason=f"Sheets containing datetimes not supported by {engine}" + reason="Sheets containing datetimes not supported by pyxlsb" + ) + ) + if engine == "calamine" and read_ext in {".xls", ".xlsb", ".ods"}: + request.node.add_marker( + pytest.mark.xfail( + reason="Calamine support parsing datetime only in xlsx" ) ) @@ -745,10 +793,16 @@ def test_exception_message_includes_sheet_name(self, read_ext): @pytest.mark.filterwarnings("ignore:Cell A4 is marked:UserWarning:openpyxl") def test_date_conversion_overflow(self, request, engine, read_ext): # GH 10001 : pandas.ExcelFile ignore parse_dates=False - if engine in {"calamine", "pyxlsb"}: + if engine == "pyxlsb": request.node.add_marker( pytest.mark.xfail( - reason=f"Sheets containing 
datetimes not supported by {engine}" + reason="Sheets containing datetimes not supported by pyxlsb" + ) + ) + if engine == "calamine" and read_ext in {".xls", ".xlsb", ".ods"}: + request.node.add_marker( + pytest.mark.xfail( + reason="Calamine support parsing datetime only in xlsx" ) ) @@ -776,12 +830,19 @@ def test_date_conversion_overflow(self, request, engine, read_ext): tm.assert_frame_equal(result, expected) def test_sheet_name(self, request, read_ext, engine, df_ref): - if engine in {"calamine", "pyxlsb"}: + if engine == "pyxlsb": request.node.add_marker( pytest.mark.xfail( - reason=f"Sheets containing datetimes not supported by {engine}" + reason="Sheets containing datetimes not supported by pyxlsb" ) ) + if engine == "calamine" and read_ext in {".xls", ".xlsb", ".ods"}: + request.node.add_marker( + pytest.mark.xfail( + reason="Calamine support parsing datetime only in xlsx" + ) + ) + filename = "test1" sheet_name = "Sheet1" @@ -950,10 +1011,16 @@ def test_close_from_py_localpath(self, read_ext): f.read() def test_reader_seconds(self, request, engine, read_ext): - if engine in {"calamine", "pyxlsb"}: + if engine == "pyxlsb": request.node.add_marker( pytest.mark.xfail( - reason=f"Sheets containing datetimes not supported by {engine}" + reason="Sheets containing datetimes not supported by pyxlsb" + ) + ) + if engine == "calamine" and read_ext in {".xls", ".xlsb", ".ods"}: + request.node.add_marker( + pytest.mark.xfail( + reason="Calamine support parsing datetime only in xlsx" ) ) @@ -984,10 +1051,16 @@ def test_reader_seconds(self, request, engine, read_ext): def test_read_excel_multiindex(self, request, engine, read_ext): # see gh-4679 - if engine in {"calamine", "pyxlsb"}: + if engine == "pyxlsb": + request.node.add_marker( + pytest.mark.xfail( + reason="Sheets containing datetimes not supported by pyxlsb" + ) + ) + if engine == "calamine" and read_ext in {".xls", ".xlsb", ".ods"}: request.node.add_marker( pytest.mark.xfail( - reason=f"Sheets containing 
datetimes not supported by {engine}" + reason="Calamine support parsing datetime only in xlsx" ) ) @@ -1079,13 +1152,19 @@ def test_read_excel_multiindex_blank_after_name( self, request, engine, read_ext, sheet_name, idx_lvl2 ): # GH34673 - if engine in {"calamine", "pyxlsb"}: + if engine == "pyxlsb": request.node.add_marker( pytest.mark.xfail( reason="Sheets containing datetimes" f"not supported by {engine} (GH4679)" ) ) + if engine == "calamine" and read_ext in {".xls", ".xlsb", ".ods"}: + request.node.add_marker( + pytest.mark.xfail( + reason="Calamine support parsing datetime only in xlsx" + ) + ) mi_file = "testmultiindex" + read_ext mi = MultiIndex.from_product([["foo", "bar"], ["a", "b"]], names=["c1", "c2"]) @@ -1202,10 +1281,16 @@ def test_read_excel_bool_header_arg(self, read_ext): def test_read_excel_skiprows(self, request, engine, read_ext): # GH 4903 - if engine in {"calamine", "pyxlsb"}: + if engine == "pyxlsb": + request.node.add_marker( + pytest.mark.xfail( + reason="Sheets containing datetimes not supported by pyxlsb" + ) + ) + if engine == "calamine" and read_ext in {".xls", ".xlsb", ".ods"}: request.node.add_marker( pytest.mark.xfail( - reason=f"Sheets containing datetimes not supported by {engine}" + reason="Calamine support parsing datetime only in xlsx" ) ) @@ -1257,10 +1342,16 @@ def test_read_excel_skiprows(self, request, engine, read_ext): def test_read_excel_skiprows_callable_not_in(self, request, engine, read_ext): # GH 4903 - if engine in {"calamine", "pyxlsb"}: + if engine == "pyxlsb": request.node.add_marker( pytest.mark.xfail( - reason=f"Sheets containing datetimes not supported by {engine}" + reason="Sheets containing datetimes not supported by pyxlsb" + ) + ) + if engine == "calamine" and read_ext in {".xls", ".xlsb", ".ods"}: + request.node.add_marker( + pytest.mark.xfail( + reason="Calamine support parsing datetime only in xlsx" ) ) @@ -1517,10 +1608,16 @@ def test_excel_passes_na_filter(self, read_ext, na_filter): 
tm.assert_frame_equal(parsed, expected) def test_excel_table_sheet_by_index(self, request, engine, read_ext, df_ref): - if engine in {"calamine", "pyxlsb"}: + if engine == "pyxlsb": + request.node.add_marker( + pytest.mark.xfail( + reason="Sheets containing datetimes not supported by pyxlsb" + ) + ) + if engine == "calamine" and read_ext in {".xls", ".xlsb", ".ods"}: request.node.add_marker( pytest.mark.xfail( - reason=f"Sheets containing datetimes not supported by {engine}" + reason="Calamine support parsing datetime only in xlsx" ) ) @@ -1546,10 +1643,16 @@ def test_excel_table_sheet_by_index(self, request, engine, read_ext, df_ref): tm.assert_frame_equal(df3, df1.iloc[:-1]) def test_sheet_name(self, request, engine, read_ext, df_ref): - if engine in {"calamine", "pyxlsb"}: + if engine == "pyxlsb": + request.node.add_marker( + pytest.mark.xfail( + reason="Sheets containing datetimes not supported by pyxlsb" + ) + ) + if engine == "calamine" and read_ext in {".xls", ".xlsb", ".ods"}: request.node.add_marker( pytest.mark.xfail( - reason=f"Sheets containing datetimes not supported by {engine}" + reason="Calamine support parsing datetime only in xlsx" ) ) @@ -1638,10 +1741,16 @@ def test_header_with_index_col(self, filename): def test_read_datetime_multiindex(self, request, engine, read_ext): # GH 34748 - if engine in {"calamine", "pyxlsb"}: + if engine == "pyxlsb": + request.node.add_marker( + pytest.mark.xfail( + reason="Sheets containing datetimes not supported by pyxlsb" + ) + ) + if engine == "calamine" and read_ext in {".xls", ".xlsb", ".ods"}: request.node.add_marker( pytest.mark.xfail( - reason=f"Sheets containing datetimes not supported by {engine}" + reason="Calamine support parsing datetime only in xlsx" ) ) @@ -1699,3 +1808,4 @@ def test_corrupt_files_closed(self, engine, read_ext): pd.ExcelFile(file, engine=engine) except errors: pass +ф \ No newline at end of file From 08a561639e2e63bb7e570066a84ddd15e6980ea2 Mon Sep 17 00:00:00 2001 From: Dmitriy 
<3132181+dimastbk@users.noreply.github.com> Date: Thu, 26 Jan 2023 13:23:19 +0600 Subject: [PATCH 11/18] Update test_readers.py --- pandas/tests/io/excel/test_readers.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index 4e408a72db7c6..cec4e636db400 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -1808,4 +1808,3 @@ def test_corrupt_files_closed(self, engine, read_ext): pd.ExcelFile(file, engine=engine) except errors: pass -ф \ No newline at end of file From d817999a4aad006f70864bf8a3c9d94458904964 Mon Sep 17 00:00:00 2001 From: Dmitriy Date: Fri, 27 Jan 2023 22:10:09 +0600 Subject: [PATCH 12/18] update python-calamine to 0.0.7 --- ci/deps/actions-38-minimum_versions.yaml | 2 +- pandas/compat/_optional.py | 2 +- pandas/io/excel/_calaminereader.py | 2 +- pandas/tests/io/excel/test_readers.py | 8 ++++---- pyproject.toml | 2 +- 5 files changed, 8 insertions(+), 8 deletions(-) diff --git a/ci/deps/actions-38-minimum_versions.yaml b/ci/deps/actions-38-minimum_versions.yaml index b41ea60600a9f..e44a3e0d70f4a 100644 --- a/ci/deps/actions-38-minimum_versions.yaml +++ b/ci/deps/actions-38-minimum_versions.yaml @@ -61,4 +61,4 @@ dependencies: - pip: - pyqt5==5.15.1 - - python-calamine==0.0.6 + - python-calamine==0.0.7 diff --git a/pandas/compat/_optional.py b/pandas/compat/_optional.py index b1b8dd370f4ec..16d5c17ea0331 100644 --- a/pandas/compat/_optional.py +++ b/pandas/compat/_optional.py @@ -34,7 +34,7 @@ "pyarrow": "6.0.0", "pyreadstat": "1.1.2", "pytest": "7.0.0", - "python-calamine": "0.0.6", + "python-calamine": "0.0.7", "pyxlsb": "1.0.8", "s3fs": "2021.08.0", "scipy": "1.7.1", diff --git a/pandas/io/excel/_calaminereader.py b/pandas/io/excel/_calaminereader.py index af7d2016648c6..a72bc03957a59 100644 --- a/pandas/io/excel/_calaminereader.py +++ b/pandas/io/excel/_calaminereader.py @@ -99,7 +99,7 @@ def _convert_cell(value: ValueT) -> 
Scalar: from python_calamine import get_sheet_data - rows = get_sheet_data(self.book, sheet) + rows = get_sheet_data(self.book, sheet, skip_empty_area=False) data: list[list[Scalar]] = [] for row in rows: diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index cec4e636db400..d5b52b062e034 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -901,9 +901,9 @@ def test_corrupt_bytes_raises(self, engine): "record; found b'foo'" ) elif engine == "calamine": - import python_calamine + from python_calamine import CalamineError - error = python_calamine._python_calamine.CalamineError + error = CalamineError msg = "Cannot detect file format" else: error = BadZipFile @@ -1797,9 +1797,9 @@ def test_corrupt_files_closed(self, engine, read_ext): errors = (BadZipFile, xlrd.biffh.XLRDError) elif engine == "calamine": - import python_calamine + from python_calamine import CalamineError - errors = (python_calamine._python_calamine.CalamineError,) + errors = (CalamineError,) with tm.ensure_clean(f"corrupt{read_ext}") as file: Path(file).write_text("corrupt") diff --git a/pyproject.toml b/pyproject.toml index 01c0a0cd9e9cf..5fada3523c781 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -104,7 +104,7 @@ all = ['beautifulsoup4>=4.9.3', 'pytest>=7.0.0', 'pytest-xdist>=2.2.0', 'pytest-asyncio>=0.17.0', - 'python-calamine>=0.0.6', + 'python-calamine>=0.0.7', 'python-snappy>=0.6.0', 'pyxlsb>=1.0.8', 'qtpy>=2.2.0', From 5d94728efeaa9ee137573fde0bf53f41933f8ae9 Mon Sep 17 00:00:00 2001 From: Dmitriy Date: Wed, 1 Feb 2023 02:12:33 +0600 Subject: [PATCH 13/18] fix review: use CalamineReader/CalamineSheet --- ci/deps/actions-38-minimum_versions.yaml | 2 +- pandas/compat/_optional.py | 2 +- pandas/io/excel/_calaminereader.py | 38 +++++++++--------------- pyproject.toml | 2 +- 4 files changed, 17 insertions(+), 27 deletions(-) diff --git a/ci/deps/actions-38-minimum_versions.yaml 
b/ci/deps/actions-38-minimum_versions.yaml index e44a3e0d70f4a..5373d08415fd6 100644 --- a/ci/deps/actions-38-minimum_versions.yaml +++ b/ci/deps/actions-38-minimum_versions.yaml @@ -61,4 +61,4 @@ dependencies: - pip: - pyqt5==5.15.1 - - python-calamine==0.0.7 + - python-calamine==0.0.8 diff --git a/pandas/compat/_optional.py b/pandas/compat/_optional.py index 16d5c17ea0331..707639498fa7a 100644 --- a/pandas/compat/_optional.py +++ b/pandas/compat/_optional.py @@ -34,7 +34,7 @@ "pyarrow": "6.0.0", "pyreadstat": "1.1.2", "pytest": "7.0.0", - "python-calamine": "0.0.7", + "python-calamine": "0.0.8", "pyxlsb": "1.0.8", "s3fs": "2021.08.0", "scipy": "1.7.1", diff --git a/pandas/io/excel/_calaminereader.py b/pandas/io/excel/_calaminereader.py index a72bc03957a59..493906cb515fe 100644 --- a/pandas/io/excel/_calaminereader.py +++ b/pandas/io/excel/_calaminereader.py @@ -27,12 +27,7 @@ ValueT = Union[int, float, str, bool, time, date, datetime] -class __calamine__: - pass - - class CalamineExcelReader(BaseExcelReader): - book: str _sheet_names: list[str] | None = None def __init__( @@ -44,10 +39,12 @@ def __init__( super().__init__(filepath_or_buffer, storage_options=storage_options) @property - def _workbook_class(self) -> type[__calamine__]: - return __calamine__ + def _workbook_class(self): + from python_calamine import CalamineReader + + return CalamineReader - def load_workbook(self, filepath_or_buffer) -> str: + def load_workbook(self, filepath_or_buffer): if hasattr(filepath_or_buffer, "read") and hasattr(filepath_or_buffer, "seek"): ext = inspect_excel_format(filepath_or_buffer) with NamedTemporaryFile(suffix=f".{ext}", delete=False) as tmp_file: @@ -59,29 +56,24 @@ def load_workbook(self, filepath_or_buffer) -> str: assert isinstance(filepath_or_buffer, str) - from python_calamine import get_sheet_names + from python_calamine import CalamineReader - self._sheet_names = get_sheet_names(filepath_or_buffer) - return filepath_or_buffer + return 
CalamineReader.from_path(filepath_or_buffer) @property def sheet_names(self) -> list[str]: - from python_calamine import get_sheet_names + return self.book.sheet_names - if self._sheet_names is None: - self._sheet_names = get_sheet_names(self.book) - return self._sheet_names - - def get_sheet_by_name(self, name: str) -> int: + def get_sheet_by_name(self, name: str): self.raise_if_bad_sheet_by_name(name) - return self.sheet_names.index(name) + return self.book.get_sheet_by_name(name) - def get_sheet_by_index(self, index: int) -> int: + def get_sheet_by_index(self, index: int): self.raise_if_bad_sheet_by_index(index) - return index + return self.book.get_sheet_by_index(index) def get_sheet_data( - self, sheet: int, file_rows_needed: int | None = None + self, sheet, file_rows_needed: int | None = None ) -> list[list[Scalar]]: def _convert_cell(value: ValueT) -> Scalar: if isinstance(value, float): @@ -97,9 +89,7 @@ def _convert_cell(value: ValueT) -> Scalar: return value - from python_calamine import get_sheet_data - - rows = get_sheet_data(self.book, sheet, skip_empty_area=False) + rows: list[list[ValueT]] = sheet.to_python(skip_empty_area=False) data: list[list[Scalar]] = [] for row in rows: diff --git a/pyproject.toml b/pyproject.toml index c272bdae69286..5f803eedfe04a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -104,7 +104,7 @@ all = ['beautifulsoup4>=4.9.3', 'pytest>=7.0.0', 'pytest-xdist>=2.2.0', 'pytest-asyncio>=0.17.0', - 'python-calamine>=0.0.7', + 'python-calamine>=0.0.8', 'python-snappy>=0.6.0', 'pyxlsb>=1.0.8', 'qtpy>=2.2.0', From 15874c33dded03b479b0d1e4b9f69a6539cdeb1b Mon Sep 17 00:00:00 2001 From: Dmitriy Date: Mon, 6 Feb 2023 11:28:44 +0600 Subject: [PATCH 14/18] fixed pyright, fixed docs in __init__ --- pandas/io/excel/_calaminereader.py | 26 +++++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git a/pandas/io/excel/_calaminereader.py b/pandas/io/excel/_calaminereader.py index 493906cb515fe..e5c183938c9a6 100644 
--- a/pandas/io/excel/_calaminereader.py +++ b/pandas/io/excel/_calaminereader.py @@ -6,7 +6,10 @@ time, ) from tempfile import NamedTemporaryFile -from typing import Union +from typing import ( + Union, + cast, +) from pandas._typing import ( FilePath, @@ -15,8 +18,10 @@ StorageOptions, ) from pandas.compat._optional import import_optional_dependency +from pandas.util._decorators import doc import pandas as pd +from pandas.core.shared_docs import _shared_docs from pandas.io.common import stringify_path from pandas.io.excel._base import ( @@ -30,11 +35,21 @@ class CalamineExcelReader(BaseExcelReader): _sheet_names: list[str] | None = None + @doc(storage_options=_shared_docs["storage_options"]) def __init__( self, filepath_or_buffer: FilePath | ReadBuffer[bytes], storage_options: StorageOptions = None, ) -> None: + """ + Reader using calamine engine (xlsx/xls/xlsb/ods). + + Parameters + ---------- + filepath_or_buffer : str, path to be parsed or + an open readable stream. + {storage_options} + """ import_optional_dependency("python_calamine") super().__init__(filepath_or_buffer, storage_options=storage_options) @@ -44,8 +59,9 @@ def _workbook_class(self): return CalamineReader - def load_workbook(self, filepath_or_buffer): + def load_workbook(self, filepath_or_buffer: FilePath | ReadBuffer[bytes]): if hasattr(filepath_or_buffer, "read") and hasattr(filepath_or_buffer, "seek"): + filepath_or_buffer = cast(ReadBuffer, filepath_or_buffer) ext = inspect_excel_format(filepath_or_buffer) with NamedTemporaryFile(suffix=f".{ext}", delete=False) as tmp_file: filepath_or_buffer.seek(0) @@ -62,15 +78,15 @@ def load_workbook(self, filepath_or_buffer): @property def sheet_names(self) -> list[str]: - return self.book.sheet_names + return self.book.sheet_names # pyright: ignore def get_sheet_by_name(self, name: str): self.raise_if_bad_sheet_by_name(name) - return self.book.get_sheet_by_name(name) + return self.book.get_sheet_by_name(name) # pyright: ignore def 
get_sheet_by_index(self, index: int): self.raise_if_bad_sheet_by_index(index) - return self.book.get_sheet_by_index(index) + return self.book.get_sheet_by_index(index) # pyright: ignore def get_sheet_data( self, sheet, file_rows_needed: int | None = None From a6b6fb2a256daa716dd9a175d2a4358040103ccc Mon Sep 17 00:00:00 2001 From: Dmitriy Date: Sun, 26 Mar 2023 19:18:14 +0600 Subject: [PATCH 15/18] bump python-calamine to 0.1.0 --- ci/deps/actions-310.yaml | 2 +- ci/deps/actions-311.yaml | 1 + ci/deps/actions-38-minimum_versions.yaml | 2 +- ci/deps/actions-38.yaml | 2 +- ci/deps/actions-39.yaml | 2 +- ci/deps/circle-38-arm64.yaml | 6 +-- doc/source/getting_started/install.rst | 1 + doc/source/user_guide/io.rst | 3 +- doc/source/whatsnew/v2.0.0.rst | 1 - doc/source/whatsnew/v2.1.0.rst | 1 + pandas/compat/_optional.py | 2 +- pandas/core/config_init.py | 10 ++-- pandas/io/excel/_base.py | 14 +++-- pandas/io/excel/_calaminereader.py | 51 +++++++------------ pyproject.toml | 4 +- scripts/tests/data/deps_expected_random.yaml | 3 ++ scripts/tests/data/deps_minimum.toml | 3 +- .../tests/data/deps_unmodified_random.yaml | 3 ++ 18 files changed, 54 insertions(+), 57 deletions(-) diff --git a/ci/deps/actions-310.yaml b/ci/deps/actions-310.yaml index eaf1266f10b45..4cad04e68f476 100644 --- a/ci/deps/actions-310.yaml +++ b/ci/deps/actions-310.yaml @@ -55,5 +55,5 @@ dependencies: - zstandard>=0.15.2 - pip: - - tzdata>=2022.1 - python-calamine + - tzdata>=2022.1 diff --git a/ci/deps/actions-311.yaml b/ci/deps/actions-311.yaml index fa08bdf438dff..761a14fcdd6e0 100644 --- a/ci/deps/actions-311.yaml +++ b/ci/deps/actions-311.yaml @@ -55,4 +55,5 @@ dependencies: - zstandard>=0.15.2 - pip: + - python-calamine>=0.1.0 - tzdata>=2022.1 diff --git a/ci/deps/actions-38-minimum_versions.yaml b/ci/deps/actions-38-minimum_versions.yaml index 6d4ddd1dda640..120232929c47b 100644 --- a/ci/deps/actions-38-minimum_versions.yaml +++ b/ci/deps/actions-38-minimum_versions.yaml @@ -59,5 +59,5 @@ 
dependencies: - pip: - pyqt5==5.15.1 - - python-calamine==0.0.8 + - python-calamine==0.1.0 - tzdata==2022.1 diff --git a/ci/deps/actions-38.yaml b/ci/deps/actions-38.yaml index b5df6861e6e2a..97cde7919028b 100644 --- a/ci/deps/actions-38.yaml +++ b/ci/deps/actions-38.yaml @@ -55,5 +55,5 @@ dependencies: - zstandard>=0.15.2 - pip: + - python-calamine - tzdata>=2022.1 - - python-calamine \ No newline at end of file diff --git a/ci/deps/actions-39.yaml b/ci/deps/actions-39.yaml index 33f82cb12fb07..149d0ee7672e3 100644 --- a/ci/deps/actions-39.yaml +++ b/ci/deps/actions-39.yaml @@ -55,5 +55,5 @@ dependencies: - zstandard>=0.15.2 - pip: + - python-calamine - tzdata>=2022.1 - - python-calamine \ No newline at end of file diff --git a/ci/deps/circle-38-arm64.yaml b/ci/deps/circle-38-arm64.yaml index bd7643b63a1bb..6d69fc56e9592 100644 --- a/ci/deps/circle-38-arm64.yaml +++ b/ci/deps/circle-38-arm64.yaml @@ -54,6 +54,6 @@ dependencies: - xlrd>=2.0.1 - xlsxwriter>=1.4.3 - zstandard>=0.15.2 - - - pip: - - python-calamine \ No newline at end of file + + - pip: + - python-calamine diff --git a/doc/source/getting_started/install.rst b/doc/source/getting_started/install.rst index f34676edd26dc..60275f5e9f819 100644 --- a/doc/source/getting_started/install.rst +++ b/doc/source/getting_started/install.rst @@ -345,6 +345,7 @@ xlrd 2.0.1 excel Reading Excel xlsxwriter 1.4.3 excel Writing Excel openpyxl 3.0.7 excel Reading / writing for xlsx files pyxlsb 1.0.8 excel Reading for xlsb files +python-calamine 0.1.0 excel Reading for xls/xlsx/xlsb/ods files ========================= ================== =============== ============================================================= HTML diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index c33d4ab92d4c6..254803bbc972b 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -3420,7 +3420,8 @@ Excel files The :func:`~pandas.read_excel` method can read Excel 2007+ (``.xlsx``) files using the 
``openpyxl`` Python module. Excel 2003 (``.xls``) files can be read using ``xlrd``. Binary Excel (``.xlsb``) -files can be read using ``pyxlsb``. +files can be read using ``pyxlsb``. Also, all this formats can be read using ``python-calamine``, +but this library has sime limitation, for example, can't detect date in most formats. The :meth:`~DataFrame.to_excel` instance method is used for saving a ``DataFrame`` to Excel. Generally the semantics are similar to working with :ref:`csv` data. diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 9e1f564e20bd1..769a3a3c306d5 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -275,7 +275,6 @@ Other enhancements - Improved error message in :func:`to_datetime` for non-ISO8601 formats, informing users about the position of the first error (:issue:`50361`) - Improved error message when trying to align :class:`DataFrame` objects (for example, in :func:`DataFrame.compare`) to clarify that "identically labelled" refers to both index and columns (:issue:`50083`) - Performance improvement in :func:`to_datetime` when format is given or can be inferred (:issue:`50465`) -- Added ``calamine`` as an engine to ``read_excel`` (:issue: ``50395``) - Added support for :meth:`Index.min` and :meth:`Index.max` for pyarrow string dtypes (:issue:`51397`) - Added :meth:`DatetimeIndex.as_unit` and :meth:`TimedeltaIndex.as_unit` to convert to different resolutions; supported resolutions are "s", "ms", "us", and "ns" (:issue:`50616`) - Added :meth:`Series.dt.unit` and :meth:`Series.dt.as_unit` to convert to different resolutions; supported resolutions are "s", "ms", "us", and "ns" (:issue:`51223`) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index 1f5c3c88c5ff5..e9a169b19dd70 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -37,6 +37,7 @@ Other enhancements - Improve error message when having incompatible columns using 
:meth:`DataFrame.merge` (:issue:`51861`) - Improved error message when creating a DataFrame with empty data (0 rows), no index and an incorrect number of columns. (:issue:`52084`) - :meth:`arrays.SparseArray.map` now supports ``na_action`` (:issue:`52096`). +- Added ``calamine`` as an engine to ``read_excel`` (:issue:`50395`) .. --------------------------------------------------------------------------- .. _whatsnew_210.notable_bug_fixes: diff --git a/pandas/compat/_optional.py b/pandas/compat/_optional.py index 3ebf7d0c22d51..6a5af656ac063 100644 --- a/pandas/compat/_optional.py +++ b/pandas/compat/_optional.py @@ -37,7 +37,7 @@ "pyarrow": "7.0.0", "pyreadstat": "1.1.2", "pytest": "7.0.0", - "python-calamine": "0.0.8", + "python-calamine": "0.1.0", "pyxlsb": "1.0.8", "s3fs": "2021.08.0", "scipy": "1.7.1", diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py index 54d1497ad05f3..10308ee5cb309 100644 --- a/pandas/core/config_init.py +++ b/pandas/core/config_init.py @@ -503,11 +503,11 @@ def use_inf_as_na_cb(key) -> None: auto, {others}. """ -_xls_options = ["xlrd"] -_xlsm_options = ["xlrd", "openpyxl"] -_xlsx_options = ["xlrd", "openpyxl"] -_ods_options = ["odf"] -_xlsb_options = ["pyxlsb"] +_xls_options = ["xlrd", "calamine"] +_xlsm_options = ["xlrd", "openpyxl", "calamine"] +_xlsx_options = ["xlrd", "openpyxl", "calamine"] +_ods_options = ["odf", "calamine"] +_xlsb_options = ["pyxlsb", "calamine"] with cf.config_prefix("io.excel.xls"): diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 1dff27770618a..c168abedaf971 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -149,13 +149,15 @@ of dtype conversion. engine : str, default None If io is not a buffer or path, this must be set to identify io. - Supported engines: "xlrd", "openpyxl", "odf", "pyxlsb". + Supported engines: "xlrd", "openpyxl", "odf", "pyxlsb", "calamine". Engine compatibility : - "xlrd" supports old-style Excel files (.xls). 
- "openpyxl" supports newer Excel file formats. - "odf" supports OpenDocument file formats (.odf, .ods, .odt). - "pyxlsb" supports Binary Excel files. + - "calamine" supports Excel (.xls, .xlsx, .xlsm, .xlsb) + and OpenDocument (.ods) file formats. .. versionchanged:: 1.2.0 The engine `xlrd `_ @@ -375,7 +377,7 @@ def read_excel( | Callable[[str], bool] | None = ..., dtype: DtypeArg | None = ..., - engine: Literal["xlrd", "openpyxl", "odf", "pyxlsb"] | None = ..., + engine: Literal["xlrd", "openpyxl", "odf", "pyxlsb", "calamine"] | None = ..., converters: dict[str, Callable] | dict[int, Callable] | None = ..., true_values: Iterable[Hashable] | None = ..., false_values: Iterable[Hashable] | None = ..., @@ -414,7 +416,7 @@ def read_excel( | Callable[[str], bool] | None = ..., dtype: DtypeArg | None = ..., - engine: Literal["xlrd", "openpyxl", "odf", "pyxlsb"] | None = ..., + engine: Literal["xlrd", "openpyxl", "odf", "pyxlsb", "calamine"] | None = ..., converters: dict[str, Callable] | dict[int, Callable] | None = ..., true_values: Iterable[Hashable] | None = ..., false_values: Iterable[Hashable] | None = ..., @@ -453,7 +455,7 @@ def read_excel( | Callable[[str], bool] | None = None, dtype: DtypeArg | None = None, - engine: Literal["xlrd", "openpyxl", "odf", "pyxlsb"] | None = None, + engine: Literal["xlrd", "openpyxl", "odf", "pyxlsb", "calamine"] | None = None, converters: dict[str, Callable] | dict[int, Callable] | None = None, true_values: Iterable[Hashable] | None = None, false_values: Iterable[Hashable] | None = None, @@ -1418,13 +1420,15 @@ class ExcelFile: .xls, .xlsx, .xlsb, .xlsm, .odf, .ods, or .odt file. engine : str, default None If io is not a buffer or path, this must be set to identify io. - Supported engines: ``xlrd``, ``openpyxl``, ``odf``, ``pyxlsb`` + Supported engines: ``xlrd``, ``openpyxl``, ``odf``, ``pyxlsb``, ``calamine`` Engine compatibility : - ``xlrd`` supports old-style Excel files (.xls). - ``openpyxl`` supports newer Excel file formats. 
- ``odf`` supports OpenDocument file formats (.odf, .ods, .odt). - ``pyxlsb`` supports Binary Excel files. + - ``calamine`` supports Excel (.xls, .xlsx, .xlsm, .xlsb) + and OpenDocument (.ods) file formats. .. versionchanged:: 1.2.0 diff --git a/pandas/io/excel/_calaminereader.py b/pandas/io/excel/_calaminereader.py index e5c183938c9a6..6b6865c54372c 100644 --- a/pandas/io/excel/_calaminereader.py +++ b/pandas/io/excel/_calaminereader.py @@ -5,36 +5,31 @@ datetime, time, ) -from tempfile import NamedTemporaryFile from typing import ( + TYPE_CHECKING, Union, - cast, ) -from pandas._typing import ( - FilePath, - ReadBuffer, - Scalar, - StorageOptions, -) from pandas.compat._optional import import_optional_dependency from pandas.util._decorators import doc import pandas as pd from pandas.core.shared_docs import _shared_docs -from pandas.io.common import stringify_path -from pandas.io.excel._base import ( - BaseExcelReader, - inspect_excel_format, -) +from pandas.io.excel._base import BaseExcelReader -ValueT = Union[int, float, str, bool, time, date, datetime] +if TYPE_CHECKING: + from pandas._typing import ( + FilePath, + ReadBuffer, + Scalar, + StorageOptions, + ) +_ValueT = Union[int, float, str, bool, time, date, datetime] -class CalamineExcelReader(BaseExcelReader): - _sheet_names: list[str] | None = None +class CalamineExcelReader(BaseExcelReader): @doc(storage_options=_shared_docs["storage_options"]) def __init__( self, @@ -55,26 +50,14 @@ def __init__( @property def _workbook_class(self): - from python_calamine import CalamineReader + from python_calamine import CalamineWorkbook - return CalamineReader + return CalamineWorkbook def load_workbook(self, filepath_or_buffer: FilePath | ReadBuffer[bytes]): - if hasattr(filepath_or_buffer, "read") and hasattr(filepath_or_buffer, "seek"): - filepath_or_buffer = cast(ReadBuffer, filepath_or_buffer) - ext = inspect_excel_format(filepath_or_buffer) - with NamedTemporaryFile(suffix=f".{ext}", delete=False) as tmp_file: - 
filepath_or_buffer.seek(0) - tmp_file.write(filepath_or_buffer.read()) - filepath_or_buffer = tmp_file.name - else: - filepath_or_buffer = stringify_path(filepath_or_buffer) - - assert isinstance(filepath_or_buffer, str) - - from python_calamine import CalamineReader + from python_calamine import load_workbook - return CalamineReader.from_path(filepath_or_buffer) + return load_workbook(filepath_or_buffer) # type: ignore[arg-type] @property def sheet_names(self) -> list[str]: @@ -91,7 +74,7 @@ def get_sheet_by_index(self, index: int): def get_sheet_data( self, sheet, file_rows_needed: int | None = None ) -> list[list[Scalar]]: - def _convert_cell(value: ValueT) -> Scalar: + def _convert_cell(value: _ValueT) -> Scalar: if isinstance(value, float): val = int(value) if val == value: @@ -105,7 +88,7 @@ def _convert_cell(value: ValueT) -> Scalar: return value - rows: list[list[ValueT]] = sheet.to_python(skip_empty_area=False) + rows: list[list[_ValueT]] = sheet.to_python(skip_empty_area=False) data: list[list[Scalar]] = [] for row in rows: diff --git a/pyproject.toml b/pyproject.toml index 315d5d40bc565..0d42b98b03655 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -62,7 +62,7 @@ computation = ['scipy>=1.7.1', 'xarray>=0.21.0'] fss = ['fsspec>=2021.07.0'] aws = ['s3fs>=2021.08.0'] gcp = ['gcsfs>=2021.07.0', 'pandas-gbq>=0.15.0'] -excel = ['odfpy>=1.4.1', 'openpyxl>=3.0.7', 'pyxlsb>=1.0.8', 'xlrd>=2.0.1', 'xlsxwriter>=1.4.3'] +excel = ['odfpy>=1.4.1', 'openpyxl>=3.0.7', 'python-calamine>=0.1.0', 'pyxlsb>=1.0.8', 'xlrd>=2.0.1', 'xlsxwriter>=1.4.3'] parquet = ['pyarrow>=7.0.0'] feather = ['pyarrow>=7.0.0'] hdf5 = [# blosc only available on conda (https://github.com/Blosc/python-blosc/issues/297) @@ -104,7 +104,7 @@ all = ['beautifulsoup4>=4.9.3', 'pytest>=7.0.0', 'pytest-xdist>=2.2.0', 'pytest-asyncio>=0.17.0', - 'python-calamine>=0.0.8', + 'python-calamine>=0.1.0', 'python-snappy>=0.6.0', 'pyxlsb>=1.0.8', 'qtpy>=2.2.0', diff --git 
a/scripts/tests/data/deps_expected_random.yaml b/scripts/tests/data/deps_expected_random.yaml index be5e467b57e10..f4140e40a5f1e 100644 --- a/scripts/tests/data/deps_expected_random.yaml +++ b/scripts/tests/data/deps_expected_random.yaml @@ -55,3 +55,6 @@ dependencies: - xlrd>=2.0.1 - xlsxwriter>=1.4.3 - zstandard>=0.15.2 + + - pip: + - python-calamine>=0.1.0 diff --git a/scripts/tests/data/deps_minimum.toml b/scripts/tests/data/deps_minimum.toml index 97a5ce1180bfb..04f31a84a5441 100644 --- a/scripts/tests/data/deps_minimum.toml +++ b/scripts/tests/data/deps_minimum.toml @@ -62,7 +62,7 @@ computation = ['scipy>=1.7.1', 'xarray>=0.21.0'] fss = ['fsspec>=2021.07.0'] aws = ['s3fs>=2021.08.0'] gcp = ['gcsfs>=2021.07.0', 'pandas-gbq>=0.15.0'] -excel = ['odfpy>=1.4.1', 'openpyxl>=3.0.7', 'pyxlsb>=1.0.8', 'xlrd>=2.0.1', 'xlsxwriter>=1.4.3'] +excel = ['odfpy>=1.4.1', 'openpyxl>=3.0.7', 'python-calamine>=0.1.0', 'pyxlsb>=1.0.8', 'xlrd>=2.0.1', 'xlsxwriter>=1.4.3'] parquet = ['pyarrow>=7.0.0'] feather = ['pyarrow>=7.0.0'] hdf5 = [# blosc only available on conda (https://github.com/Blosc/python-blosc/issues/297) @@ -104,6 +104,7 @@ all = ['beautifulsoup4>=5.9.3', 'pytest>=7.0.0', 'pytest-xdist>=2.2.0', 'pytest-asyncio>=0.17.0', + 'python-calamine>=0.1.0', 'python-snappy>=0.6.0', 'pyxlsb>=1.0.8', 'qtpy>=2.2.0', diff --git a/scripts/tests/data/deps_unmodified_random.yaml b/scripts/tests/data/deps_unmodified_random.yaml index 4ca758af1c8ad..40a726ed49af6 100644 --- a/scripts/tests/data/deps_unmodified_random.yaml +++ b/scripts/tests/data/deps_unmodified_random.yaml @@ -55,3 +55,6 @@ dependencies: - xlrd>=2.0.1 - xlsxwriter>=1.4.3 - zstandard>=0.15.2 + + - pip: + - python-calamine>=0.1.0 From 0a431c5a1d3357c8b163ceab2cc25317e040f32f Mon Sep 17 00:00:00 2001 From: Dmitriy Date: Wed, 29 Mar 2023 10:37:16 +0600 Subject: [PATCH 16/18] _ValueT -> _CellValueT --- pandas/io/excel/_calaminereader.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git 
a/pandas/io/excel/_calaminereader.py b/pandas/io/excel/_calaminereader.py index 6b6865c54372c..27703f76d669a 100644 --- a/pandas/io/excel/_calaminereader.py +++ b/pandas/io/excel/_calaminereader.py @@ -26,7 +26,7 @@ StorageOptions, ) -_ValueT = Union[int, float, str, bool, time, date, datetime] +_CellValueT = Union[int, float, str, bool, time, date, datetime] class CalamineExcelReader(BaseExcelReader): @@ -74,7 +74,7 @@ def get_sheet_by_index(self, index: int): def get_sheet_data( self, sheet, file_rows_needed: int | None = None ) -> list[list[Scalar]]: - def _convert_cell(value: _ValueT) -> Scalar: + def _convert_cell(value: _CellValueT) -> Scalar: if isinstance(value, float): val = int(value) if val == value: @@ -88,7 +88,7 @@ def _convert_cell(value: _ValueT) -> Scalar: return value - rows: list[list[_ValueT]] = sheet.to_python(skip_empty_area=False) + rows: list[list[_CellValueT]] = sheet.to_python(skip_empty_area=False) data: list[list[Scalar]] = [] for row in rows: From 2f5ffbaaded52c1862d3eef160e37ee27ea0921d Mon Sep 17 00:00:00 2001 From: Dmitriy Date: Tue, 4 Apr 2023 01:20:12 +0600 Subject: [PATCH 17/18] added xfail to tests, small fixes --- doc/source/whatsnew/v2.0.0.rst | 1 - doc/source/whatsnew/v2.1.0.rst | 2 +- pandas/io/excel/_base.py | 4 +- .../{_calaminereader.py => _calamine.py} | 2 +- pandas/tests/io/excel/test_readers.py | 80 +++++++++++++++---- 5 files changed, 70 insertions(+), 19 deletions(-) rename pandas/io/excel/{_calaminereader.py => _calamine.py} (98%) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 8406c6b567430..193f837400ac0 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -279,7 +279,6 @@ Other enhancements - :meth:`Series.dropna` and :meth:`DataFrame.dropna` has gained ``ignore_index`` keyword to reset index (:issue:`31725`) - Improved error message in :func:`to_datetime` for non-ISO8601 formats, informing users about the position of the first error 
(:issue:`50361`) - Improved error message when trying to align :class:`DataFrame` objects (for example, in :func:`DataFrame.compare`) to clarify that "identically labelled" refers to both index and columns (:issue:`50083`) -- Performance improvement in :func:`to_datetime` when format is given or can be inferred (:issue:`50465`) - Added support for :meth:`Index.min` and :meth:`Index.max` for pyarrow string dtypes (:issue:`51397`) - Added :meth:`DatetimeIndex.as_unit` and :meth:`TimedeltaIndex.as_unit` to convert to different resolutions; supported resolutions are "s", "ms", "us", and "ns" (:issue:`50616`) - Added :meth:`Series.dt.unit` and :meth:`Series.dt.as_unit` to convert to different resolutions; supported resolutions are "s", "ms", "us", and "ns" (:issue:`51223`) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index 30d3445663b2d..2d06faaa0e36a 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -42,8 +42,8 @@ Other enhancements - Improved error message when creating a DataFrame with empty data (0 rows), no index and an incorrect number of columns. (:issue:`52084`) - :meth:`DataFrame.applymap` now uses the :meth:`~api.extensions.ExtensionArray.map` method of underlying :class:`api.extensions.ExtensionArray` instances (:issue:`52219`) - :meth:`arrays.SparseArray.map` now supports ``na_action`` (:issue:`52096`). -- Added ``calamine`` as an engine to ``read_excel`` (:issue:`50395`) - :meth:`Categorical.map` and :meth:`CategoricalIndex.map` now have a ``na_action`` parameter (:issue:`44279`) +- Added ``calamine`` as an engine to ``read_excel`` (:issue:`50395`) - Add dtype of categories to ``repr`` information of :class:`CategoricalDtype` (:issue:`52179`) .. 
--------------------------------------------------------------------------- diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index c36edcaf493ed..fb291d53f1556 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -1456,7 +1456,7 @@ class ExcelFile: This is not supported, switch to using ``openpyxl`` instead. """ - from pandas.io.excel._calaminereader import CalamineExcelReader + from pandas.io.excel._calamine import CalamineReader from pandas.io.excel._odfreader import ODFReader from pandas.io.excel._openpyxl import OpenpyxlReader from pandas.io.excel._pyxlsb import PyxlsbReader @@ -1467,7 +1467,7 @@ class ExcelFile: "openpyxl": OpenpyxlReader, "odf": ODFReader, "pyxlsb": PyxlsbReader, - "calamine": CalamineExcelReader, + "calamine": CalamineReader, } def __init__( diff --git a/pandas/io/excel/_calaminereader.py b/pandas/io/excel/_calamine.py similarity index 98% rename from pandas/io/excel/_calaminereader.py rename to pandas/io/excel/_calamine.py index 27703f76d669a..c71c0c62ff682 100644 --- a/pandas/io/excel/_calaminereader.py +++ b/pandas/io/excel/_calamine.py @@ -29,7 +29,7 @@ _CellValueT = Union[int, float, str, bool, time, date, datetime] -class CalamineExcelReader(BaseExcelReader): +class CalamineReader(BaseExcelReader): @doc(storage_options=_shared_docs["storage_options"]) def __init__( self, diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index 4c171c43843ef..76df9c21424e8 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -451,6 +451,10 @@ def test_reader_special_dtypes(self, request, engine, read_ext): reason="Calamine support parsing datetime only in xlsx" ) ) + if engine == "calamine": + request.node.add_marker( + pytest.mark.xfail(reason="Calamine can't parse this datetime format") + ) expected = DataFrame.from_dict( { @@ -584,11 +588,16 @@ def test_reader_dtype_str(self, read_ext, dtype, expected): actual = 
pd.read_excel(basename + read_ext, dtype=dtype) tm.assert_frame_equal(actual, expected) - def test_dtype_backend(self, read_ext, dtype_backend): + def test_dtype_backend(self, request, engine, read_ext, dtype_backend): # GH#36712 if read_ext in (".xlsb", ".xls"): pytest.skip(f"No engine for filetype: '{read_ext}'") + if engine == "calamine" and read_ext == ".ods": + request.node.add_marker( + pytest.mark.xfail(reason="Calamine doesn't support invalid ods") + ) + df = DataFrame( { "a": Series([1, 3], dtype="Int64"), @@ -629,11 +638,16 @@ def test_dtype_backend(self, read_ext, dtype_backend): expected = df tm.assert_frame_equal(result, expected) - def test_dtype_backend_and_dtype(self, read_ext): + def test_dtype_backend_and_dtype(self, request, engine, read_ext): # GH#36712 if read_ext in (".xlsb", ".xls"): pytest.skip(f"No engine for filetype: '{read_ext}'") + if engine == "calamine" and read_ext == ".ods": + request.node.add_marker( + pytest.mark.xfail(reason="Calamine doesn't support invalid ods") + ) + df = DataFrame({"a": [np.nan, 1.0], "b": [2.5, np.nan]}) with tm.ensure_clean(read_ext) as file_path: df.to_excel(file_path, "test", index=False) @@ -646,11 +660,16 @@ def test_dtype_backend_and_dtype(self, read_ext): tm.assert_frame_equal(result, df) @td.skip_if_no("pyarrow") - def test_dtype_backend_string(self, read_ext, string_storage): + def test_dtype_backend_string(self, request, engine, read_ext, string_storage): # GH#36712 if read_ext in (".xlsb", ".xls"): pytest.skip(f"No engine for filetype: '{read_ext}'") + if engine == "calamine" and read_ext == ".ods": + request.node.add_marker( + pytest.mark.xfail(reason="Calamine doesn't support invalid ods") + ) + import pyarrow as pa with pd.option_context("mode.string_storage", string_storage): @@ -694,8 +713,15 @@ def test_dtype_mangle_dup_cols(self, read_ext, dtypes, exp_value): assert dtype_dict == dtype_dict_copy, "dtype dict changed" tm.assert_frame_equal(result, expected) - def test_reader_spaces(self, 
read_ext): + def test_reader_spaces(self, request, engine, read_ext): # see gh-32207 + + # https://github.com/tafia/calamine/pull/289 + if engine == "calamine" and read_ext == ".ods": + request.node.add_marker( + pytest.mark.xfail(reason="Calamine doesn't respect spaces in ods") + ) + basename = "test_spaces" actual = pd.read_excel(basename + read_ext) @@ -790,12 +816,6 @@ def test_date_conversion_overflow(self, request, engine, read_ext): reason="Sheets containing datetimes not supported by pyxlsb" ) ) - if engine == "calamine" and read_ext in {".xls", ".xlsb", ".ods"}: - request.node.add_marker( - pytest.mark.xfail( - reason="Calamine support parsing datetime only in xlsx" - ) - ) expected = DataFrame( [ @@ -806,6 +826,11 @@ def test_date_conversion_overflow(self, request, engine, read_ext): columns=["DateColWithBigInt", "StringCol"], ) + if engine == "calamine": + request.node.add_marker( + pytest.mark.xfail(reason="Maybe not supported by calamine") + ) + if engine == "openpyxl": request.node.add_marker( pytest.mark.xfail(reason="Maybe not supported by openpyxl") @@ -1008,6 +1033,12 @@ def test_reader_seconds(self, request, engine, read_ext): reason="Calamine support parsing datetime only in xlsx" ) ) + if engine == "calamine": + request.node.add_marker( + pytest.mark.xfail( + reason="Calamine doesn't support parsing milliseconds in datetime" + ) + ) # Test reading times with and without milliseconds. GH5945. expected = DataFrame.from_dict( @@ -1174,10 +1205,17 @@ def test_read_excel_multiindex_blank_after_name( ) tm.assert_frame_equal(result, expected) - def test_read_excel_multiindex_header_only(self, read_ext): + def test_read_excel_multiindex_header_only(self, request, engine, read_ext): # see gh-11733. # # Don't try to parse a header name if there isn't one. 
+ if engine == "calamine" and read_ext == ".ods": + request.node.add_marker( + pytest.mark.xfail( + reason="Calamine doesn't support 'number-rows-repeated' in ods" + ) + ) + mi_file = "testmultiindex" + read_ext result = pd.read_excel(mi_file, sheet_name="index_col_none", header=[0, 1]) @@ -1418,8 +1456,15 @@ def test_deprecated_kwargs(self, read_ext): with pytest.raises(TypeError, match="but 3 positional arguments"): pd.read_excel("test1" + read_ext, "Sheet1", 0) - def test_no_header_with_list_index_col(self, read_ext): + def test_no_header_with_list_index_col(self, request, engine, read_ext): # GH 31783 + if engine == "calamine" and read_ext == ".ods": + request.node.add_marker( + pytest.mark.xfail( + reason="Calamine doesn't support 'number-rows-repeated' in ods" + ) + ) + file_name = "testmultiindex" + read_ext data = [("B", "B"), ("key", "val"), (3, 4), (3, 4)] idx = MultiIndex.from_tuples( @@ -1439,8 +1484,15 @@ def test_one_col_noskip_blank_line(self, read_ext): result = pd.read_excel(file_name) tm.assert_frame_equal(result, expected) - def test_multiheader_two_blank_lines(self, read_ext): + def test_multiheader_two_blank_lines(self, request, engine, read_ext): # GH 40442 + if engine == "calamine" and read_ext == ".ods": + request.node.add_marker( + pytest.mark.xfail( + reason="Calamine doesn't support 'number-rows-repeated' in ods" + ) + ) + file_name = "testmultiindex" + read_ext columns = MultiIndex.from_tuples([("a", "A"), ("b", "B")]) data = [[np.nan, np.nan], [np.nan, np.nan], [1, 3], [2, 4]] @@ -1703,7 +1755,7 @@ def test_excel_read_binary(self, engine, read_ext): def test_excel_read_binary_via_read_excel(self, read_ext, engine): # GH 38424 with open("test1" + read_ext, "rb") as f: - result = pd.read_excel(f) + result = pd.read_excel(f, engine=engine) expected = pd.read_excel("test1" + read_ext, engine=engine) tm.assert_frame_equal(result, expected) From 02c2e7f94c111cbeb7d289208e3665372ae3f071 Mon Sep 17 00:00:00 2001 From: Dmitriy Date: Mon, 1 May 
2023 13:47:02 +0600 Subject: [PATCH 18/18] bump calamine to 0.1.1, update tests (472 passed, 75 xfailed), update docs --- ci/deps/actions-311.yaml | 2 +- ci/deps/actions-38-minimum_versions.yaml | 2 +- doc/source/getting_started/install.rst | 2 +- doc/source/user_guide/io.rst | 3 +- pandas/compat/_optional.py | 2 +- pandas/tests/io/excel/test_readers.py | 87 +++++++++---------- pyproject.toml | 4 +- scripts/tests/data/deps_expected_random.yaml | 2 +- scripts/tests/data/deps_minimum.toml | 4 +- .../tests/data/deps_unmodified_random.yaml | 2 +- 10 files changed, 53 insertions(+), 57 deletions(-) diff --git a/ci/deps/actions-311.yaml b/ci/deps/actions-311.yaml index d3ad1f0622466..9280aab30ff12 100644 --- a/ci/deps/actions-311.yaml +++ b/ci/deps/actions-311.yaml @@ -55,5 +55,5 @@ dependencies: - zstandard>=0.15.2 - pip: - - python-calamine>=0.1.0 + - python-calamine>=0.1.1 - tzdata>=2022.1 diff --git a/ci/deps/actions-38-minimum_versions.yaml b/ci/deps/actions-38-minimum_versions.yaml index 120232929c47b..5f13906b3535f 100644 --- a/ci/deps/actions-38-minimum_versions.yaml +++ b/ci/deps/actions-38-minimum_versions.yaml @@ -59,5 +59,5 @@ dependencies: - pip: - pyqt5==5.15.1 - - python-calamine==0.1.0 + - python-calamine==0.1.1 - tzdata==2022.1 diff --git a/doc/source/getting_started/install.rst b/doc/source/getting_started/install.rst index 60275f5e9f819..ccdee39c4eb95 100644 --- a/doc/source/getting_started/install.rst +++ b/doc/source/getting_started/install.rst @@ -345,7 +345,7 @@ xlrd 2.0.1 excel Reading Excel xlsxwriter 1.4.3 excel Writing Excel openpyxl 3.0.7 excel Reading / writing for xlsx files pyxlsb 1.0.8 excel Reading for xlsb files -python-calamine 0.1.0 excel Reading for xls/xlsx/xlsb/ods files +python-calamine 0.1.1 excel Reading for xls/xlsx/xlsb/ods files ========================= ================== =============== ============================================================= HTML diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst 
index 254803bbc972b..985b13faa8cd7 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -3421,7 +3421,8 @@ The :func:`~pandas.read_excel` method can read Excel 2007+ (``.xlsx``) files using the ``openpyxl`` Python module. Excel 2003 (``.xls``) files can be read using ``xlrd``. Binary Excel (``.xlsb``) files can be read using ``pyxlsb``. Also, all this formats can be read using ``python-calamine``, -but this library has sime limitation, for example, can't detect date in most formats. +but this library has some limitations and behaves differently from other libraries; +for example, it cannot detect dates in some formats (xls and xlsb). The :meth:`~DataFrame.to_excel` instance method is used for saving a ``DataFrame`` to Excel. Generally the semantics are similar to working with :ref:`csv` data. diff --git a/pandas/compat/_optional.py b/pandas/compat/_optional.py index 6a5af656ac063..6a848e6ccd22c 100644 --- a/pandas/compat/_optional.py +++ b/pandas/compat/_optional.py @@ -37,7 +37,7 @@ "pyarrow": "7.0.0", "pyreadstat": "1.1.2", "pytest": "7.0.0", - "python-calamine": "0.1.0", + "python-calamine": "0.1.1", "pyxlsb": "1.0.8", "s3fs": "2021.08.0", "scipy": "1.7.1", diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index 76df9c21424e8..a5507040b107a 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -174,10 +174,10 @@ def test_usecols_list(self, request, engine, read_ext, df_ref): reason="Sheets containing datetimes not supported by pyxlsb" ) ) - if engine == "calamine" and read_ext in {".xls", ".xlsb", ".ods"}: + if engine == "calamine" and read_ext in {".xls", ".xlsb"}: request.node.add_marker( pytest.mark.xfail( - reason="Calamine support parsing datetime only in xlsx" + reason="Calamine support parsing datetime only in xlsx/ods" ) ) @@ -204,10 +204,10 @@ def test_usecols_str(self, request, engine, read_ext, df_ref): reason="Sheets containing datetimes not
supported by pyxlsb" ) ) - if engine == "calamine" and read_ext in {".xls", ".xlsb", ".ods"}: + if engine == "calamine" and read_ext in {".xls", ".xlsb"}: request.node.add_marker( pytest.mark.xfail( - reason="Calamine support parsing datetime only in xlsx" + reason="Calamine support parsing datetime only in xlsx/ods" ) ) @@ -268,10 +268,10 @@ def test_usecols_diff_positional_int_columns_order( reason="Sheets containing datetimes not supported by pyxlsb" ) ) - if engine == "calamine" and read_ext in {".xls", ".xlsb", ".ods"}: + if engine == "calamine" and read_ext in {".xls", ".xlsb"}: request.node.add_marker( pytest.mark.xfail( - reason="Calamine support parsing datetime only in xlsx" + reason="Calamine support parsing datetime only in xlsx/ods" ) ) @@ -296,10 +296,10 @@ def test_read_excel_without_slicing(self, request, engine, read_ext, df_ref): reason="Sheets containing datetimes not supported by pyxlsb" ) ) - if engine == "calamine" and read_ext in {".xls", ".xlsb", ".ods"}: + if engine == "calamine" and read_ext in {".xls", ".xlsb"}: request.node.add_marker( pytest.mark.xfail( - reason="Calamine support parsing datetime only in xlsx" + reason="Calamine support parsing datetime only in xlsx/ods" ) ) @@ -314,10 +314,10 @@ def test_usecols_excel_range_str(self, request, engine, read_ext, df_ref): reason="Sheets containing datetimes not supported by pyxlsb" ) ) - if engine == "calamine" and read_ext in {".xls", ".xlsb", ".ods"}: + if engine == "calamine" and read_ext in {".xls", ".xlsb"}: request.node.add_marker( pytest.mark.xfail( - reason="Calamine support parsing datetime only in xlsx" + reason="Calamine support parsing datetime only in xlsx/ods" ) ) @@ -400,11 +400,9 @@ def test_excel_cell_error_na(self, request, engine, read_ext): reason="Sheets containing datetimes not supported by pyxlsb" ) ) - if engine == "calamine" and read_ext in {".ods"}: + if engine == "calamine" and read_ext == ".ods": request.node.add_marker( - pytest.mark.xfail( - reason="Calamine 
support parsing datetime only in xlsx" - ) + pytest.mark.xfail(reason="Calamine returns 0 instead of NaN in ods") ) parsed = pd.read_excel("test3" + read_ext, sheet_name="Sheet1") @@ -418,10 +416,10 @@ def test_excel_table(self, request, engine, read_ext, df_ref): reason="Sheets containing datetimes not supported by pyxlsb" ) ) - if engine == "calamine" and read_ext in {".xls", ".xlsb", ".ods"}: + if engine == "calamine" and read_ext in {".xls", ".xlsb"}: request.node.add_marker( pytest.mark.xfail( - reason="Calamine support parsing datetime only in xlsx" + reason="Calamine support parsing datetime only in xlsx/ods" ) ) @@ -445,15 +443,15 @@ def test_reader_special_dtypes(self, request, engine, read_ext): reason="Sheets containing datetimes not supported by pyxlsb" ) ) - if engine == "calamine" and read_ext in {".xls", ".xlsb", ".ods"}: + if engine == "calamine" and read_ext in {".xls", ".xlsb"}: request.node.add_marker( pytest.mark.xfail( - reason="Calamine support parsing datetime only in xlsx" + reason="Calamine support parsing datetime only in xlsx/ods" ) ) - if engine == "calamine": + if engine == "calamine" and read_ext != ".ods": request.node.add_marker( - pytest.mark.xfail(reason="Calamine can't parse this datetime format") + pytest.mark.xfail(reason="Maybe not supported by calamine") ) expected = DataFrame.from_dict( @@ -715,13 +713,6 @@ def test_dtype_mangle_dup_cols(self, read_ext, dtypes, exp_value): def test_reader_spaces(self, request, engine, read_ext): # see gh-32207 - - # https://github.com/tafia/calamine/pull/289 - if engine == "calamine" and read_ext == ".ods": - request.node.add_marker( - pytest.mark.xfail(reason="Calamine doesn't respect spaces in ods") - ) - basename = "test_spaces" actual = pd.read_excel(basename + read_ext) @@ -826,7 +817,7 @@ def test_date_conversion_overflow(self, request, engine, read_ext): columns=["DateColWithBigInt", "StringCol"], ) - if engine == "calamine": + if engine == "calamine" and read_ext != ".ods": 
request.node.add_marker( pytest.mark.xfail(reason="Maybe not supported by calamine") ) @@ -852,10 +843,10 @@ def test_sheet_name(self, request, read_ext, engine, df_ref): reason="Sheets containing datetimes not supported by pyxlsb" ) ) - if engine == "calamine" and read_ext in {".xls", ".xlsb", ".ods"}: + if engine == "calamine" and read_ext in {".xls", ".xlsb"}: request.node.add_marker( pytest.mark.xfail( - reason="Calamine support parsing datetime only in xlsx" + reason="Calamine support parsing datetime only in xlsx/ods" ) ) @@ -1027,10 +1018,10 @@ def test_reader_seconds(self, request, engine, read_ext): reason="Sheets containing datetimes not supported by pyxlsb" ) ) - if engine == "calamine" and read_ext in {".xls", ".xlsb", ".ods"}: + if engine == "calamine" and read_ext in {".xls", ".xlsb"}: request.node.add_marker( pytest.mark.xfail( - reason="Calamine support parsing datetime only in xlsx" + reason="Calamine support parsing datetime only in xlsx/ods" ) ) if engine == "calamine": @@ -1073,12 +1064,16 @@ def test_read_excel_multiindex(self, request, engine, read_ext): reason="Sheets containing datetimes not supported by pyxlsb" ) ) - if engine == "calamine" and read_ext in {".xls", ".xlsb", ".ods"}: + if engine == "calamine" and read_ext in {".xls", ".xlsb"}: request.node.add_marker( pytest.mark.xfail( - reason="Calamine support parsing datetime only in xlsx" + reason="Calamine support parsing datetime only in xlsx/ods" ) ) + if engine == "calamine" and read_ext == ".ods": + request.node.add_marker( + pytest.mark.xfail(reason="Last test fails in calamine") + ) mi = MultiIndex.from_product([["foo", "bar"], ["a", "b"]]) mi_file = "testmultiindex" + read_ext @@ -1175,10 +1170,10 @@ def test_read_excel_multiindex_blank_after_name( f"not supported by {engine} (GH4679)" ) ) - if engine == "calamine" and read_ext in {".xls", ".xlsb", ".ods"}: + if engine == "calamine" and read_ext in {".xls", ".xlsb"}: request.node.add_marker( pytest.mark.xfail( - reason="Calamine 
support parsing datetime only in xlsx" + reason="Calamine support parsing datetime only in xlsx/ods" ) ) @@ -1310,10 +1305,10 @@ def test_read_excel_skiprows(self, request, engine, read_ext): reason="Sheets containing datetimes not supported by pyxlsb" ) ) - if engine == "calamine" and read_ext in {".xls", ".xlsb", ".ods"}: + if engine == "calamine" and read_ext in {".xls", ".xlsb"}: request.node.add_marker( pytest.mark.xfail( - reason="Calamine support parsing datetime only in xlsx" + reason="Calamine support parsing datetime only in xlsx/ods" ) ) @@ -1371,10 +1366,10 @@ def test_read_excel_skiprows_callable_not_in(self, request, engine, read_ext): reason="Sheets containing datetimes not supported by pyxlsb" ) ) - if engine == "calamine" and read_ext in {".xls", ".xlsb", ".ods"}: + if engine == "calamine" and read_ext in {".xls", ".xlsb"}: request.node.add_marker( pytest.mark.xfail( - reason="Calamine support parsing datetime only in xlsx" + reason="Calamine support parsing datetime only in xlsx/ods" ) ) @@ -1651,10 +1646,10 @@ def test_excel_table_sheet_by_index(self, request, engine, read_ext, df_ref): reason="Sheets containing datetimes not supported by pyxlsb" ) ) - if engine == "calamine" and read_ext in {".xls", ".xlsb", ".ods"}: + if engine == "calamine" and read_ext in {".xls", ".xlsb"}: request.node.add_marker( pytest.mark.xfail( - reason="Calamine support parsing datetime only in xlsx" + reason="Calamine support parsing datetime only in xlsx/ods" ) ) @@ -1686,10 +1681,10 @@ def test_sheet_name(self, request, engine, read_ext, df_ref): reason="Sheets containing datetimes not supported by pyxlsb" ) ) - if engine == "calamine" and read_ext in {".xls", ".xlsb", ".ods"}: + if engine == "calamine" and read_ext in {".xls", ".xlsb"}: request.node.add_marker( pytest.mark.xfail( - reason="Calamine support parsing datetime only in xlsx" + reason="Calamine support parsing datetime only in xlsx/ods" ) ) @@ -1784,10 +1779,10 @@ def test_read_datetime_multiindex(self, 
request, engine, read_ext): reason="Sheets containing datetimes not supported by pyxlsb" ) ) - if engine == "calamine" and read_ext in {".xls", ".xlsb", ".ods"}: + if engine == "calamine" and read_ext in {".xls", ".xlsb"}: request.node.add_marker( pytest.mark.xfail( - reason="Calamine support parsing datetime only in xlsx" + reason="Calamine support parsing datetime only in xlsx/ods" ) ) diff --git a/pyproject.toml b/pyproject.toml index 7363ddef14dd2..437793f51d156 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -62,7 +62,7 @@ computation = ['scipy>=1.7.1', 'xarray>=0.21.0'] fss = ['fsspec>=2021.07.0'] aws = ['s3fs>=2021.08.0'] gcp = ['gcsfs>=2021.07.0', 'pandas-gbq>=0.15.0'] -excel = ['odfpy>=1.4.1', 'openpyxl>=3.0.7', 'python-calamine>=0.1.0', 'pyxlsb>=1.0.8', 'xlrd>=2.0.1', 'xlsxwriter>=1.4.3'] +excel = ['odfpy>=1.4.1', 'openpyxl>=3.0.7', 'python-calamine>=0.1.1', 'pyxlsb>=1.0.8', 'xlrd>=2.0.1', 'xlsxwriter>=1.4.3'] parquet = ['pyarrow>=7.0.0'] feather = ['pyarrow>=7.0.0'] hdf5 = [# blosc only available on conda (https://github.com/Blosc/python-blosc/issues/297) @@ -104,7 +104,7 @@ all = ['beautifulsoup4>=4.9.3', 'pytest>=7.0.0', 'pytest-xdist>=2.2.0', 'pytest-asyncio>=0.17.0', - 'python-calamine>=0.1.0', + 'python-calamine>=0.1.1', 'python-snappy>=0.6.0', 'pyxlsb>=1.0.8', 'qtpy>=2.2.0', diff --git a/scripts/tests/data/deps_expected_random.yaml b/scripts/tests/data/deps_expected_random.yaml index f4140e40a5f1e..a6e78cfb51170 100644 --- a/scripts/tests/data/deps_expected_random.yaml +++ b/scripts/tests/data/deps_expected_random.yaml @@ -57,4 +57,4 @@ dependencies: - zstandard>=0.15.2 - pip: - - python-calamine>=0.1.0 + - python-calamine>=0.1.1 diff --git a/scripts/tests/data/deps_minimum.toml b/scripts/tests/data/deps_minimum.toml index 04f31a84a5441..c150a4cedeed0 100644 --- a/scripts/tests/data/deps_minimum.toml +++ b/scripts/tests/data/deps_minimum.toml @@ -62,7 +62,7 @@ computation = ['scipy>=1.7.1', 'xarray>=0.21.0'] fss = ['fsspec>=2021.07.0'] aws = 
['s3fs>=2021.08.0'] gcp = ['gcsfs>=2021.07.0', 'pandas-gbq>=0.15.0'] -excel = ['odfpy>=1.4.1', 'openpyxl>=3.0.7', 'python-calamine>=0.1.0', 'pyxlsb>=1.0.8', 'xlrd>=2.0.1', 'xlsxwriter>=1.4.3'] +excel = ['odfpy>=1.4.1', 'openpyxl>=3.0.7', 'python-calamine>=0.1.1', 'pyxlsb>=1.0.8', 'xlrd>=2.0.1', 'xlsxwriter>=1.4.3'] parquet = ['pyarrow>=7.0.0'] feather = ['pyarrow>=7.0.0'] hdf5 = [# blosc only available on conda (https://github.com/Blosc/python-blosc/issues/297) @@ -104,7 +104,7 @@ all = ['beautifulsoup4>=5.9.3', 'pytest>=7.0.0', 'pytest-xdist>=2.2.0', 'pytest-asyncio>=0.17.0', - 'python-calamine>=0.1.0', + 'python-calamine>=0.1.1', 'python-snappy>=0.6.0', 'pyxlsb>=1.0.8', 'qtpy>=2.2.0', diff --git a/scripts/tests/data/deps_unmodified_random.yaml b/scripts/tests/data/deps_unmodified_random.yaml index 40a726ed49af6..01cac96ac7396 100644 --- a/scripts/tests/data/deps_unmodified_random.yaml +++ b/scripts/tests/data/deps_unmodified_random.yaml @@ -57,4 +57,4 @@ dependencies: - zstandard>=0.15.2 - pip: - - python-calamine>=0.1.0 + - python-calamine>=0.1.1