diff --git a/ci/deps/actions-310.yaml b/ci/deps/actions-310.yaml index 47405b72476fd..f7f36f7fa657f 100644 --- a/ci/deps/actions-310.yaml +++ b/ci/deps/actions-310.yaml @@ -55,4 +55,5 @@ dependencies: - zstandard>=0.15.2 - pip: + - python-calamine - tzdata>=2022.1 diff --git a/ci/deps/actions-311.yaml b/ci/deps/actions-311.yaml index 9ebfb710e0abb..9280aab30ff12 100644 --- a/ci/deps/actions-311.yaml +++ b/ci/deps/actions-311.yaml @@ -55,4 +55,5 @@ dependencies: - zstandard>=0.15.2 - pip: + - python-calamine>=0.1.1 - tzdata>=2022.1 diff --git a/ci/deps/actions-38-downstream_compat.yaml b/ci/deps/actions-38-downstream_compat.yaml index 3ed2786b76896..4c4312c176046 100644 --- a/ci/deps/actions-38-downstream_compat.yaml +++ b/ci/deps/actions-38-downstream_compat.yaml @@ -70,4 +70,5 @@ dependencies: - py - pip: + - python-calamine - tzdata>=2022.1 diff --git a/ci/deps/actions-38-minimum_versions.yaml b/ci/deps/actions-38-minimum_versions.yaml index 2eb2ade8a2934..5f13906b3535f 100644 --- a/ci/deps/actions-38-minimum_versions.yaml +++ b/ci/deps/actions-38-minimum_versions.yaml @@ -59,4 +59,5 @@ dependencies: - pip: - pyqt5==5.15.1 + - python-calamine==0.1.1 - tzdata==2022.1 diff --git a/ci/deps/actions-38.yaml b/ci/deps/actions-38.yaml index 4060a837d1757..2092db9ed176b 100644 --- a/ci/deps/actions-38.yaml +++ b/ci/deps/actions-38.yaml @@ -55,4 +55,5 @@ dependencies: - zstandard>=0.15.2 - pip: + - python-calamine - tzdata>=2022.1 diff --git a/ci/deps/actions-39.yaml b/ci/deps/actions-39.yaml index 53cd9c5635493..0ca6f63e030a7 100644 --- a/ci/deps/actions-39.yaml +++ b/ci/deps/actions-39.yaml @@ -55,4 +55,5 @@ dependencies: - zstandard>=0.15.2 - pip: + - python-calamine - tzdata>=2022.1 diff --git a/ci/deps/circle-38-arm64.yaml b/ci/deps/circle-38-arm64.yaml index 2e4070fa82010..610607a2d4491 100644 --- a/ci/deps/circle-38-arm64.yaml +++ b/ci/deps/circle-38-arm64.yaml @@ -54,3 +54,6 @@ dependencies: - xlrd>=2.0.1 - xlsxwriter>=1.4.3 - zstandard>=0.15.2 + + - pip: + - 
python-calamine diff --git a/doc/source/getting_started/install.rst b/doc/source/getting_started/install.rst index f34676edd26dc..ccdee39c4eb95 100644 --- a/doc/source/getting_started/install.rst +++ b/doc/source/getting_started/install.rst @@ -345,6 +345,7 @@ xlrd 2.0.1 excel Reading Excel xlsxwriter 1.4.3 excel Writing Excel openpyxl 3.0.7 excel Reading / writing for xlsx files pyxlsb 1.0.8 excel Reading for xlsb files +python-calamine 0.1.1 excel Reading for xls/xlsx/xlsb/ods files ========================= ================== =============== ============================================================= HTML diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index c33d4ab92d4c6..985b13faa8cd7 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -3420,7 +3420,9 @@ Excel files The :func:`~pandas.read_excel` method can read Excel 2007+ (``.xlsx``) files using the ``openpyxl`` Python module. Excel 2003 (``.xls``) files can be read using ``xlrd``. Binary Excel (``.xlsb``) -files can be read using ``pyxlsb``. +files can be read using ``pyxlsb``. All of these formats can also be read using ``python-calamine``, +but this library has some limitations and behaves differently from the other libraries; +for example, it can't detect dates in some formats (xls and xlsb). The :meth:`~DataFrame.to_excel` instance method is used for saving a ``DataFrame`` to Excel. Generally the semantics are similar to working with :ref:`csv` data. diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index 9f5d6011a7780..d906dcbe87bf4 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -86,8 +86,9 @@ Other enhancements - Improved error message when creating a DataFrame with empty data (0 rows), no index and an incorrect number of columns.
(:issue:`52084`) - :meth:`DataFrame.applymap` now uses the :meth:`~api.extensions.ExtensionArray.map` method of underlying :class:`api.extensions.ExtensionArray` instances (:issue:`52219`) - :meth:`arrays.SparseArray.map` now supports ``na_action`` (:issue:`52096`). +- :meth:`Categorical.map` and :meth:`CategoricalIndex.map` now have a ``na_action`` parameter (:issue:`44279`) +- Added ``calamine`` as an engine to ``read_excel`` (:issue:`50395`) - Add dtype of categories to ``repr`` information of :class:`CategoricalDtype` (:issue:`52179`) -- .. --------------------------------------------------------------------------- .. _whatsnew_210.notable_bug_fixes: diff --git a/environment.yml b/environment.yml index e2edf967ed8b7..df70be38528a4 100644 --- a/environment.yml +++ b/environment.yml @@ -117,5 +117,6 @@ dependencies: - pip: - sphinx-toggleprompt + - python-calamine - typing_extensions; python_version<"3.11" - tzdata>=2022.1 diff --git a/pandas/compat/_optional.py b/pandas/compat/_optional.py index bcfd4ea790e64..6a848e6ccd22c 100644 --- a/pandas/compat/_optional.py +++ b/pandas/compat/_optional.py @@ -37,6 +37,7 @@ "pyarrow": "7.0.0", "pyreadstat": "1.1.2", "pytest": "7.0.0", + "python-calamine": "0.1.1", "pyxlsb": "1.0.8", "s3fs": "2021.08.0", "scipy": "1.7.1", @@ -64,6 +65,7 @@ "lxml.etree": "lxml", "odf": "odfpy", "pandas_gbq": "pandas-gbq", + "python_calamine": "python-calamine", "snappy": "python-snappy", "sqlalchemy": "SQLAlchemy", "tables": "pytables", diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py index d3bdcee7a7341..eb4e46b8d3206 100644 --- a/pandas/core/config_init.py +++ b/pandas/core/config_init.py @@ -503,11 +503,11 @@ def use_inf_as_na_cb(key) -> None: auto, {others}. 
""" -_xls_options = ["xlrd"] -_xlsm_options = ["xlrd", "openpyxl"] -_xlsx_options = ["xlrd", "openpyxl"] -_ods_options = ["odf"] -_xlsb_options = ["pyxlsb"] +_xls_options = ["xlrd", "calamine"] +_xlsm_options = ["xlrd", "openpyxl", "calamine"] +_xlsx_options = ["xlrd", "openpyxl", "calamine"] +_ods_options = ["odf", "calamine"] +_xlsb_options = ["pyxlsb", "calamine"] with cf.config_prefix("io.excel.xls"): diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 8c3bbb7798f68..fb291d53f1556 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -149,13 +149,15 @@ of dtype conversion. engine : str, default None If io is not a buffer or path, this must be set to identify io. - Supported engines: "xlrd", "openpyxl", "odf", "pyxlsb". + Supported engines: "xlrd", "openpyxl", "odf", "pyxlsb", "calamine". Engine compatibility : - "xlrd" supports old-style Excel files (.xls). - "openpyxl" supports newer Excel file formats. - "odf" supports OpenDocument file formats (.odf, .ods, .odt). - "pyxlsb" supports Binary Excel files. + - "calamine" supports Excel (.xls, .xlsx, .xlsm, .xlsb) + and OpenDocument (.ods) file formats. .. 
versionchanged:: 1.2.0 The engine `xlrd `_ @@ -375,7 +377,7 @@ def read_excel( | Callable[[str], bool] | None = ..., dtype: DtypeArg | None = ..., - engine: Literal["xlrd", "openpyxl", "odf", "pyxlsb"] | None = ..., + engine: Literal["xlrd", "openpyxl", "odf", "pyxlsb", "calamine"] | None = ..., converters: dict[str, Callable] | dict[int, Callable] | None = ..., true_values: Iterable[Hashable] | None = ..., false_values: Iterable[Hashable] | None = ..., @@ -414,7 +416,7 @@ def read_excel( | Callable[[str], bool] | None = ..., dtype: DtypeArg | None = ..., - engine: Literal["xlrd", "openpyxl", "odf", "pyxlsb"] | None = ..., + engine: Literal["xlrd", "openpyxl", "odf", "pyxlsb", "calamine"] | None = ..., converters: dict[str, Callable] | dict[int, Callable] | None = ..., true_values: Iterable[Hashable] | None = ..., false_values: Iterable[Hashable] | None = ..., @@ -453,7 +455,7 @@ def read_excel( | Callable[[str], bool] | None = None, dtype: DtypeArg | None = None, - engine: Literal["xlrd", "openpyxl", "odf", "pyxlsb"] | None = None, + engine: Literal["xlrd", "openpyxl", "odf", "pyxlsb", "calamine"] | None = None, converters: dict[str, Callable] | dict[int, Callable] | None = None, true_values: Iterable[Hashable] | None = None, false_values: Iterable[Hashable] | None = None, @@ -1418,13 +1420,15 @@ class ExcelFile: .xls, .xlsx, .xlsb, .xlsm, .odf, .ods, or .odt file. engine : str, default None If io is not a buffer or path, this must be set to identify io. - Supported engines: ``xlrd``, ``openpyxl``, ``odf``, ``pyxlsb`` + Supported engines: ``xlrd``, ``openpyxl``, ``odf``, ``pyxlsb``, ``calamine`` Engine compatibility : - ``xlrd`` supports old-style Excel files (.xls). - ``openpyxl`` supports newer Excel file formats. - ``odf`` supports OpenDocument file formats (.odf, .ods, .odt). - ``pyxlsb`` supports Binary Excel files. + - ``calamine`` supports Excel (.xls, .xlsx, .xlsm, .xlsb) + and OpenDocument (.ods) file formats. .. 
versionchanged:: 1.2.0 @@ -1452,6 +1456,7 @@ class ExcelFile: This is not supported, switch to using ``openpyxl`` instead. """ + from pandas.io.excel._calamine import CalamineReader from pandas.io.excel._odfreader import ODFReader from pandas.io.excel._openpyxl import OpenpyxlReader from pandas.io.excel._pyxlsb import PyxlsbReader @@ -1462,6 +1467,7 @@ class ExcelFile: "openpyxl": OpenpyxlReader, "odf": ODFReader, "pyxlsb": PyxlsbReader, + "calamine": CalamineReader, } def __init__( diff --git a/pandas/io/excel/_calamine.py b/pandas/io/excel/_calamine.py new file mode 100644 index 0000000000000..c71c0c62ff682 --- /dev/null +++ b/pandas/io/excel/_calamine.py @@ -0,0 +1,99 @@ +from __future__ import annotations + +from datetime import ( + date, + datetime, + time, +) +from typing import ( + TYPE_CHECKING, + Union, +) + +from pandas.compat._optional import import_optional_dependency +from pandas.util._decorators import doc + +import pandas as pd +from pandas.core.shared_docs import _shared_docs + +from pandas.io.excel._base import BaseExcelReader + +if TYPE_CHECKING: + from pandas._typing import ( + FilePath, + ReadBuffer, + Scalar, + StorageOptions, + ) + +_CellValueT = Union[int, float, str, bool, time, date, datetime] + + +class CalamineReader(BaseExcelReader): + @doc(storage_options=_shared_docs["storage_options"]) + def __init__( + self, + filepath_or_buffer: FilePath | ReadBuffer[bytes], + storage_options: StorageOptions = None, + ) -> None: + """ + Reader using calamine engine (xlsx/xls/xlsb/ods). + + Parameters + ---------- + filepath_or_buffer : str, path to be parsed or + an open readable stream. 
+ {storage_options} + """ + import_optional_dependency("python_calamine") + super().__init__(filepath_or_buffer, storage_options=storage_options) + + @property + def _workbook_class(self): + from python_calamine import CalamineWorkbook + + return CalamineWorkbook + + def load_workbook(self, filepath_or_buffer: FilePath | ReadBuffer[bytes]): + from python_calamine import load_workbook + + return load_workbook(filepath_or_buffer) # type: ignore[arg-type] + + @property + def sheet_names(self) -> list[str]: + return self.book.sheet_names # pyright: ignore + + def get_sheet_by_name(self, name: str): + self.raise_if_bad_sheet_by_name(name) + return self.book.get_sheet_by_name(name) # pyright: ignore + + def get_sheet_by_index(self, index: int): + self.raise_if_bad_sheet_by_index(index) + return self.book.get_sheet_by_index(index) # pyright: ignore + + def get_sheet_data( + self, sheet, file_rows_needed: int | None = None + ) -> list[list[Scalar]]: + def _convert_cell(value: _CellValueT) -> Scalar: + if isinstance(value, float): + val = int(value) + if val == value: + return val + else: + return value + elif isinstance(value, date): + return pd.Timestamp(value) + elif isinstance(value, time): + return value.isoformat() + + return value + + rows: list[list[_CellValueT]] = sheet.to_python(skip_empty_area=False) + data: list[list[Scalar]] = [] + + for row in rows: + data.append([_convert_cell(cell) for cell in row]) + if file_rows_needed is not None and len(data) >= file_rows_needed: + break + + return data diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index c22051912d293..a5507040b107a 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -52,6 +52,7 @@ ), pytest.param("pyxlsb", marks=td.skip_if_no("pyxlsb")), pytest.param("odf", marks=td.skip_if_no("odf")), + pytest.param("calamine", marks=td.skip_if_no("python_calamine")), ] @@ -65,11 +66,11 @@ def _is_valid_engine_ext_pair(engine, 
read_ext: str) -> bool: return False if engine == "odf" and read_ext != ".ods": return False - if read_ext == ".ods" and engine != "odf": + if read_ext == ".ods" and engine not in {"odf", "calamine"}: return False if engine == "pyxlsb" and read_ext != ".xlsb": return False - if read_ext == ".xlsb" and engine != "pyxlsb": + if read_ext == ".xlsb" and engine not in {"pyxlsb", "calamine"}: return False if engine == "xlrd" and read_ext != ".xls": return False @@ -166,13 +167,19 @@ def test_usecols_int(self, read_ext): usecols=3, ) - def test_usecols_list(self, request, read_ext, df_ref): - if read_ext == ".xlsb": + def test_usecols_list(self, request, engine, read_ext, df_ref): + if engine == "pyxlsb": request.node.add_marker( pytest.mark.xfail( reason="Sheets containing datetimes not supported by pyxlsb" ) ) + if engine == "calamine" and read_ext in {".xls", ".xlsb"}: + request.node.add_marker( + pytest.mark.xfail( + reason="Calamine support parsing datetime only in xlsx/ods" + ) + ) df_ref = df_ref.reindex(columns=["B", "C"]) df1 = pd.read_excel( @@ -190,13 +197,19 @@ def test_usecols_list(self, request, read_ext, df_ref): tm.assert_frame_equal(df1, df_ref, check_names=False) tm.assert_frame_equal(df2, df_ref, check_names=False) - def test_usecols_str(self, request, read_ext, df_ref): - if read_ext == ".xlsb": + def test_usecols_str(self, request, engine, read_ext, df_ref): + if engine == "pyxlsb": request.node.add_marker( pytest.mark.xfail( reason="Sheets containing datetimes not supported by pyxlsb" ) ) + if engine == "calamine" and read_ext in {".xls", ".xlsb"}: + request.node.add_marker( + pytest.mark.xfail( + reason="Calamine support parsing datetime only in xlsx/ods" + ) + ) df1 = df_ref.reindex(columns=["A", "B", "C"]) df2 = pd.read_excel( @@ -247,14 +260,20 @@ def test_usecols_str(self, request, read_ext, df_ref): "usecols", [[0, 1, 3], [0, 3, 1], [1, 0, 3], [1, 3, 0], [3, 0, 1], [3, 1, 0]] ) def test_usecols_diff_positional_int_columns_order( - self, 
request, read_ext, usecols, df_ref + self, request, engine, read_ext, usecols, df_ref ): - if read_ext == ".xlsb": + if engine == "pyxlsb": request.node.add_marker( pytest.mark.xfail( reason="Sheets containing datetimes not supported by pyxlsb" ) ) + if engine == "calamine" and read_ext in {".xls", ".xlsb"}: + request.node.add_marker( + pytest.mark.xfail( + reason="Calamine support parsing datetime only in xlsx/ods" + ) + ) expected = df_ref[["A", "C"]] result = pd.read_excel( @@ -270,25 +289,37 @@ def test_usecols_diff_positional_str_columns_order(self, read_ext, usecols, df_r result = pd.read_excel("test1" + read_ext, sheet_name="Sheet1", usecols=usecols) tm.assert_frame_equal(result, expected, check_names=False) - def test_read_excel_without_slicing(self, request, read_ext, df_ref): - if read_ext == ".xlsb": + def test_read_excel_without_slicing(self, request, engine, read_ext, df_ref): + if engine == "pyxlsb": request.node.add_marker( pytest.mark.xfail( reason="Sheets containing datetimes not supported by pyxlsb" ) ) + if engine == "calamine" and read_ext in {".xls", ".xlsb"}: + request.node.add_marker( + pytest.mark.xfail( + reason="Calamine support parsing datetime only in xlsx/ods" + ) + ) expected = df_ref result = pd.read_excel("test1" + read_ext, sheet_name="Sheet1", index_col=0) tm.assert_frame_equal(result, expected, check_names=False) - def test_usecols_excel_range_str(self, request, read_ext, df_ref): - if read_ext == ".xlsb": + def test_usecols_excel_range_str(self, request, engine, read_ext, df_ref): + if engine == "pyxlsb": request.node.add_marker( pytest.mark.xfail( reason="Sheets containing datetimes not supported by pyxlsb" ) ) + if engine == "calamine" and read_ext in {".xls", ".xlsb"}: + request.node.add_marker( + pytest.mark.xfail( + reason="Calamine support parsing datetime only in xlsx/ods" + ) + ) expected = df_ref[["C", "D"]] result = pd.read_excel( @@ -362,25 +393,35 @@ def test_excel_stop_iterator(self, read_ext): expected = 
DataFrame([["aaaa", "bbbbb"]], columns=["Test", "Test1"]) tm.assert_frame_equal(parsed, expected) - def test_excel_cell_error_na(self, request, read_ext): - if read_ext == ".xlsb": + def test_excel_cell_error_na(self, request, engine, read_ext): + if engine == "pyxlsb": request.node.add_marker( pytest.mark.xfail( reason="Sheets containing datetimes not supported by pyxlsb" ) ) + if engine == "calamine" and read_ext == ".ods": + request.node.add_marker( + pytest.mark.xfail(reason="Calamine returns 0 instead of NaN in ods") + ) parsed = pd.read_excel("test3" + read_ext, sheet_name="Sheet1") expected = DataFrame([[np.nan]], columns=["Test"]) tm.assert_frame_equal(parsed, expected) - def test_excel_table(self, request, read_ext, df_ref): - if read_ext == ".xlsb": + def test_excel_table(self, request, engine, read_ext, df_ref): + if engine == "pyxlsb": request.node.add_marker( pytest.mark.xfail( reason="Sheets containing datetimes not supported by pyxlsb" ) ) + if engine == "calamine" and read_ext in {".xls", ".xlsb"}: + request.node.add_marker( + pytest.mark.xfail( + reason="Calamine support parsing datetime only in xlsx/ods" + ) + ) df1 = pd.read_excel("test1" + read_ext, sheet_name="Sheet1", index_col=0) df2 = pd.read_excel( @@ -395,13 +436,23 @@ def test_excel_table(self, request, read_ext, df_ref): ) tm.assert_frame_equal(df3, df1.iloc[:-1]) - def test_reader_special_dtypes(self, request, read_ext): - if read_ext == ".xlsb": + def test_reader_special_dtypes(self, request, engine, read_ext): + if engine == "pyxlsb": request.node.add_marker( pytest.mark.xfail( reason="Sheets containing datetimes not supported by pyxlsb" ) ) + if engine == "calamine" and read_ext in {".xls", ".xlsb"}: + request.node.add_marker( + pytest.mark.xfail( + reason="Calamine support parsing datetime only in xlsx/ods" + ) + ) + if engine == "calamine" and read_ext != ".ods": + request.node.add_marker( + pytest.mark.xfail(reason="Maybe not supported by calamine") + ) expected = 
DataFrame.from_dict( { @@ -535,11 +586,16 @@ def test_reader_dtype_str(self, read_ext, dtype, expected): actual = pd.read_excel(basename + read_ext, dtype=dtype) tm.assert_frame_equal(actual, expected) - def test_dtype_backend(self, read_ext, dtype_backend): + def test_dtype_backend(self, request, engine, read_ext, dtype_backend): # GH#36712 if read_ext in (".xlsb", ".xls"): pytest.skip(f"No engine for filetype: '{read_ext}'") + if engine == "calamine" and read_ext == ".ods": + request.node.add_marker( + pytest.mark.xfail(reason="Calamine doesn't support invalid ods") + ) + df = DataFrame( { "a": Series([1, 3], dtype="Int64"), @@ -580,11 +636,16 @@ def test_dtype_backend(self, read_ext, dtype_backend): expected = df tm.assert_frame_equal(result, expected) - def test_dtype_backend_and_dtype(self, read_ext): + def test_dtype_backend_and_dtype(self, request, engine, read_ext): # GH#36712 if read_ext in (".xlsb", ".xls"): pytest.skip(f"No engine for filetype: '{read_ext}'") + if engine == "calamine" and read_ext == ".ods": + request.node.add_marker( + pytest.mark.xfail(reason="Calamine doesn't support invalid ods") + ) + df = DataFrame({"a": [np.nan, 1.0], "b": [2.5, np.nan]}) with tm.ensure_clean(read_ext) as file_path: df.to_excel(file_path, "test", index=False) @@ -597,11 +658,16 @@ def test_dtype_backend_and_dtype(self, read_ext): tm.assert_frame_equal(result, df) @td.skip_if_no("pyarrow") - def test_dtype_backend_string(self, read_ext, string_storage): + def test_dtype_backend_string(self, request, engine, read_ext, string_storage): # GH#36712 if read_ext in (".xlsb", ".xls"): pytest.skip(f"No engine for filetype: '{read_ext}'") + if engine == "calamine" and read_ext == ".ods": + request.node.add_marker( + pytest.mark.xfail(reason="Calamine doesn't support invalid ods") + ) + import pyarrow as pa with pd.option_context("mode.string_storage", string_storage): @@ -645,7 +711,7 @@ def test_dtype_mangle_dup_cols(self, read_ext, dtypes, exp_value): assert dtype_dict == 
dtype_dict_copy, "dtype dict changed" tm.assert_frame_equal(result, expected) - def test_reader_spaces(self, read_ext): + def test_reader_spaces(self, request, engine, read_ext): # see gh-32207 basename = "test_spaces" @@ -751,6 +817,11 @@ def test_date_conversion_overflow(self, request, engine, read_ext): columns=["DateColWithBigInt", "StringCol"], ) + if engine == "calamine" and read_ext != ".ods": + request.node.add_marker( + pytest.mark.xfail(reason="Maybe not supported by calamine") + ) + if engine == "openpyxl": request.node.add_marker( pytest.mark.xfail(reason="Maybe not supported by openpyxl") @@ -765,13 +836,20 @@ def test_date_conversion_overflow(self, request, engine, read_ext): result = pd.read_excel("testdateoverflow" + read_ext) tm.assert_frame_equal(result, expected) - def test_sheet_name(self, request, read_ext, df_ref): - if read_ext == ".xlsb": + def test_sheet_name(self, request, read_ext, engine, df_ref): + if engine == "pyxlsb": request.node.add_marker( pytest.mark.xfail( reason="Sheets containing datetimes not supported by pyxlsb" ) ) + if engine == "calamine" and read_ext in {".xls", ".xlsb"}: + request.node.add_marker( + pytest.mark.xfail( + reason="Calamine support parsing datetime only in xlsx/ods" + ) + ) + filename = "test1" sheet_name = "Sheet1" @@ -828,6 +906,11 @@ def test_corrupt_bytes_raises(self, engine): "Unsupported format, or corrupt file: Expected BOF " "record; found b'foo'" ) + elif engine == "calamine": + from python_calamine import CalamineError + + error = CalamineError + msg = "Cannot detect file format" else: error = BadZipFile msg = "File is not a zip file" @@ -935,6 +1018,18 @@ def test_reader_seconds(self, request, engine, read_ext): reason="Sheets containing datetimes not supported by pyxlsb" ) ) + if engine == "calamine" and read_ext in {".xls", ".xlsb"}: + request.node.add_marker( + pytest.mark.xfail( + reason="Calamine support parsing datetime only in xlsx/ods" + ) + ) + if engine == "calamine": + 
request.node.add_marker( + pytest.mark.xfail( + reason="Calamine doesn't support parsing milliseconds in datetime" + ) + ) # Test reading times with and without milliseconds. GH5945. expected = DataFrame.from_dict( @@ -961,14 +1056,24 @@ def test_reader_seconds(self, request, engine, read_ext): actual = pd.read_excel("times_1904" + read_ext, sheet_name="Sheet1") tm.assert_frame_equal(actual, expected) - def test_read_excel_multiindex(self, request, read_ext): + def test_read_excel_multiindex(self, request, engine, read_ext): # see gh-4679 - if read_ext == ".xlsb": + if engine == "pyxlsb": request.node.add_marker( pytest.mark.xfail( reason="Sheets containing datetimes not supported by pyxlsb" ) ) + if engine == "calamine" and read_ext in {".xls", ".xlsb"}: + request.node.add_marker( + pytest.mark.xfail( + reason="Calamine support parsing datetime only in xlsx/ods" + ) + ) + if engine == "calamine" and read_ext == ".ods": + request.node.add_marker( + pytest.mark.xfail(reason="Last test fails in calamine") + ) mi = MultiIndex.from_product([["foo", "bar"], ["a", "b"]]) mi_file = "testmultiindex" + read_ext @@ -1055,13 +1160,20 @@ def test_read_excel_multiindex(self, request, read_ext): ], ) def test_read_excel_multiindex_blank_after_name( - self, request, read_ext, sheet_name, idx_lvl2 + self, request, engine, read_ext, sheet_name, idx_lvl2 ): # GH34673 - if read_ext == ".xlsb": + if engine == "pyxlsb": request.node.add_marker( pytest.mark.xfail( - reason="Sheets containing datetimes not supported by pyxlsb (GH4679" + reason="Sheets containing datetimes" + f"not supported by {engine} (GH4679)" + ) + ) + if engine == "calamine" and read_ext in {".xls", ".xlsb"}: + request.node.add_marker( + pytest.mark.xfail( + reason="Calamine support parsing datetime only in xlsx/ods" ) ) @@ -1088,10 +1200,17 @@ def test_read_excel_multiindex_blank_after_name( ) tm.assert_frame_equal(result, expected) - def test_read_excel_multiindex_header_only(self, read_ext): + def 
test_read_excel_multiindex_header_only(self, request, engine, read_ext): # see gh-11733. # # Don't try to parse a header name if there isn't one. + if engine == "calamine" and read_ext == ".ods": + request.node.add_marker( + pytest.mark.xfail( + reason="Calamine doesn't support 'number-rows-repeated' in ods" + ) + ) + mi_file = "testmultiindex" + read_ext result = pd.read_excel(mi_file, sheet_name="index_col_none", header=[0, 1]) @@ -1178,14 +1297,20 @@ def test_read_excel_bool_header_arg(self, read_ext): with pytest.raises(TypeError, match=msg): pd.read_excel("test1" + read_ext, header=arg) - def test_read_excel_skiprows(self, request, read_ext): + def test_read_excel_skiprows(self, request, engine, read_ext): # GH 4903 - if read_ext == ".xlsb": + if engine == "pyxlsb": request.node.add_marker( pytest.mark.xfail( reason="Sheets containing datetimes not supported by pyxlsb" ) ) + if engine == "calamine" and read_ext in {".xls", ".xlsb"}: + request.node.add_marker( + pytest.mark.xfail( + reason="Calamine support parsing datetime only in xlsx/ods" + ) + ) actual = pd.read_excel( "testskiprows" + read_ext, sheet_name="skiprows_list", skiprows=[0, 2] @@ -1233,14 +1358,20 @@ def test_read_excel_skiprows(self, request, read_ext): ) tm.assert_frame_equal(actual, expected) - def test_read_excel_skiprows_callable_not_in(self, request, read_ext): + def test_read_excel_skiprows_callable_not_in(self, request, engine, read_ext): # GH 4903 - if read_ext == ".xlsb": + if engine == "pyxlsb": request.node.add_marker( pytest.mark.xfail( reason="Sheets containing datetimes not supported by pyxlsb" ) ) + if engine == "calamine" and read_ext in {".xls", ".xlsb"}: + request.node.add_marker( + pytest.mark.xfail( + reason="Calamine support parsing datetime only in xlsx/ods" + ) + ) actual = pd.read_excel( "testskiprows" + read_ext, @@ -1320,8 +1451,15 @@ def test_deprecated_kwargs(self, read_ext): with pytest.raises(TypeError, match="but 3 positional arguments"): pd.read_excel("test1" + 
read_ext, "Sheet1", 0) - def test_no_header_with_list_index_col(self, read_ext): + def test_no_header_with_list_index_col(self, request, engine, read_ext): # GH 31783 + if engine == "calamine" and read_ext == ".ods": + request.node.add_marker( + pytest.mark.xfail( + reason="Calamine doesn't support 'number-rows-repeated' in ods" + ) + ) + file_name = "testmultiindex" + read_ext data = [("B", "B"), ("key", "val"), (3, 4), (3, 4)] idx = MultiIndex.from_tuples( @@ -1341,8 +1479,15 @@ def test_one_col_noskip_blank_line(self, read_ext): result = pd.read_excel(file_name) tm.assert_frame_equal(result, expected) - def test_multiheader_two_blank_lines(self, read_ext): + def test_multiheader_two_blank_lines(self, request, engine, read_ext): # GH 40442 + if engine == "calamine" and read_ext == ".ods": + request.node.add_marker( + pytest.mark.xfail( + reason="Calamine doesn't support 'number-rows-repeated' in ods" + ) + ) + file_name = "testmultiindex" + read_ext columns = MultiIndex.from_tuples([("a", "A"), ("b", "B")]) data = [[np.nan, np.nan], [np.nan, np.nan], [1, 3], [2, 4]] @@ -1363,12 +1508,12 @@ def test_trailing_blanks(self, read_ext): def test_ignore_chartsheets_by_str(self, request, engine, read_ext): # GH 41448 - if engine == "odf": + if read_ext == ".ods": pytest.skip("chartsheets do not exist in the ODF format") - if engine == "pyxlsb": + if engine in {"pyxlsb", "calamine"}: request.node.add_marker( pytest.mark.xfail( - reason="pyxlsb can't distinguish chartsheets from worksheets" + reason=f"{engine} can't distinguish chartsheets from worksheets" ) ) with pytest.raises(ValueError, match="Worksheet named 'Chart1' not found"): @@ -1376,12 +1521,12 @@ def test_ignore_chartsheets_by_str(self, request, engine, read_ext): def test_ignore_chartsheets_by_int(self, request, engine, read_ext): # GH 41448 - if engine == "odf": + if read_ext == ".ods": pytest.skip("chartsheets do not exist in the ODF format") - if engine == "pyxlsb": + if engine in {"pyxlsb", "calamine"}: 
request.node.add_marker( pytest.mark.xfail( - reason="pyxlsb can't distinguish chartsheets from worksheets" + reason=f"{engine} can't distinguish chartsheets from worksheets" ) ) with pytest.raises( @@ -1494,13 +1639,19 @@ def test_excel_passes_na_filter(self, read_ext, na_filter): expected = DataFrame(expected, columns=["Test"]) tm.assert_frame_equal(parsed, expected) - def test_excel_table_sheet_by_index(self, request, read_ext, df_ref): - if read_ext == ".xlsb": + def test_excel_table_sheet_by_index(self, request, engine, read_ext, df_ref): + if engine == "pyxlsb": request.node.add_marker( pytest.mark.xfail( reason="Sheets containing datetimes not supported by pyxlsb" ) ) + if engine == "calamine" and read_ext in {".xls", ".xlsb"}: + request.node.add_marker( + pytest.mark.xfail( + reason="Calamine support parsing datetime only in xlsx/ods" + ) + ) with pd.ExcelFile("test1" + read_ext) as excel: df1 = pd.read_excel(excel, sheet_name=0, index_col=0) @@ -1523,13 +1674,19 @@ def test_excel_table_sheet_by_index(self, request, read_ext, df_ref): tm.assert_frame_equal(df3, df1.iloc[:-1]) - def test_sheet_name(self, request, read_ext, df_ref): - if read_ext == ".xlsb": + def test_sheet_name(self, request, engine, read_ext, df_ref): + if engine == "pyxlsb": request.node.add_marker( pytest.mark.xfail( reason="Sheets containing datetimes not supported by pyxlsb" ) ) + if engine == "calamine" and read_ext in {".xls", ".xlsb"}: + request.node.add_marker( + pytest.mark.xfail( + reason="Calamine support parsing datetime only in xlsx/ods" + ) + ) filename = "test1" sheet_name = "Sheet1" @@ -1593,7 +1750,7 @@ def test_excel_read_binary(self, engine, read_ext): def test_excel_read_binary_via_read_excel(self, read_ext, engine): # GH 38424 with open("test1" + read_ext, "rb") as f: - result = pd.read_excel(f) + result = pd.read_excel(f, engine=engine) expected = pd.read_excel("test1" + read_ext, engine=engine) tm.assert_frame_equal(result, expected) @@ -1622,6 +1779,12 @@ def 
test_read_datetime_multiindex(self, request, engine, read_ext): reason="Sheets containing datetimes not supported by pyxlsb" ) ) + if engine == "calamine" and read_ext in {".xls", ".xlsb"}: + request.node.add_marker( + pytest.mark.xfail( + reason="Calamine support parsing datetime only in xlsx/ods" + ) + ) f = "test_datetime_mi" + read_ext with pd.ExcelFile(f) as excel: @@ -1645,12 +1808,12 @@ def test_engine_invalid_option(self, read_ext): def test_ignore_chartsheets(self, request, engine, read_ext): # GH 41448 - if engine == "odf": + if read_ext == ".ods": pytest.skip("chartsheets do not exist in the ODF format") - if engine == "pyxlsb": + if engine in {"pyxlsb", "calamine"}: request.node.add_marker( pytest.mark.xfail( - reason="pyxlsb can't distinguish chartsheets from worksheets" + reason=f"{engine} can't distinguish chartsheets from worksheets" ) ) with pd.ExcelFile("chartsheet" + read_ext) as excel: @@ -1665,6 +1828,10 @@ def test_corrupt_files_closed(self, engine, read_ext): import xlrd errors = (BadZipFile, xlrd.biffh.XLRDError) + elif engine == "calamine": + from python_calamine import CalamineError + + errors = (CalamineError,) with tm.ensure_clean(f"corrupt{read_ext}") as file: Path(file).write_text("corrupt") diff --git a/pyproject.toml b/pyproject.toml index ac6a4a7b2a61b..437793f51d156 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -62,7 +62,7 @@ computation = ['scipy>=1.7.1', 'xarray>=0.21.0'] fss = ['fsspec>=2021.07.0'] aws = ['s3fs>=2021.08.0'] gcp = ['gcsfs>=2021.07.0', 'pandas-gbq>=0.15.0'] -excel = ['odfpy>=1.4.1', 'openpyxl>=3.0.7', 'pyxlsb>=1.0.8', 'xlrd>=2.0.1', 'xlsxwriter>=1.4.3'] +excel = ['odfpy>=1.4.1', 'openpyxl>=3.0.7', 'python-calamine>=0.1.1', 'pyxlsb>=1.0.8', 'xlrd>=2.0.1', 'xlsxwriter>=1.4.3'] parquet = ['pyarrow>=7.0.0'] feather = ['pyarrow>=7.0.0'] hdf5 = [# blosc only available on conda (https://github.com/Blosc/python-blosc/issues/297) @@ -104,6 +104,7 @@ all = ['beautifulsoup4>=4.9.3', 'pytest>=7.0.0', 
'pytest-xdist>=2.2.0', 'pytest-asyncio>=0.17.0', + 'python-calamine>=0.1.1', 'python-snappy>=0.6.0', 'pyxlsb>=1.0.8', 'qtpy>=2.2.0', diff --git a/requirements-dev.txt b/requirements-dev.txt index 2dadaee4a71e0..c310777c413ff 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -86,6 +86,7 @@ pyyaml requests pygments sphinx-toggleprompt +python-calamine typing_extensions; python_version<"3.11" tzdata>=2022.1 setuptools>=61.0.0 diff --git a/scripts/tests/data/deps_expected_random.yaml b/scripts/tests/data/deps_expected_random.yaml index be5e467b57e10..a6e78cfb51170 100644 --- a/scripts/tests/data/deps_expected_random.yaml +++ b/scripts/tests/data/deps_expected_random.yaml @@ -55,3 +55,6 @@ dependencies: - xlrd>=2.0.1 - xlsxwriter>=1.4.3 - zstandard>=0.15.2 + + - pip: + - python-calamine>=0.1.1 diff --git a/scripts/tests/data/deps_minimum.toml b/scripts/tests/data/deps_minimum.toml index 97a5ce1180bfb..c150a4cedeed0 100644 --- a/scripts/tests/data/deps_minimum.toml +++ b/scripts/tests/data/deps_minimum.toml @@ -62,7 +62,7 @@ computation = ['scipy>=1.7.1', 'xarray>=0.21.0'] fss = ['fsspec>=2021.07.0'] aws = ['s3fs>=2021.08.0'] gcp = ['gcsfs>=2021.07.0', 'pandas-gbq>=0.15.0'] -excel = ['odfpy>=1.4.1', 'openpyxl>=3.0.7', 'pyxlsb>=1.0.8', 'xlrd>=2.0.1', 'xlsxwriter>=1.4.3'] +excel = ['odfpy>=1.4.1', 'openpyxl>=3.0.7', 'python-calamine>=0.1.1', 'pyxlsb>=1.0.8', 'xlrd>=2.0.1', 'xlsxwriter>=1.4.3'] parquet = ['pyarrow>=7.0.0'] feather = ['pyarrow>=7.0.0'] hdf5 = [# blosc only available on conda (https://github.com/Blosc/python-blosc/issues/297) @@ -104,6 +104,7 @@ all = ['beautifulsoup4>=5.9.3', 'pytest>=7.0.0', 'pytest-xdist>=2.2.0', 'pytest-asyncio>=0.17.0', + 'python-calamine>=0.1.1', 'python-snappy>=0.6.0', 'pyxlsb>=1.0.8', 'qtpy>=2.2.0', diff --git a/scripts/tests/data/deps_unmodified_random.yaml b/scripts/tests/data/deps_unmodified_random.yaml index 4ca758af1c8ad..01cac96ac7396 100644 --- a/scripts/tests/data/deps_unmodified_random.yaml +++ 
b/scripts/tests/data/deps_unmodified_random.yaml @@ -55,3 +55,6 @@ dependencies: - xlrd>=2.0.1 - xlsxwriter>=1.4.3 - zstandard>=0.15.2 + + - pip: + - python-calamine>=0.1.1