pandas-dev · kostyafarber · Jan 5, 2023 · Jan 5, 2023 · Jan 5, 2023 · Jan 5, 2023
@@ -55,3 +55,6 @@ dependencies:
   - xlrd
   - xlsxwriter
   - zstandard
+
+  - pip:
+    - python-calamine
@@ -69,3 +69,6 @@ dependencies:
   - pandas-gbq
   - pyyaml
   - py
+
+  - pip:
+    - python-calamine
@@ -61,3 +61,4 @@ dependencies:
 
   - pip:
     - pyqt5==5.15.1
+    - python-calamine==0.0.7
@@ -54,3 +54,6 @@ dependencies:
   - xlrd
   - xlsxwriter
   - zstandard
+
+  - pip:
+    - python-calamine
@@ -55,3 +55,6 @@ dependencies:
   - xlrd
   - xlsxwriter
   - zstandard
+
+  - pip:
+    - python-calamine
@@ -55,3 +55,6 @@ dependencies:
   - xlrd
   - xlsxwriter
   - zstandard
+
+  - pip:
+    - python-calamine
diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst
@@ -185,9 +185,11 @@ Other enhancements
 - :meth:`Series.drop_duplicates` has gained ``ignore_index`` keyword to reset index (:issue:`48304`)
 - Improved error message in :func:`to_datetime` for non-ISO8601 formats, informing users about the position of the first error (:issue:`50361`)
 - Improved error message when trying to align :class:`DataFrame` objects (for example, in :func:`DataFrame.compare`) to clarify that "identically labelled" refers to both index and columns (:issue:`50083`)
+- Performance improvement in :func:`to_datetime` when format is given or can be inferred (:issue:`50465`)
+- Added ``calamine`` as an engine to :func:`read_excel` (:issue:`50395`)
 - Added :meth:`DatetimeIndex.as_unit` and :meth:`TimedeltaIndex.as_unit` to convert to different resolutions; supported resolutions are "s", "ms", "us", and "ns" (:issue:`50616`)
 - Added new argument ``dtype`` to :func:`read_sql` to be consistent with :func:`read_sql_query` (:issue:`50797`)
--
+
 
 .. ---------------------------------------------------------------------------
 .. _whatsnew_200.notable_bug_fixes:

diff --git a/environment.yml b/environment.yml
@@ -117,3 +117,4 @@ dependencies:
 
   - pip:
       - sphinx-toggleprompt
+      - python-calamine
diff --git a/pandas/compat/_optional.py b/pandas/compat/_optional.py
@@ -34,6 +34,7 @@
     "pyarrow": "6.0.0",
     "pyreadstat": "1.1.2",
     "pytest": "7.0.0",
+    "python-calamine": "0.0.7",
     "pyxlsb": "1.0.8",
     "s3fs": "2021.08.0",
     "scipy": "1.7.1",
@@ -61,6 +62,7 @@
     "lxml.etree": "lxml",
     "odf": "odfpy",
     "pandas_gbq": "pandas-gbq",
+    "python_calamine": "python-calamine",
     "snappy": "python-snappy",
     "sqlalchemy": "SQLAlchemy",
     "tables": "pytables",

@@ -1450,6 +1450,7 @@ class ExcelFile:
             This is not supported, switch to using ``openpyxl`` instead.
     """
 
+    from pandas.io.excel._calaminereader import CalamineExcelReader
     from pandas.io.excel._odfreader import ODFReader
     from pandas.io.excel._openpyxl import OpenpyxlReader
     from pandas.io.excel._pyxlsb import PyxlsbReader
@@ -1460,6 +1461,7 @@ class ExcelFile:
         "openpyxl": OpenpyxlReader,
         "odf": ODFReader,
         "pyxlsb": PyxlsbReader,
+        "calamine": CalamineExcelReader,
     }
 
     def __init__(

diff --git a/pandas/io/excel/_calaminereader.py b/pandas/io/excel/_calaminereader.py
@@ -0,0 +1,110 @@
+from __future__ import annotations
+
+from datetime import (
+    date,
+    datetime,
+    time,
+)
+from tempfile import NamedTemporaryFile
+from typing import Union
+
+from pandas._typing import (
+    FilePath,
+    ReadBuffer,
+    Scalar,
+    StorageOptions,
+)
+from pandas.compat._optional import import_optional_dependency
+
+import pandas as pd
+
+from pandas.io.common import stringify_path
+from pandas.io.excel._base import (
+    BaseExcelReader,
+    inspect_excel_format,
+)
+
+# Cell value types accepted by the ``_convert_cell`` helper inside
+# ``CalamineExcelReader.get_sheet_data`` (presumably every type python_calamine
+# can return for a cell -- TODO confirm against the python_calamine docs).
+ValueT = Union[int, float, str, bool, time, date, datetime]
+
+
+class __calamine__:
+    pass
+
+
+class CalamineExcelReader(BaseExcelReader):
+    book: str
+    _sheet_names: list[str] | None = None
+
+    def __init__(
+        self,
+        filepath_or_buffer: FilePath | ReadBuffer[bytes],
+        storage_options: StorageOptions = None,
+    ) -> None:
+        import_optional_dependency("python_calamine")
+        super().__init__(filepath_or_buffer, storage_options=storage_options)
+
+    @property
+    def _workbook_class(self) -> type[__calamine__]:
+        return __calamine__
+
+    def load_workbook(self, filepath_or_buffer) -> str:
+        if hasattr(filepath_or_buffer, "read") and hasattr(filepath_or_buffer, "seek"):
+            ext = inspect_excel_format(filepath_or_buffer)
+            with NamedTemporaryFile(suffix=f".{ext}", delete=False) as tmp_file:
+                filepath_or_buffer.seek(0)
+                tmp_file.write(filepath_or_buffer.read())
+                filepath_or_buffer = tmp_file.name
+        else:
+            filepath_or_buffer = stringify_path(filepath_or_buffer)
+
+        assert isinstance(filepath_or_buffer, str)
+
+        from python_calamine import get_sheet_names
+
+        self._sheet_names = get_sheet_names(filepath_or_buffer)
+        return filepath_or_buffer
+
+    @property
+    def sheet_names(self) -> list[str]:
+        from python_calamine import get_sheet_names
+
+        if self._sheet_names is None:
+            self._sheet_names = get_sheet_names(self.book)
+        return self._sheet_names
+
+    def get_sheet_by_name(self, name: str) -> int:
+        self.raise_if_bad_sheet_by_name(name)
+        return self.sheet_names.index(name)
+
+    def get_sheet_by_index(self, index: int) -> int:
+        self.raise_if_bad_sheet_by_index(index)
+        return index
+
+    def get_sheet_data(
+        self, sheet: int, file_rows_needed: int | None = None
+    ) -> list[list[Scalar]]:
+        def _convert_cell(value: ValueT) -> Scalar:
+            if isinstance(value, float):
+                val = int(value)
+                if val == value:
+                    return val
+                else:
+                    return value
+            elif isinstance(value, date):
+                return pd.Timestamp(value)
+            elif isinstance(value, time):
+                return value.isoformat()
+
+            return value
+
+        from python_calamine import get_sheet_data
+
+        rows = get_sheet_data(self.book, sheet, skip_empty_area=False)
+        data: list[list[Scalar]] = []
+
+        for row in rows:
+            data.append([_convert_cell(cell) for cell in row])
+            if file_rows_needed is not None and len(data) >= file_rows_needed:
+                break
+
+        return data
-Original file line number
+Diff line change
@@ Expand Up / @@ -69,3 +69,6 @@ dependencies: @@
       - pandas-gbq
       - pyyaml
       - py
+      - pip:
+        - python-calamine
Original file line number	Diff line number	Diff line change
Expand Up		@@ -61,3 +61,4 @@ dependencies:

		- pip:
		- pyqt5==5.15.1
		- python-calamine==0.0.7
Original file line number	Diff line number	Diff line change
Expand Up		@@ -117,3 +117,4 @@ dependencies:

		- pip:
		- sphinx-toggleprompt
		- python-calamine