diff --git a/asv_bench/benchmarks/io/excel.py b/asv_bench/benchmarks/io/excel.py index 80af2cff41769..1eaccb9f2d897 100644 --- a/asv_bench/benchmarks/io/excel.py +++ b/asv_bench/benchmarks/io/excel.py @@ -11,7 +11,7 @@ def _generate_dataframe(): - N = 2000 + N = 20000 C = 5 df = DataFrame( np.random.randn(N, C), @@ -69,5 +69,9 @@ def time_read_excel(self, engine): fname = self.fname_odf if engine == "odf" else self.fname_excel read_excel(fname, engine=engine) + def time_read_excel_nrows(self, engine): + fname = self.fname_odf if engine == "odf" else self.fname_excel + read_excel(fname, engine=engine, nrows=1) + from ..pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index dbc88d0b371e8..e28ecc16fcb7b 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -207,6 +207,7 @@ Performance improvements - Performance improvements when creating Series with dtype `str` or :class:`StringDtype` from array with many string elements (:issue:`36304`, :issue:`36317`) - Performance improvement in :meth:`GroupBy.agg` with the ``numba`` engine (:issue:`35759`) +- Performance improvement in `read_excel` for when ``nrows`` is much smaller than the length of the file (:issue:`33281`). - Performance improvements when creating :meth:`pd.Series.map` from a huge dictionary (:issue:`34717`) - Performance improvement in :meth:`GroupBy.transform` with the ``numba`` engine (:issue:`36240`) diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 65e95fd321772..e80072fad8896 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -3,12 +3,12 @@ from io import BufferedIOBase, BytesIO, RawIOBase import os from textwrap import fill -from typing import Any, Mapping, Union +from typing import Any, List, Mapping, Optional, Union from pandas._config import config from pandas._libs.parsers import STR_NA_VALUES -from pandas._typing import StorageOptions +from pandas._typing import Scalar, StorageOptions from pandas.errors import EmptyDataError from pandas.util._decorators import Appender, deprecate_nonkeyword_arguments @@ -394,7 +394,14 @@ def get_sheet_by_index(self, index): pass @abc.abstractmethod - def get_sheet_data(self, sheet, convert_float): + def get_sheet_data( + self, + sheet, + convert_float: bool, + header_nrows: int, + skiprows_nrows: int, + nrows: Optional[int], + ) -> List[List[Scalar]]: pass def parse( @@ -450,7 +457,22 @@ def parse( else: # assume an integer if not a string sheet = self.get_sheet_by_index(asheetname) - data = self.get_sheet_data(sheet, convert_float) + if isinstance(header, int): + header_nrows = header + elif header is None: + header_nrows = 0 + else: + header_nrows = max(header) + if isinstance(skiprows, int): + skiprows_nrows = skiprows + elif skiprows is None: + skiprows_nrows = 0 + else: + skiprows_nrows = len(skiprows) + + data = self.get_sheet_data( + sheet, convert_float, header_nrows, skiprows_nrows, nrows + ) usecols = maybe_convert_usecols(usecols) if not data: diff --git a/pandas/io/excel/_odfreader.py b/pandas/io/excel/_odfreader.py index ffb599cdfaaf8..6b3bf4f1375ad 100644 --- a/pandas/io/excel/_odfreader.py +++ b/pandas/io/excel/_odfreader.py @@ -1,4 +1,4 @@ -from typing import List, cast +from typing import List, Optional, cast import numpy as np @@ -71,7 +71,14 @@ def get_sheet_by_name(self, name: str): raise ValueError(f"sheet {name} not found") - def get_sheet_data(self, sheet, convert_float: bool) -> List[List[Scalar]]: + def get_sheet_data( + self, + sheet, + convert_float: bool, + header_nrows: int, + skiprows_nrows: int, + nrows: Optional[int], + ) -> List[List[Scalar]]: """ Parse an ODF Table into a list of lists """ @@ -87,6 +94,8 @@ def get_sheet_data(self, sheet, convert_float: bool) -> List[List[Scalar]]: table: List[List[Scalar]] = [] + if isinstance(nrows, int): + sheet_rows = sheet_rows[: header_nrows + skiprows_nrows + nrows + 1] for i, sheet_row in enumerate(sheet_rows): sheet_cells = [x for x in sheet_row.childNodes if x.qname in cell_names] empty_cells = 0 diff --git a/pandas/io/excel/_openpyxl.py b/pandas/io/excel/_openpyxl.py index a5cadf4d93389..bc7b168eeaaa2 100644 --- a/pandas/io/excel/_openpyxl.py +++ b/pandas/io/excel/_openpyxl.py @@ -508,7 +508,14 @@ def _convert_cell(self, cell, convert_float: bool) -> Scalar: return cell.value - def get_sheet_data(self, sheet, convert_float: bool) -> List[List[Scalar]]: + def get_sheet_data( + self, + sheet, + convert_float: bool, + header_nrows: int, + skiprows_nrows: int, + nrows: Optional[int], + ) -> List[List[Scalar]]: data: List[List[Scalar]] = [] for row in sheet.rows: data.append([self._convert_cell(cell, convert_float) for cell in row]) diff --git a/pandas/io/excel/_pyxlsb.py b/pandas/io/excel/_pyxlsb.py index ac94f4dd3df74..cf3dcebdff6eb 100644 --- a/pandas/io/excel/_pyxlsb.py +++ b/pandas/io/excel/_pyxlsb.py @@ -1,4 +1,4 @@ -from typing import List +from typing import List, Optional from pandas._typing import FilePathOrBuffer, Scalar, StorageOptions from pandas.compat._optional import import_optional_dependency @@ -68,7 +68,14 @@ def _convert_cell(self, cell, convert_float: bool) -> Scalar: return cell.v - def get_sheet_data(self, sheet, convert_float: bool) -> List[List[Scalar]]: + def get_sheet_data( + self, + sheet, + convert_float: bool, + header_nrows: int, + skiprows_nrows: int, + nrows: Optional[int], + ) -> List[List[Scalar]]: return [ [self._convert_cell(c, convert_float) for c in r] for r in sheet.rows(sparse=False) diff --git a/pandas/io/excel/_xlrd.py b/pandas/io/excel/_xlrd.py index dfd5dde0329ae..e5d0d66f9570a 100644 --- a/pandas/io/excel/_xlrd.py +++ b/pandas/io/excel/_xlrd.py @@ -1,8 +1,9 @@ from datetime import time +from typing import List, Optional import numpy as np -from pandas._typing import StorageOptions +from pandas._typing import Scalar, StorageOptions from pandas.compat._optional import import_optional_dependency from pandas.io.excel._base import BaseExcelReader @@ -49,7 +50,14 @@ def get_sheet_by_name(self, name): def get_sheet_by_index(self, index): return self.book.sheet_by_index(index) - def get_sheet_data(self, sheet, convert_float): + def get_sheet_data( + self, + sheet, + convert_float: bool, + header_nrows: int, + skiprows_nrows: int, + nrows: Optional[int], + ) -> List[List[Scalar]]: from xlrd import ( XL_CELL_BOOLEAN, XL_CELL_DATE, @@ -98,9 +106,14 @@ def _parse_cell(cell_contents, cell_typ): cell_contents = val return cell_contents - data = [] + data: List[List[Scalar]] = [] - for i in range(sheet.nrows): + sheet_nrows = sheet.nrows + + if isinstance(nrows, int): + sheet_nrows = min(header_nrows + skiprows_nrows + nrows + 1, sheet_nrows) + + for i in range(sheet_nrows): row = [ _parse_cell(value, typ) for value, typ in zip(sheet.row_values(i), sheet.row_types(i)) diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index 431a50477fccc..b312f67349658 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -1153,5 +1153,21 @@ def test_read_datetime_multiindex(self, engine, read_ext): ], ) expected = pd.DataFrame([], columns=expected_column_index) + tm.assert_frame_equal(expected, actual) + @pytest.mark.parametrize( + "header, skiprows", [(1, 2), (0, 3), (1, [0, 1]), ([2], 1)] + ) + @td.check_file_leaks + def test_header_skiprows_nrows(self, engine, read_ext, header, skiprows): + # GH 32727 + data = pd.read_excel("test1" + read_ext, engine=engine) + expected = ( + DataFrame(data.iloc[3:6]) + .reset_index(drop=True) + .rename(columns=data.iloc[2].rename(None)) + ) + actual = pd.read_excel( + "test1" + read_ext, engine=engine, header=header, skiprows=skiprows, nrows=3 + ) tm.assert_frame_equal(expected, actual)