diff --git a/asv_bench/benchmarks/io/excel.py b/asv_bench/benchmarks/io/excel.py index 80af2cff41769..1eaccb9f2d897 100644 --- a/asv_bench/benchmarks/io/excel.py +++ b/asv_bench/benchmarks/io/excel.py @@ -11,7 +11,7 @@ def _generate_dataframe(): - N = 2000 + N = 20000 C = 5 df = DataFrame( np.random.randn(N, C), @@ -69,5 +69,9 @@ def time_read_excel(self, engine): fname = self.fname_odf if engine == "odf" else self.fname_excel read_excel(fname, engine=engine) + def time_read_excel_nrows(self, engine): + fname = self.fname_odf if engine == "odf" else self.fname_excel + read_excel(fname, engine=engine, nrows=1) + from ..pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 2243790a663df..e513ae17704b9 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -775,6 +775,7 @@ Performance improvements - Performance improvement in :class:`pandas.core.groupby.RollingGroupby` (:issue:`34052`) - Performance improvement in arithmetic operations (sub, add, mul, div) for MultiIndex (:issue:`34297`) - Performance improvement in `DataFrame[bool_indexer]` when `bool_indexer` is a list (:issue:`33924`) +- Performance improvement in `read_excel` for integer ``header``, ``skiprows``, and ``nrows`` (:issue:`33281`). .. 
--------------------------------------------------------------------------- diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 6c3b49b9afc68..2216ad7eb6364 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -3,10 +3,12 @@ from io import BytesIO import os from textwrap import fill +from typing import List, Optional, Sequence from pandas._config import config from pandas._libs.parsers import STR_NA_VALUES +from pandas._typing import Scalar, Union from pandas.errors import EmptyDataError from pandas.util._decorators import Appender, deprecate_nonkeyword_arguments @@ -383,9 +385,46 @@ def get_sheet_by_index(self, index): pass @abc.abstractmethod - def get_sheet_data(self, sheet, convert_float): + def get_sheet_data( + self, + sheet, + convert_float: bool, + header: Optional[Union[int, Sequence[int]]], + skiprows: Optional[Union[int, Sequence[int]]], + nrows: Optional[int], + ) -> List[List[Scalar]]: pass + def should_skip_row( + self, + index: int, + header: Optional[Union[int, Sequence[int]]], + skiprows: Optional[Union[int, Sequence[int]]], + nrows: Optional[int], + ) -> bool: + """ + Determines whether row should be skipped. + + Parameters + ---------- + index : int + Index of row. + header : int, list of int + Rows used as column labels. + skiprows : int, list of int + Rows to skip at the beginning. + nrows : int + Number of rows to parse. + + Returns + ------- + Bool determining if row should be skipped. 
+ """ + if nrows is not None and isinstance(header, int) and isinstance(skiprows, int): + if index < header + skiprows - 1: + return True + return False + def parse( self, sheet_name=0, @@ -439,7 +478,16 @@ def parse( else: # assume an integer if not a string sheet = self.get_sheet_by_index(asheetname) - data = self.get_sheet_data(sheet, convert_float) + gsd_header = 0 if header is None else header + gsd_skiprows = 0 if skiprows is None else skiprows + gsd_nrows = nrows if isinstance(nrows, int) else None + + if isinstance(gsd_header, list) or isinstance(gsd_skiprows, list): + gsd_nrows = None + + data = self.get_sheet_data( + sheet, convert_float, gsd_header, gsd_skiprows, gsd_nrows + ) usecols = _maybe_convert_usecols(usecols) if not data: diff --git a/pandas/io/excel/_odfreader.py b/pandas/io/excel/_odfreader.py index 739c77d1c0b99..e30562ca23c3c 100644 --- a/pandas/io/excel/_odfreader.py +++ b/pandas/io/excel/_odfreader.py @@ -1,6 +1,6 @@ -from typing import List +from typing import List, Optional, Sequence -from pandas._typing import FilePathOrBuffer, Scalar +from pandas._typing import FilePathOrBuffer, Scalar, Union from pandas.compat._optional import import_optional_dependency import pandas as pd @@ -63,7 +63,14 @@ def get_sheet_by_name(self, name: str): raise ValueError(f"sheet {name} not found") - def get_sheet_data(self, sheet, convert_float: bool) -> List[List[Scalar]]: + def get_sheet_data( + self, + sheet, + convert_float: bool, + header: Optional[Union[int, Sequence[int]]], + skiprows: Optional[Union[int, Sequence[int]]], + nrows: Optional[int], + ) -> List[List[Scalar]]: """ Parse an ODF Table into a list of lists """ @@ -79,7 +86,15 @@ def get_sheet_data(self, sheet, convert_float: bool) -> List[List[Scalar]]: table: List[List[Scalar]] = [] + if nrows is not None and isinstance(header, int) and isinstance(skiprows, int): + sheet_rows = sheet_rows[0 : header + skiprows + nrows + 1] + for i, sheet_row in enumerate(sheet_rows): + + if 
self.should_skip_row(i, header, skiprows, nrows): + table.append([]) + continue + sheet_cells = [x for x in sheet_row.childNodes if x.qname in cell_names] empty_cells = 0 table_row: List[Scalar] = [] diff --git a/pandas/io/excel/_openpyxl.py b/pandas/io/excel/_openpyxl.py index 0696d82e51f34..4cd9e07e1be12 100644 --- a/pandas/io/excel/_openpyxl.py +++ b/pandas/io/excel/_openpyxl.py @@ -1,8 +1,8 @@ -from typing import List +from typing import List, Optional, Sequence import numpy as np -from pandas._typing import FilePathOrBuffer, Scalar +from pandas._typing import FilePathOrBuffer, Scalar, Union from pandas.compat._optional import import_optional_dependency from pandas.io.excel._base import ExcelWriter, _BaseExcelReader @@ -529,9 +529,18 @@ def _convert_cell(self, cell, convert_float: bool) -> Scalar: return cell.value - def get_sheet_data(self, sheet, convert_float: bool) -> List[List[Scalar]]: + def get_sheet_data( + self, + sheet, + convert_float: bool, + header: Optional[Union[int, Sequence[int]]], + skiprows: Optional[Union[int, Sequence[int]]], + nrows: Optional[int], + ) -> List[List[Scalar]]: data: List[List[Scalar]] = [] + for row in sheet.rows: + data.append([self._convert_cell(cell, convert_float) for cell in row]) return data diff --git a/pandas/io/excel/_pyxlsb.py b/pandas/io/excel/_pyxlsb.py index 0d96c8c4acdb8..a7962572775b5 100644 --- a/pandas/io/excel/_pyxlsb.py +++ b/pandas/io/excel/_pyxlsb.py @@ -1,6 +1,6 @@ -from typing import List +from typing import List, Optional, Sequence -from pandas._typing import FilePathOrBuffer, Scalar +from pandas._typing import FilePathOrBuffer, Scalar, Union from pandas.compat._optional import import_optional_dependency from pandas.io.excel._base import _BaseExcelReader @@ -62,7 +62,14 @@ def _convert_cell(self, cell, convert_float: bool) -> Scalar: return cell.v - def get_sheet_data(self, sheet, convert_float: bool) -> List[List[Scalar]]: + def get_sheet_data( + self, + sheet, + convert_float: bool, + header: 
Optional[Union[int, Sequence[int]]], + skiprows: Optional[Union[int, Sequence[int]]], + nrows: Optional[int], + ) -> List[List[Scalar]]: return [ [self._convert_cell(c, convert_float) for c in r] for r in sheet.rows(sparse=False) diff --git a/pandas/io/excel/_xlrd.py b/pandas/io/excel/_xlrd.py index 8f7d3b1368fc7..61040c4158240 100644 --- a/pandas/io/excel/_xlrd.py +++ b/pandas/io/excel/_xlrd.py @@ -1,7 +1,9 @@ from datetime import time +from typing import List, Optional, Sequence import numpy as np +from pandas._typing import Scalar, Union from pandas.compat._optional import import_optional_dependency from pandas.io.excel._base import _BaseExcelReader @@ -46,7 +48,14 @@ def get_sheet_by_name(self, name): def get_sheet_by_index(self, index): return self.book.sheet_by_index(index) - def get_sheet_data(self, sheet, convert_float): + def get_sheet_data( + self, + sheet, + convert_float: bool, + header: Optional[Union[int, Sequence[int]]], + skiprows: Optional[Union[int, Sequence[int]]], + nrows: Optional[int], + ) -> List[List[Scalar]]: from xlrd import ( xldate, XL_CELL_DATE, @@ -95,9 +104,18 @@ def _parse_cell(cell_contents, cell_typ): cell_contents = val return cell_contents - data = [] + data: List[List[Scalar]] = [] + + sheet_nrows = sheet.nrows + if nrows is not None and isinstance(header, int) and isinstance(skiprows, int): + sheet_nrows = min(header + skiprows + nrows + 1, sheet_nrows) + + for i in range(sheet_nrows): + + if self.should_skip_row(i, header, skiprows, nrows): + data.append([]) + continue - for i in range(sheet.nrows): row = [ _parse_cell(value, typ) for value, typ in zip(sheet.row_values(i), sheet.row_types(i)) diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index 955db982f8300..5e685decfa21d 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -1143,3 +1143,18 @@ def test_header_with_index_col(self, engine, filename): filename, sheet_name="Sheet1", 
index_col=0, header=[0, 1] ) tm.assert_frame_equal(expected, result) + + @pytest.mark.parametrize("header, skiprows", [(1, 2), (0, 3)]) + @td.check_file_leaks + def test_header_skiprows_nrows(self, engine, read_ext, header, skiprows): + # GH 32727 + data = pd.read_excel("test1" + read_ext, engine=engine) + expected = ( + DataFrame(data.iloc[3:6]) + .reset_index(drop=True) + .rename(columns=data.iloc[2].rename(None)) + ) + actual = pd.read_excel( + "test1" + read_ext, engine=engine, header=header, skiprows=skiprows, nrows=3 + ) + tm.assert_frame_equal(expected, actual)