diff --git a/asv_bench/benchmarks/io/excel.py b/asv_bench/benchmarks/io/excel.py
index 3363b43f29b78..a2d989e787e0f 100644
--- a/asv_bench/benchmarks/io/excel.py
+++ b/asv_bench/benchmarks/io/excel.py
@@ -86,4 +86,15 @@ def time_read_excel(self, engine):
         read_excel(fname, engine=engine)
 
 
+class ReadExcelNRows(ReadExcel):
+    def time_read_excel(self, engine):
+        if engine == "xlrd":
+            fname = self.fname_excel_xls
+        elif engine == "odf":
+            fname = self.fname_odf
+        else:
+            fname = self.fname_excel
+        read_excel(fname, engine=engine, nrows=10)
+
+
 from ..pandas_vb_common import setup  # noqa: F401 isort:skip
diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst
index 2efc6c9167a83..3bb9a72d6e2f9 100644
--- a/doc/source/whatsnew/v1.5.0.rst
+++ b/doc/source/whatsnew/v1.5.0.rst
@@ -573,6 +573,7 @@ Performance improvements
 - Performance improvement when setting values in a pyarrow backed string array (:issue:`46400`)
 - Performance improvement in :func:`factorize` (:issue:`46109`)
 - Performance improvement in :class:`DataFrame` and :class:`Series` constructors for extension dtype scalars (:issue:`45854`)
+- Performance improvement in :func:`read_excel` when ``nrows`` argument provided (:issue:`32727`)
 
 .. ---------------------------------------------------------------------------
 .. _whatsnew_150.bug_fixes:
diff --git a/pandas/io/common.py b/pandas/io/common.py
index fdee1600c2a32..98964b100966f 100644
--- a/pandas/io/common.py
+++ b/pandas/io/common.py
@@ -28,6 +28,7 @@
     Generic,
     Literal,
     Mapping,
+    Sequence,
     TypeVar,
     cast,
     overload,
@@ -56,7 +57,12 @@
 from pandas.util._decorators import doc
 from pandas.util._exceptions import find_stack_level
 
-from pandas.core.dtypes.common import is_file_like
+from pandas.core.dtypes.common import (
+    is_bool,
+    is_file_like,
+    is_integer,
+    is_list_like,
+)
 
 from pandas.core.shared_docs import _shared_docs
 
@@ -177,12 +183,46 @@ def _expand_user(filepath_or_buffer: str | BaseBufferT) -> str | BaseBufferT:
 
 
 def validate_header_arg(header: object) -> None:
-    if isinstance(header, bool):
+    """
+    Validate a ``header`` argument.
+
+    None, a non-negative integer, or a list-like (sets excluded) of
+    non-negative integers is accepted.
+
+    Raises
+    ------
+    TypeError
+        If ``header`` is a bool.
+    ValueError
+        If ``header`` is a negative integer, a list-like containing a
+        non-integer or negative value, or any other type.
+    """
+    if header is None:
+        return
+    if is_integer(header):
+        header = cast(int, header)
+        if header < 0:
+            # GH 27779
+            raise ValueError(
+                "Passing negative integer to header is invalid. "
+                "For no header, use header=None instead"
+            )
+        return
+    if is_list_like(header, allow_sets=False):
+        header = cast(Sequence, header)
+        if not all(map(is_integer, header)):
+            raise ValueError("header must be integer or list of integers")
+        if any(i < 0 for i in header):
+            raise ValueError("cannot specify multi-index header with negative integers")
+        return
+    if is_bool(header):
         raise TypeError(
             "Passing a bool to header is invalid. Use header=None for no header or "
             "header=int or list-like of ints to specify "
             "the row(s) making up the column names"
         )
+    # GH 16338
+    raise ValueError("header must be integer or list of integers")
 
 
 @overload
diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py
index 030ae9fefda98..d20f347e54d6b 100644
--- a/pandas/io/excel/_base.py
+++ b/pandas/io/excel/_base.py
@@ -70,6 +70,7 @@
     pop_header_name,
 )
 from pandas.io.parsers import TextParser
+from pandas.io.parsers.readers import validate_integer
 
 _read_excel_doc = (
     """
@@ -563,7 +564,9 @@ def get_sheet_by_index(self, index: int):
         pass
 
     @abc.abstractmethod
-    def get_sheet_data(self, sheet, convert_float: bool):
+    def get_sheet_data(
+        self, sheet, convert_float: bool, file_rows_needed: int | None = None
+    ):
         pass
 
     def raise_if_bad_sheet_by_index(self, index: int) -> None:
@@ -577,6 +580,104 @@ def raise_if_bad_sheet_by_name(self, name: str) -> None:
         if name not in self.sheet_names:
             raise ValueError(f"Worksheet named '{name}' not found")
 
+    def _check_skiprows_func(
+        self,
+        skiprows: Callable,
+        rows_to_use: int,
+    ) -> int:
+        """
+        Determine how many file rows are required to obtain `nrows` data
+        rows when `skiprows` is a function.
+
+        Parameters
+        ----------
+        skiprows : function
+            The function passed to read_excel by the user.
+        rows_to_use : int
+            The number of rows that will be needed for the header and
+            the data.
+
+        Returns
+        -------
+        int
+
+        Notes
+        -----
+        Row indices are tested from 0 upwards until ``rows_to_use`` of them
+        are not skipped, so a ``skiprows`` that skips every row never lets
+        this loop finish.
+        """
+        i = 0
+        rows_used_so_far = 0
+        while rows_used_so_far < rows_to_use:
+            if not skiprows(i):
+                rows_used_so_far += 1
+            i += 1
+        return i
+
+    def _calc_rows(
+        self,
+        header: int | Sequence[int] | None,
+        index_col: int | Sequence[int] | None,
+        skiprows: Sequence[int] | int | Callable[[int], object] | None,
+        nrows: int | None,
+    ) -> int | None:
+        """
+        If nrows specified, find the number of rows needed from the
+        file, otherwise return None.
+
+        Parameters
+        ----------
+        header : int, list of int, or None
+            See read_excel docstring.
+        index_col : int, list of int, or None
+            See read_excel docstring.
+        skiprows : list-like, int, callable, or None
+            See read_excel docstring.
+        nrows : int or None
+            See read_excel docstring.
+
+        Returns
+        -------
+        int or None
+        """
+        if nrows is None:
+            return None
+        if header is None:
+            header_rows = 1
+        elif is_integer(header):
+            header = cast(int, header)
+            header_rows = 1 + header
+        else:
+            header = cast(Sequence, header)
+            header_rows = 1 + header[-1]
+        # If there is a MultiIndex header and an index then there is also
+        # a row containing just the index name(s)
+        if is_list_like(header) and index_col is not None:
+            header = cast(Sequence, header)
+            if len(header) > 1:
+                header_rows += 1
+        if skiprows is None:
+            return header_rows + nrows
+        if is_integer(skiprows):
+            skiprows = cast(int, skiprows)
+            return header_rows + nrows + skiprows
+        if is_list_like(skiprows):
+            # Build the lookup once so that testing each row index is O(1)
+            # instead of a scan of the whole list.
+            rows_to_skip = set(cast(Sequence, skiprows))
+            return self._check_skiprows_func(
+                lambda x: x in rows_to_skip, header_rows + nrows
+            )
+        if callable(skiprows):
+            return self._check_skiprows_func(
+                skiprows,
+                header_rows + nrows,
+            )
+        # else unexpected skiprows type: read_excel will not optimize
+        # the number of rows read from file
+        return None
+
     def parse(
         self,
         sheet_name: str | int | list[int] | list[str] | None = 0,
@@ -613,6 +714,9 @@ def parse(
             )
 
         validate_header_arg(header)
+        # validate_integer returns the validated value as an int (an integral
+        # float such as 3.0 is accepted), so keep it for the row arithmetic
+        nrows = validate_integer("nrows", nrows)
 
         ret_dict = False
 
@@ -643,7 +747,8 @@ def parse(
             else:  # assume an integer if not a string
                 sheet = self.get_sheet_by_index(asheetname)
 
-            data = self.get_sheet_data(sheet, convert_float)
+            file_rows_needed = self._calc_rows(header, index_col, skiprows, nrows)
+            data = self.get_sheet_data(sheet, convert_float, file_rows_needed)
             if hasattr(sheet, "close"):
                 # pyxlsb opens two TemporaryFiles
                 sheet.close()
diff --git a/pandas/io/excel/_odfreader.py b/pandas/io/excel/_odfreader.py
index 5a7e5b0d8d325..075590f3535fe 100644
--- a/pandas/io/excel/_odfreader.py
+++ b/pandas/io/excel/_odfreader.py
@@ -90,7 +90,7 @@ def get_sheet_by_name(self, name: str):
         raise ValueError(f"sheet {name} not found")
 
     def get_sheet_data(
-        self, sheet, convert_float: bool
+        self, sheet, convert_float: bool, file_rows_needed: int | None = None
     ) -> list[list[Scalar | NaTType]]:
         """
         Parse an ODF Table into a list of lists
@@ -148,6 +148,8 @@ def get_sheet_data(
                 empty_rows = 0
                 for _ in range(row_repeat):
                     table.append(table_row)
+            if file_rows_needed is not None and len(table) >= file_rows_needed:
+                break
 
         # Make our table square
         for row in table:
diff --git a/pandas/io/excel/_openpyxl.py b/pandas/io/excel/_openpyxl.py
index 6d70b3f319f37..8f4201d0befff 100644
--- a/pandas/io/excel/_openpyxl.py
+++ b/pandas/io/excel/_openpyxl.py
@@ -588,7 +588,9 @@ def _convert_cell(self, cell, convert_float: bool) -> Scalar:
 
         return cell.value
 
-    def get_sheet_data(self, sheet, convert_float: bool) -> list[list[Scalar]]:
+    def get_sheet_data(
+        self, sheet, convert_float: bool, file_rows_needed: int | None = None
+    ) -> list[list[Scalar]]:
 
         if self.book.read_only:
             sheet.reset_dimensions()
@@ -603,6 +605,8 @@ def get_sheet_data(self, sheet, convert_float: bool) -> list[list[Scalar]]:
             if converted_row:
                 last_row_with_data = row_number
             data.append(converted_row)
+            if file_rows_needed is not None and len(data) >= file_rows_needed:
+                break
 
         # Trim trailing empty rows
         data = data[: last_row_with_data + 1]
diff --git a/pandas/io/excel/_pyxlsb.py b/pandas/io/excel/_pyxlsb.py
index 36e2645560078..5d40ccdf2f8f3 100644
--- a/pandas/io/excel/_pyxlsb.py
+++ b/pandas/io/excel/_pyxlsb.py
@@ -79,7 +79,12 @@ def _convert_cell(self, cell, convert_float: bool) -> Scalar:
 
         return cell.v
 
-    def get_sheet_data(self, sheet, convert_float: bool) -> list[list[Scalar]]:
+    def get_sheet_data(
+        self,
+        sheet,
+        convert_float: bool,
+        file_rows_needed: int | None = None,
+    ) -> list[list[Scalar]]:
         data: list[list[Scalar]] = []
         prevous_row_number = -1
         # When sparse=True the rows can have different lengths and empty rows are
@@ -94,6 +99,8 @@ def get_sheet_data(self, sheet, convert_float: bool) -> list[list[Scalar]]:
                 data.extend([[]] * (row_number - prevous_row_number - 1))
                 data.append(converted_row)
                 prevous_row_number = row_number
+            if file_rows_needed is not None and len(data) >= file_rows_needed:
+                break
         if data:
             # extend rows to max_width
             max_width = max(len(data_row) for data_row in data)
diff --git a/pandas/io/excel/_xlrd.py b/pandas/io/excel/_xlrd.py
index f38a05e7a4e64..0bf3ac6134cf6 100644
--- a/pandas/io/excel/_xlrd.py
+++ b/pandas/io/excel/_xlrd.py
@@ -1,8 +1,13 @@
+from __future__ import annotations
+
 from datetime import time
 
 import numpy as np
 
-from pandas._typing import StorageOptions
+from pandas._typing import (
+    Scalar,
+    StorageOptions,
+)
 from pandas.compat._optional import import_optional_dependency
 from pandas.util._decorators import doc
 
@@ -56,7 +61,9 @@ def get_sheet_by_index(self, index):
         self.raise_if_bad_sheet_by_index(index)
         return self.book.sheet_by_index(index)
 
-    def get_sheet_data(self, sheet, convert_float):
+    def get_sheet_data(
+        self, sheet, convert_float: bool, file_rows_needed: int | None = None
+    ) -> list[list[Scalar]]:
         from xlrd import (
             XL_CELL_BOOLEAN,
             XL_CELL_DATE,
@@ -107,7 +114,10 @@ def _parse_cell(cell_contents, cell_typ):
 
         data = []
 
-        for i in range(sheet.nrows):
+        nrows = sheet.nrows
+        if file_rows_needed is not None:
+            nrows = min(nrows, file_rows_needed)
+        for i in range(nrows):
             row = [
                 _parse_cell(value, typ)
                 for value, typ in zip(sheet.row_values(i), sheet.row_types(i))
diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py
index 2851ea36c8a33..e9c39d5ff1996 100644
--- a/pandas/io/parsers/base_parser.py
+++ b/pandas/io/parsers/base_parser.py
@@ -120,13 +120,7 @@ def __init__(self, kwds) -> None:
 
         # validate header options for mi
         self.header = kwds.get("header")
-        if isinstance(self.header, (list, tuple, np.ndarray)):
-            if not all(map(is_integer, self.header)):
-                raise ValueError("header must be integer or list of integers")
-            if any(i < 0 for i in self.header):
-                raise ValueError(
-                    "cannot specify multi-index header with negative integers"
-                )
+        if is_list_like(self.header, allow_sets=False):
             if kwds.get("usecols"):
                 raise ValueError(
                     "cannot specify usecols when specifying a multi-index header"
@@ -138,9 +132,8 @@ def __init__(self, kwds) -> None:
 
             # validate index_col that only contains integers
             if self.index_col is not None:
-                is_sequence = isinstance(self.index_col, (list, tuple, np.ndarray))
                 if not (
-                    is_sequence
+                    is_list_like(self.index_col, allow_sets=False)
                     and all(map(is_integer, self.index_col))
                     or is_integer(self.index_col)
                 ):
@@ -148,21 +141,11 @@ def __init__(self, kwds) -> None:
                         "index_col must only contain row numbers "
                         "when specifying a multi-index header"
                     )
-        elif self.header is not None:
+        elif self.header is not None and self.prefix is not None:
             # GH 27394
-            if self.prefix is not None:
-                raise ValueError(
-                    "Argument prefix must be None if argument header is not None"
-                )
-            # GH 16338
-            elif not is_integer(self.header):
-                raise ValueError("header must be integer or list of integers")
-            # GH 27779
-            elif self.header < 0:
-                raise ValueError(
-                    "Passing negative integer to header is invalid. "
-                    "For no header, use header=None instead"
-                )
+            raise ValueError(
+                "Argument prefix must be None if argument header is not None"
+            )
 
         self._name_processed = False
 
diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py
index 1e0f74ea41453..c58896e9e1baf 100644
--- a/pandas/tests/io/excel/test_readers.py
+++ b/pandas/tests/io/excel/test_readers.py
@@ -1219,6 +1219,48 @@ def test_read_excel_nrows_non_integer_parameter(self, read_ext):
         with pytest.raises(ValueError, match=msg):
             pd.read_excel("test1" + read_ext, nrows="5")
 
+    @pytest.mark.parametrize(
+        "filename,sheet_name,header,index_col,skiprows",
+        [
+            ("testmultiindex", "mi_column", [0, 1], 0, None),
+            ("testmultiindex", "mi_index", None, [0, 1], None),
+            ("testmultiindex", "both", [0, 1], [0, 1], None),
+            ("testmultiindex", "mi_column_name", [0, 1], 0, None),
+            ("testskiprows", "skiprows_list", None, None, [0, 2]),
+            ("testskiprows", "skiprows_list", None, None, lambda x: x == 0 or x == 2),
+        ],
+    )
+    def test_read_excel_nrows_params(
+        self, read_ext, filename, sheet_name, header, index_col, skiprows
+    ):
+        """
+        For various parameters, we should get the same result whether we
+        limit the rows during load (nrows=3) or after (df.iloc[:3]).
+        """
+        # GH 46894
+        expected = pd.read_excel(
+            filename + read_ext,
+            sheet_name=sheet_name,
+            header=header,
+            index_col=index_col,
+            skiprows=skiprows,
+        ).iloc[:3]
+        actual = pd.read_excel(
+            filename + read_ext,
+            sheet_name=sheet_name,
+            header=header,
+            index_col=index_col,
+            skiprows=skiprows,
+            nrows=3,
+        )
+        tm.assert_frame_equal(actual, expected)
+
+    def test_read_excel_nrows_integral_float(self, read_ext):
+        # an integral float must limit the rows exactly like the equal int
+        expected = pd.read_excel("test1" + read_ext, nrows=5)
+        actual = pd.read_excel("test1" + read_ext, nrows=5.0)
+        tm.assert_frame_equal(actual, expected)
+
     def test_read_excel_squeeze(self, read_ext):
         # GH 12157
         f = "test_squeeze" + read_ext