pandas-dev · WillAyd · Sep 21, 2020 · Mar 27, 2020 · Apr 4, 2020 · Apr 4, 2020
diff --git a/asv_bench/benchmarks/io/excel.py b/asv_bench/benchmarks/io/excel.py
@@ -11,7 +11,7 @@
 
 
 def _generate_dataframe():
-    N = 2000
+    N = 20000
     C = 5
     df = DataFrame(
         np.random.randn(N, C),
@@ -69,5 +69,9 @@ def time_read_excel(self, engine):
         fname = self.fname_odf if engine == "odf" else self.fname_excel
         read_excel(fname, engine=engine)
 
+    def time_read_excel_nrows(self, engine):
+        fname = self.fname_odf if engine == "odf" else self.fname_excel
+        read_excel(fname, engine=engine, nrows=1)
+
 
 from ..pandas_vb_common import setup  # noqa: F401 isort:skip
diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst
@@ -869,6 +869,7 @@ Performance improvements
 - Performance improvement in arithmetic operations (``sub``, ``add``, ``mul``, ``div``) for :class:`MultiIndex` (:issue:`34297`)
 - Performance improvement in ``DataFrame[bool_indexer]`` when ``bool_indexer`` is a ``list`` (:issue:`33924`)
 - Significant performance improvement of :meth:`io.formats.style.Styler.render` with styles added with various ways such as :meth:`io.formats.style.Styler.apply`, :meth:`io.formats.style.Styler.applymap` or :meth:`io.formats.style.Styler.bar` (:issue:`19917`)
+- Performance improvement in :func:`read_excel` for integer ``header``, ``skiprows``, and ``nrows`` (:issue:`33281`)
 
 .. ---------------------------------------------------------------------------
 

diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py
@@ -3,12 +3,12 @@
 from io import BufferedIOBase, BytesIO, RawIOBase
 import os
 from textwrap import fill
-from typing import Any, Mapping, Union
+from typing import Any, List, Mapping, Optional, Sequence, Union
 
 from pandas._config import config
 
 from pandas._libs.parsers import STR_NA_VALUES
-from pandas._typing import StorageOptions
+from pandas._typing import Scalar, StorageOptions
 from pandas.errors import EmptyDataError
 from pandas.util._decorators import Appender, deprecate_nonkeyword_arguments
 
@@ -397,9 +397,47 @@ def get_sheet_by_index(self, index):
         pass
 
     @abc.abstractmethod
-    def get_sheet_data(self, sheet, convert_float):
+    def get_sheet_data(
+        self,
+        sheet,
+        convert_float: bool,
+        header: Optional[Union[int, Sequence[int]]],
+        skiprows: Optional[Union[int, Sequence[int]]],
+        nrows: Optional[int],
+    ) -> List[List[Scalar]]:
         pass
 
+    def should_skip_row(
+        self,
+        index: int,
+        header: Optional[Union[int, Sequence[int]]],
+        skiprows: Optional[Union[int, Sequence[int]]],
+        nrows: Optional[int],
+    ) -> bool:
+        """
+        Determines whether row should be skipped.
+
+        Parameters
+        ----------
+        index : int
+            Index of row.
+        header : int or sequence of int, optional
+            Rows used as column labels.
+        skiprows : int or sequence of int, optional
+            Rows to skip at the beginning.
+        nrows : int, optional
+            Number of rows to parse.
+
+        Returns
+        -------
+        bool
+            True if the row should be skipped, False otherwise.
+        """
+        if nrows is not None and isinstance(header, int) and isinstance(skiprows, int):
+            if index < header + skiprows - 1:
+                return True
+        return False
+
     def parse(
         self,
         sheet_name=0,
@@ -453,7 +491,20 @@ def parse(
             else:  # assume an integer if not a string
                 sheet = self.get_sheet_by_index(asheetname)
 
-            data = self.get_sheet_data(sheet, convert_float)
+            get_sheet_data_header = 0 if header is None else header
+            get_sheet_data_skiprows = 0 if skiprows is None else skiprows
+            get_sheet_data_nrows = nrows if isinstance(nrows, int) else None
+            if isinstance(get_sheet_data_header, list) or isinstance(
+                get_sheet_data_skiprows, list
+            ):
+                get_sheet_data_nrows = None
+            data = self.get_sheet_data(
+                sheet,
+                convert_float,
+                get_sheet_data_header,
+                get_sheet_data_skiprows,
+                get_sheet_data_nrows,
+            )
             usecols = _maybe_convert_usecols(usecols)
 
             if not data:

diff --git a/pandas/io/excel/_odfreader.py b/pandas/io/excel/_odfreader.py
@@ -1,8 +1,8 @@
-from typing import List, cast
+from typing import List, Optional, Sequence, cast
 
 import numpy as np
 
-from pandas._typing import FilePathOrBuffer, Scalar, StorageOptions
+from pandas._typing import FilePathOrBuffer, Scalar, StorageOptions, Union
 from pandas.compat._optional import import_optional_dependency
 
 import pandas as pd
@@ -71,7 +71,14 @@ def get_sheet_by_name(self, name: str):
 
         raise ValueError(f"sheet {name} not found")
 
-    def get_sheet_data(self, sheet, convert_float: bool) -> List[List[Scalar]]:
+    def get_sheet_data(
+        self,
+        sheet,
+        convert_float: bool,
+        header: Optional[Union[int, Sequence[int]]],
+        skiprows: Optional[Union[int, Sequence[int]]],
+        nrows: Optional[int],
+    ) -> List[List[Scalar]]:
         """
         Parse an ODF Table into a list of lists
         """
@@ -87,7 +94,12 @@ def get_sheet_data(self, sheet, convert_float: bool) -> List[List[Scalar]]:
 
         table: List[List[Scalar]] = []
 
+        if nrows is not None and isinstance(header, int) and isinstance(skiprows, int):
+            sheet_rows = sheet_rows[: header + skiprows + nrows + 1]
         for i, sheet_row in enumerate(sheet_rows):
+            if self.should_skip_row(i, header, skiprows, nrows):
+                table.append([])
+                continue
             sheet_cells = [x for x in sheet_row.childNodes if x.qname in cell_names]
             empty_cells = 0
             table_row: List[Scalar] = []

diff --git a/pandas/io/excel/_openpyxl.py b/pandas/io/excel/_openpyxl.py
@@ -1,8 +1,8 @@
-from typing import List
+from typing import List, Optional, Sequence
 
 import numpy as np
 
-from pandas._typing import FilePathOrBuffer, Scalar, StorageOptions
+from pandas._typing import FilePathOrBuffer, Scalar, StorageOptions, Union
 from pandas.compat._optional import import_optional_dependency
 
 from pandas.io.excel._base import ExcelWriter, _BaseExcelReader
@@ -535,7 +535,14 @@ def _convert_cell(self, cell, convert_float: bool) -> Scalar:
 
         return cell.value
 
-    def get_sheet_data(self, sheet, convert_float: bool) -> List[List[Scalar]]:
+    def get_sheet_data(
+        self,
+        sheet,
+        convert_float: bool,
+        header: Optional[Union[int, Sequence[int]]],
+        skiprows: Optional[Union[int, Sequence[int]]],
+        nrows: Optional[int],
+    ) -> List[List[Scalar]]:
         data: List[List[Scalar]] = []
         for row in sheet.rows:
             data.append([self._convert_cell(cell, convert_float) for cell in row])

diff --git a/pandas/io/excel/_pyxlsb.py b/pandas/io/excel/_pyxlsb.py
@@ -1,6 +1,6 @@
-from typing import List
+from typing import List, Optional, Sequence
 
-from pandas._typing import FilePathOrBuffer, Scalar, StorageOptions
+from pandas._typing import FilePathOrBuffer, Scalar, StorageOptions, Union
 from pandas.compat._optional import import_optional_dependency
 
 from pandas.io.excel._base import _BaseExcelReader
@@ -68,7 +68,14 @@ def _convert_cell(self, cell, convert_float: bool) -> Scalar:
 
         return cell.v
 
-    def get_sheet_data(self, sheet, convert_float: bool) -> List[List[Scalar]]:
+    def get_sheet_data(
+        self,
+        sheet,
+        convert_float: bool,
+        header: Optional[Union[int, Sequence[int]]],
+        skiprows: Optional[Union[int, Sequence[int]]],
+        nrows: Optional[int],
+    ) -> List[List[Scalar]]:
         return [
             [self._convert_cell(c, convert_float) for c in r]
             for r in sheet.rows(sparse=False)

diff --git a/pandas/io/excel/_xlrd.py b/pandas/io/excel/_xlrd.py
@@ -1,8 +1,9 @@
 from datetime import time
+from typing import List, Optional, Sequence
 
 import numpy as np
 
-from pandas._typing import StorageOptions
+from pandas._typing import Scalar, StorageOptions, Union
 from pandas.compat._optional import import_optional_dependency
 
 from pandas.io.excel._base import _BaseExcelReader
@@ -49,7 +50,14 @@ def get_sheet_by_name(self, name):
     def get_sheet_by_index(self, index):
         return self.book.sheet_by_index(index)
 
-    def get_sheet_data(self, sheet, convert_float):
+    def get_sheet_data(
+        self,
+        sheet,
+        convert_float: bool,
+        header: Optional[Union[int, Sequence[int]]],
+        skiprows: Optional[Union[int, Sequence[int]]],
+        nrows: Optional[int],
+    ) -> List[List[Scalar]]:
         from xlrd import (
             XL_CELL_BOOLEAN,
             XL_CELL_DATE,
@@ -98,9 +106,16 @@ def _parse_cell(cell_contents, cell_typ):
                     cell_contents = val
             return cell_contents
 
-        data = []
+        data: List[List[Scalar]] = []
 
-        for i in range(sheet.nrows):
+        sheet_nrows = sheet.nrows
+        if nrows is not None and isinstance(header, int) and isinstance(skiprows, int):
+            sheet_nrows = min(header + skiprows + nrows + 1, sheet_nrows)
+
+        for i in range(sheet_nrows):
+            if self.should_skip_row(i, header, skiprows, nrows):
+                data.append([])
+                continue
             row = [
                 _parse_cell(value, typ)
                 for value, typ in zip(sheet.row_values(i), sheet.row_types(i))

diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py
@@ -1153,5 +1153,19 @@ def test_read_datetime_multiindex(self, engine, read_ext):
             ],
         )
         expected = pd.DataFrame([], columns=expected_column_index)
+        tm.assert_frame_equal(expected, actual)
 
+    @pytest.mark.parametrize("header, skiprows", [(1, 2), (0, 3)])
+    @td.check_file_leaks
+    def test_header_skiprows_nrows(self, engine, read_ext, header, skiprows):
+        # GH 32727
+        data = pd.read_excel("test1" + read_ext, engine=engine)
+        expected = (
+            DataFrame(data.iloc[3:6])
+            .reset_index(drop=True)
+            .rename(columns=data.iloc[2].rename(None))
+        )
+        actual = pd.read_excel(
+            "test1" + read_ext, engine=engine, header=header, skiprows=skiprows, nrows=3
+        )
         tm.assert_frame_equal(expected, actual)