2
2
3
3
import abc
4
4
import datetime
5
+ from functools import partial
5
6
from io import BytesIO
6
7
import os
7
8
from textwrap import fill
70
71
pop_header_name ,
71
72
)
72
73
from pandas .io .parsers import TextParser
74
+ from pandas .io .parsers .readers import validate_integer
73
75
74
76
_read_excel_doc = (
75
77
"""
@@ -563,7 +565,7 @@ def get_sheet_by_index(self, index: int):
563
565
pass
564
566
565
567
@abc.abstractmethod
def get_sheet_data(self, sheet, convert_float: bool, rows: int | None = None):
    """
    Return the data of ``sheet``; must be implemented by each reader.

    Parameters
    ----------
    sheet : object
        The sheet object, as obtained from ``get_sheet_by_name`` or
        ``get_sheet_by_index``.
    convert_float : bool
        Forwarded unchanged from ``parse``.
    rows : int or None, default None
        The number of file rows needed, as computed by ``_calc_rows``.
        ``None`` means no row count was computed (for example because
        ``nrows`` was not given).

    Notes
    -----
    NOTE(review): the exact return type is defined by the concrete
    subclasses and is not visible here -- confirm against them.
    """
    pass
568
570
569
571
def raise_if_bad_sheet_by_index (self , index : int ) -> None :
@@ -577,6 +579,99 @@ def raise_if_bad_sheet_by_name(self, name: str) -> None:
577
579
if name not in self .sheet_names :
578
580
raise ValueError (f"Worksheet named '{ name } ' not found" )
579
581
582
+ def _check_skiprows_func (
583
+ self ,
584
+ skiprows : Callable ,
585
+ rows_to_use : int ,
586
+ ) -> int :
587
+ """
588
+ Determine how many file rows are required to obtain `nrows` data
589
+ rows when `skiprows` is a function.
590
+
591
+ Parameters
592
+ ----------
593
+ skiprows : function
594
+ The function passed to read_excel by the user.
595
+ rows_to_use : int
596
+ The number of rows that will be needed for the header and
597
+ the data.
598
+
599
+ Returns
600
+ -------
601
+ int
602
+ """
603
+ i = 0
604
+ rows_used_so_far = 0
605
+ while rows_used_so_far < rows_to_use :
606
+ if not skiprows (i ):
607
+ rows_used_so_far += 1
608
+ i += 1
609
+ return i
610
+
611
+ def _calc_rows (
612
+ self ,
613
+ header : int | Sequence [int ] | None ,
614
+ index_col : int | Sequence [int ] | None ,
615
+ skiprows : Sequence [int ] | int | Callable [[int ], object ] | None ,
616
+ nrows : int | None ,
617
+ ) -> int | None :
618
+ """
619
+ If nrows specified, find the number of rows needed from the
620
+ file, otherwise return None.
621
+
622
+
623
+ Parameters
624
+ ----------
625
+ header : int, list of int, or None
626
+ See read_excel docstring.
627
+ index_col : int, list of int, or None
628
+ See read_excel docstring.
629
+ skiprows : list-like, int, callable, or None
630
+ See read_excel docstring.
631
+ nrows : int or None
632
+ See read_excel docstring.
633
+
634
+ Returns
635
+ -------
636
+ int or None
637
+ """
638
+ if nrows is None :
639
+ return None
640
+ if header is None :
641
+ header_rows = 1
642
+ elif is_integer (header ):
643
+ header = cast (int , header )
644
+ header_rows = 1 + header
645
+ else :
646
+ header = cast (Sequence , header )
647
+ header_rows = 1 + header [- 1 ]
648
+ # If there is a MultiIndex header and an index then there is also
649
+ # a row containing just the index name(s)
650
+ if is_list_like (header ) and index_col is not None :
651
+ header = cast (Sequence , header )
652
+ if len (header ) > 1 :
653
+ header_rows += 1
654
+ if skiprows is None :
655
+ return header_rows + nrows
656
+ if is_integer (skiprows ):
657
+ skiprows = cast (int , skiprows )
658
+ return header_rows + nrows + skiprows
659
+ if is_list_like (skiprows ):
660
+
661
+ def f (skiprows : Sequence , x : int ) -> bool :
662
+ return x in skiprows
663
+
664
+ skiprows = cast (Sequence , skiprows )
665
+ return self ._check_skiprows_func (partial (f , skiprows ), header_rows + nrows )
666
+ if callable (skiprows ):
667
+ return self ._check_skiprows_func (
668
+ skiprows ,
669
+ header_rows + nrows ,
670
+ )
671
+ # else unexpected skiprows type: read_excel will not optimize
672
+ # the number of rows read from file
673
+ return None
674
+
580
675
def parse (
581
676
self ,
582
677
sheet_name : str | int | list [int ] | list [str ] | None = 0 ,
@@ -613,6 +708,7 @@ def parse(
613
708
)
614
709
615
710
validate_header_arg (header )
711
+ validate_integer ("nrows" , nrows )
616
712
617
713
ret_dict = False
618
714
@@ -643,7 +739,8 @@ def parse(
643
739
else : # assume an integer if not a string
644
740
sheet = self .get_sheet_by_index (asheetname )
645
741
646
- data = self .get_sheet_data (sheet , convert_float )
742
+ file_rows_needed = self ._calc_rows (header , index_col , skiprows , nrows )
743
+ data = self .get_sheet_data (sheet , convert_float , file_rows_needed )
647
744
if hasattr (sheet , "close" ):
648
745
# pyxlsb opens two TemporaryFiles
649
746
sheet .close ()
0 commit comments