@@ -780,143 +780,195 @@ def parse(
780
780
output [asheetname ] = DataFrame ()
781
781
continue
782
782
783
- is_list_header = False
784
- is_len_one_list_header = False
785
- if is_list_like (header ):
786
- assert isinstance (header , Sequence )
787
- is_list_header = True
788
- if len (header ) == 1 :
789
- is_len_one_list_header = True
790
-
791
- if is_len_one_list_header :
792
- header = cast (Sequence [int ], header )[0 ]
793
-
794
- # forward fill and pull out names for MultiIndex column
795
- header_names = None
796
- if header is not None and is_list_like (header ):
797
- assert isinstance (header , Sequence )
798
-
799
- header_names = []
800
- control_row = [True ] * len (data [0 ])
801
-
802
- for row in header :
803
- if is_integer (skiprows ):
804
- assert isinstance (skiprows , int )
805
- row += skiprows
806
-
807
- if row > len (data ) - 1 :
808
- raise ValueError (
809
- f"header index { row } exceeds maximum index "
810
- f"{ len (data ) - 1 } of data." ,
811
- )
812
-
813
- data [row ], control_row = fill_mi_header (data [row ], control_row )
814
-
815
- if index_col is not None :
816
- header_name , _ = pop_header_name (data [row ], index_col )
817
- header_names .append (header_name )
818
-
819
- # If there is a MultiIndex header and an index then there is also
820
- # a row containing just the index name(s)
821
- has_index_names = False
822
- if is_list_header and not is_len_one_list_header and index_col is not None :
823
- index_col_list : Sequence [int ]
824
- if isinstance (index_col , int ):
825
- index_col_list = [index_col ]
826
- else :
827
- assert isinstance (index_col , Sequence )
828
- index_col_list = index_col
829
-
830
- # We have to handle mi without names. If any of the entries in the data
831
- # columns are not empty, this is a regular row
832
- assert isinstance (header , Sequence )
833
- if len (header ) < len (data ):
834
- potential_index_names = data [len (header )]
835
- potential_data = [
836
- x
837
- for i , x in enumerate (potential_index_names )
838
- if not control_row [i ] and i not in index_col_list
839
- ]
840
- has_index_names = all (x == "" or x is None for x in potential_data )
841
-
842
- if is_list_like (index_col ):
843
- # Forward fill values for MultiIndex index.
844
- if header is None :
845
- offset = 0
846
- elif isinstance (header , int ):
847
- offset = 1 + header
848
- else :
849
- offset = 1 + max (header )
783
+ output = self ._parse_sheet (
784
+ data = data ,
785
+ output = output ,
786
+ asheetname = asheetname ,
787
+ header = header ,
788
+ names = names ,
789
+ index_col = index_col ,
790
+ usecols = usecols ,
791
+ dtype = dtype ,
792
+ skiprows = skiprows ,
793
+ nrows = nrows ,
794
+ true_values = true_values ,
795
+ false_values = false_values ,
796
+ na_values = na_values ,
797
+ parse_dates = parse_dates ,
798
+ date_parser = date_parser ,
799
+ date_format = date_format ,
800
+ thousands = thousands ,
801
+ decimal = decimal ,
802
+ comment = comment ,
803
+ skipfooter = skipfooter ,
804
+ dtype_backend = dtype_backend ,
805
+ ** kwds ,
806
+ )
850
807
851
- # GH34673: if MultiIndex names present and not defined in the header,
852
- # offset needs to be incremented so that forward filling starts
853
- # from the first MI value instead of the name
854
- if has_index_names :
855
- offset += 1
808
+ if last_sheetname is None :
809
+ raise ValueError ("Sheet name is an empty list" )
856
810
857
- # Check if we have an empty dataset
858
- # before trying to collect data.
859
- if offset < len ( data ) :
860
- assert isinstance ( index_col , Sequence )
811
+ if ret_dict :
812
+ return output
813
+ else :
814
+ return output [ last_sheetname ]
861
815
862
- for col in index_col :
863
- last = data [offset ][col ]
816
+ def _parse_sheet (
817
+ self ,
818
+ data : list ,
819
+ output : dict ,
820
+ asheetname : str | int | None = None ,
821
+ header : int | Sequence [int ] | None = 0 ,
822
+ names : SequenceNotStr [Hashable ] | range | None = None ,
823
+ index_col : int | Sequence [int ] | None = None ,
824
+ usecols = None ,
825
+ dtype : DtypeArg | None = None ,
826
+ skiprows : Sequence [int ] | int | Callable [[int ], object ] | None = None ,
827
+ nrows : int | None = None ,
828
+ true_values : Iterable [Hashable ] | None = None ,
829
+ false_values : Iterable [Hashable ] | None = None ,
830
+ na_values = None ,
831
+ parse_dates : list | dict | bool = False ,
832
+ date_parser : Callable | lib .NoDefault = lib .no_default ,
833
+ date_format : dict [Hashable , str ] | str | None = None ,
834
+ thousands : str | None = None ,
835
+ decimal : str = "." ,
836
+ comment : str | None = None ,
837
+ skipfooter : int = 0 ,
838
+ dtype_backend : DtypeBackend | lib .NoDefault = lib .no_default ,
839
+ ** kwds ,
840
+ ):
841
+ is_list_header = False
842
+ is_len_one_list_header = False
843
+ if is_list_like (header ):
844
+ assert isinstance (header , Sequence )
845
+ is_list_header = True
846
+ if len (header ) == 1 :
847
+ is_len_one_list_header = True
848
+
849
+ if is_len_one_list_header :
850
+ header = cast (Sequence [int ], header )[0 ]
851
+
852
+ # forward fill and pull out names for MultiIndex column
853
+ header_names = None
854
+ if header is not None and is_list_like (header ):
855
+ assert isinstance (header , Sequence )
856
+
857
+ header_names = []
858
+ control_row = [True ] * len (data [0 ])
859
+
860
+ for row in header :
861
+ if is_integer (skiprows ):
862
+ assert isinstance (skiprows , int )
863
+ row += skiprows
864
+
865
+ if row > len (data ) - 1 :
866
+ raise ValueError (
867
+ f"header index { row } exceeds maximum index "
868
+ f"{ len (data ) - 1 } of data." ,
869
+ )
864
870
865
- for row in range (offset + 1 , len (data )):
866
- if data [row ][col ] == "" or data [row ][col ] is None :
867
- data [row ][col ] = last
868
- else :
869
- last = data [row ][col ]
871
+ data [row ], control_row = fill_mi_header (data [row ], control_row )
870
872
871
- # GH 12292 : error when read one empty column from excel file
872
- try :
873
- parser = TextParser (
874
- data ,
875
- names = names ,
876
- header = header ,
877
- index_col = index_col ,
878
- has_index_names = has_index_names ,
879
- dtype = dtype ,
880
- true_values = true_values ,
881
- false_values = false_values ,
882
- skiprows = skiprows ,
883
- nrows = nrows ,
884
- na_values = na_values ,
885
- skip_blank_lines = False , # GH 39808
886
- parse_dates = parse_dates ,
887
- date_parser = date_parser ,
888
- date_format = date_format ,
889
- thousands = thousands ,
890
- decimal = decimal ,
891
- comment = comment ,
892
- skipfooter = skipfooter ,
893
- usecols = usecols ,
894
- dtype_backend = dtype_backend ,
895
- ** kwds ,
896
- )
873
+ if index_col is not None :
874
+ header_name , _ = pop_header_name (data [row ], index_col )
875
+ header_names .append (header_name )
897
876
898
- output [asheetname ] = parser .read (nrows = nrows )
877
+ # If there is a MultiIndex header and an index then there is also
878
+ # a row containing just the index name(s)
879
+ has_index_names = False
880
+ if is_list_header and not is_len_one_list_header and index_col is not None :
881
+ index_col_list : Sequence [int ]
882
+ if isinstance (index_col , int ):
883
+ index_col_list = [index_col ]
884
+ else :
885
+ assert isinstance (index_col , Sequence )
886
+ index_col_list = index_col
887
+
888
+ # We have to handle mi without names. If any of the entries in the data
889
+ # columns are not empty, this is a regular row
890
+ assert isinstance (header , Sequence )
891
+ if len (header ) < len (data ):
892
+ potential_index_names = data [len (header )]
893
+ potential_data = [
894
+ x
895
+ for i , x in enumerate (potential_index_names )
896
+ if not control_row [i ] and i not in index_col_list
897
+ ]
898
+ has_index_names = all (x == "" or x is None for x in potential_data )
899
+
900
+ if is_list_like (index_col ):
901
+ # Forward fill values for MultiIndex index.
902
+ if header is None :
903
+ offset = 0
904
+ elif isinstance (header , int ):
905
+ offset = 1 + header
906
+ else :
907
+ offset = 1 + max (header )
908
+
909
+ # GH34673: if MultiIndex names present and not defined in the header,
910
+ # offset needs to be incremented so that forward filling starts
911
+ # from the first MI value instead of the name
912
+ if has_index_names :
913
+ offset += 1
914
+
915
+ # Check if we have an empty dataset
916
+ # before trying to collect data.
917
+ if offset < len (data ):
918
+ assert isinstance (index_col , Sequence )
919
+
920
+ for col in index_col :
921
+ last = data [offset ][col ]
922
+
923
+ for row in range (offset + 1 , len (data )):
924
+ if data [row ][col ] == "" or data [row ][col ] is None :
925
+ data [row ][col ] = last
926
+ else :
927
+ last = data [row ][col ]
928
+
929
+ # GH 12292 : error when read one empty column from excel file
930
+ try :
931
+ parser = TextParser (
932
+ data ,
933
+ names = names ,
934
+ header = header ,
935
+ index_col = index_col ,
936
+ has_index_names = has_index_names ,
937
+ dtype = dtype ,
938
+ true_values = true_values ,
939
+ false_values = false_values ,
940
+ skiprows = skiprows ,
941
+ nrows = nrows ,
942
+ na_values = na_values ,
943
+ skip_blank_lines = False , # GH 39808
944
+ parse_dates = parse_dates ,
945
+ date_parser = date_parser ,
946
+ date_format = date_format ,
947
+ thousands = thousands ,
948
+ decimal = decimal ,
949
+ comment = comment ,
950
+ skipfooter = skipfooter ,
951
+ usecols = usecols ,
952
+ dtype_backend = dtype_backend ,
953
+ ** kwds ,
954
+ )
899
955
900
- if header_names :
901
- output [asheetname ].columns = output [asheetname ].columns .set_names (
902
- header_names
903
- )
956
+ output [asheetname ] = parser .read (nrows = nrows )
904
957
905
- except EmptyDataError :
906
- # No Data, return an empty DataFrame
907
- output [asheetname ] = DataFrame ()
958
+ if header_names :
959
+ output [asheetname ].columns = output [asheetname ].columns .set_names (
960
+ header_names
961
+ )
908
962
909
- except Exception as err :
910
- err . args = ( f" { err . args [ 0 ] } (sheet: { asheetname } )" , * err . args [ 1 :])
911
- raise err
963
+ except EmptyDataError :
964
+ # No Data, return an empty DataFrame
965
+ output [ asheetname ] = DataFrame ()
912
966
913
- if last_sheetname is None :
914
- raise ValueError ("Sheet name is an empty list" )
967
+ except Exception as err :
968
+ err .args = (f"{ err .args [0 ]} (sheet: { asheetname } )" , * err .args [1 :])
969
+ raise err
915
970
916
- if ret_dict :
917
- return output
918
- else :
919
- return output [last_sheetname ]
971
+ return output
920
972
921
973
922
974
@doc (storage_options = _shared_docs ["storage_options" ])
0 commit comments