Skip to content

Commit f6932cb

Browse files
authored
REF: Read excel parse refactor (#58497)
1 parent 7320430 commit f6932cb

File tree

1 file changed

+178
-126
lines changed

1 file changed

+178
-126
lines changed

pandas/io/excel/_base.py

+178-126
Original file line numberDiff line numberDiff line change
@@ -780,143 +780,195 @@ def parse(
780780
output[asheetname] = DataFrame()
781781
continue
782782

783-
is_list_header = False
784-
is_len_one_list_header = False
785-
if is_list_like(header):
786-
assert isinstance(header, Sequence)
787-
is_list_header = True
788-
if len(header) == 1:
789-
is_len_one_list_header = True
790-
791-
if is_len_one_list_header:
792-
header = cast(Sequence[int], header)[0]
793-
794-
# forward fill and pull out names for MultiIndex column
795-
header_names = None
796-
if header is not None and is_list_like(header):
797-
assert isinstance(header, Sequence)
798-
799-
header_names = []
800-
control_row = [True] * len(data[0])
801-
802-
for row in header:
803-
if is_integer(skiprows):
804-
assert isinstance(skiprows, int)
805-
row += skiprows
806-
807-
if row > len(data) - 1:
808-
raise ValueError(
809-
f"header index {row} exceeds maximum index "
810-
f"{len(data) - 1} of data.",
811-
)
812-
813-
data[row], control_row = fill_mi_header(data[row], control_row)
814-
815-
if index_col is not None:
816-
header_name, _ = pop_header_name(data[row], index_col)
817-
header_names.append(header_name)
818-
819-
# If there is a MultiIndex header and an index then there is also
820-
# a row containing just the index name(s)
821-
has_index_names = False
822-
if is_list_header and not is_len_one_list_header and index_col is not None:
823-
index_col_list: Sequence[int]
824-
if isinstance(index_col, int):
825-
index_col_list = [index_col]
826-
else:
827-
assert isinstance(index_col, Sequence)
828-
index_col_list = index_col
829-
830-
# We have to handle mi without names. If any of the entries in the data
831-
# columns are not empty, this is a regular row
832-
assert isinstance(header, Sequence)
833-
if len(header) < len(data):
834-
potential_index_names = data[len(header)]
835-
potential_data = [
836-
x
837-
for i, x in enumerate(potential_index_names)
838-
if not control_row[i] and i not in index_col_list
839-
]
840-
has_index_names = all(x == "" or x is None for x in potential_data)
841-
842-
if is_list_like(index_col):
843-
# Forward fill values for MultiIndex index.
844-
if header is None:
845-
offset = 0
846-
elif isinstance(header, int):
847-
offset = 1 + header
848-
else:
849-
offset = 1 + max(header)
783+
output = self._parse_sheet(
784+
data=data,
785+
output=output,
786+
asheetname=asheetname,
787+
header=header,
788+
names=names,
789+
index_col=index_col,
790+
usecols=usecols,
791+
dtype=dtype,
792+
skiprows=skiprows,
793+
nrows=nrows,
794+
true_values=true_values,
795+
false_values=false_values,
796+
na_values=na_values,
797+
parse_dates=parse_dates,
798+
date_parser=date_parser,
799+
date_format=date_format,
800+
thousands=thousands,
801+
decimal=decimal,
802+
comment=comment,
803+
skipfooter=skipfooter,
804+
dtype_backend=dtype_backend,
805+
**kwds,
806+
)
850807

851-
# GH34673: if MultiIndex names present and not defined in the header,
852-
# offset needs to be incremented so that forward filling starts
853-
# from the first MI value instead of the name
854-
if has_index_names:
855-
offset += 1
808+
if last_sheetname is None:
809+
raise ValueError("Sheet name is an empty list")
856810

857-
# Check if we have an empty dataset
858-
# before trying to collect data.
859-
if offset < len(data):
860-
assert isinstance(index_col, Sequence)
811+
if ret_dict:
812+
return output
813+
else:
814+
return output[last_sheetname]
861815

862-
for col in index_col:
863-
last = data[offset][col]
816+
def _parse_sheet(
817+
self,
818+
data: list,
819+
output: dict,
820+
asheetname: str | int | None = None,
821+
header: int | Sequence[int] | None = 0,
822+
names: SequenceNotStr[Hashable] | range | None = None,
823+
index_col: int | Sequence[int] | None = None,
824+
usecols=None,
825+
dtype: DtypeArg | None = None,
826+
skiprows: Sequence[int] | int | Callable[[int], object] | None = None,
827+
nrows: int | None = None,
828+
true_values: Iterable[Hashable] | None = None,
829+
false_values: Iterable[Hashable] | None = None,
830+
na_values=None,
831+
parse_dates: list | dict | bool = False,
832+
date_parser: Callable | lib.NoDefault = lib.no_default,
833+
date_format: dict[Hashable, str] | str | None = None,
834+
thousands: str | None = None,
835+
decimal: str = ".",
836+
comment: str | None = None,
837+
skipfooter: int = 0,
838+
dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
839+
**kwds,
840+
):
841+
is_list_header = False
842+
is_len_one_list_header = False
843+
if is_list_like(header):
844+
assert isinstance(header, Sequence)
845+
is_list_header = True
846+
if len(header) == 1:
847+
is_len_one_list_header = True
848+
849+
if is_len_one_list_header:
850+
header = cast(Sequence[int], header)[0]
851+
852+
# forward fill and pull out names for MultiIndex column
853+
header_names = None
854+
if header is not None and is_list_like(header):
855+
assert isinstance(header, Sequence)
856+
857+
header_names = []
858+
control_row = [True] * len(data[0])
859+
860+
for row in header:
861+
if is_integer(skiprows):
862+
assert isinstance(skiprows, int)
863+
row += skiprows
864+
865+
if row > len(data) - 1:
866+
raise ValueError(
867+
f"header index {row} exceeds maximum index "
868+
f"{len(data) - 1} of data.",
869+
)
864870

865-
for row in range(offset + 1, len(data)):
866-
if data[row][col] == "" or data[row][col] is None:
867-
data[row][col] = last
868-
else:
869-
last = data[row][col]
871+
data[row], control_row = fill_mi_header(data[row], control_row)
870872

871-
# GH 12292 : error when read one empty column from excel file
872-
try:
873-
parser = TextParser(
874-
data,
875-
names=names,
876-
header=header,
877-
index_col=index_col,
878-
has_index_names=has_index_names,
879-
dtype=dtype,
880-
true_values=true_values,
881-
false_values=false_values,
882-
skiprows=skiprows,
883-
nrows=nrows,
884-
na_values=na_values,
885-
skip_blank_lines=False, # GH 39808
886-
parse_dates=parse_dates,
887-
date_parser=date_parser,
888-
date_format=date_format,
889-
thousands=thousands,
890-
decimal=decimal,
891-
comment=comment,
892-
skipfooter=skipfooter,
893-
usecols=usecols,
894-
dtype_backend=dtype_backend,
895-
**kwds,
896-
)
873+
if index_col is not None:
874+
header_name, _ = pop_header_name(data[row], index_col)
875+
header_names.append(header_name)
897876

898-
output[asheetname] = parser.read(nrows=nrows)
877+
# If there is a MultiIndex header and an index then there is also
878+
# a row containing just the index name(s)
879+
has_index_names = False
880+
if is_list_header and not is_len_one_list_header and index_col is not None:
881+
index_col_list: Sequence[int]
882+
if isinstance(index_col, int):
883+
index_col_list = [index_col]
884+
else:
885+
assert isinstance(index_col, Sequence)
886+
index_col_list = index_col
887+
888+
# We have to handle mi without names. If any of the entries in the data
889+
# columns are not empty, this is a regular row
890+
assert isinstance(header, Sequence)
891+
if len(header) < len(data):
892+
potential_index_names = data[len(header)]
893+
potential_data = [
894+
x
895+
for i, x in enumerate(potential_index_names)
896+
if not control_row[i] and i not in index_col_list
897+
]
898+
has_index_names = all(x == "" or x is None for x in potential_data)
899+
900+
if is_list_like(index_col):
901+
# Forward fill values for MultiIndex index.
902+
if header is None:
903+
offset = 0
904+
elif isinstance(header, int):
905+
offset = 1 + header
906+
else:
907+
offset = 1 + max(header)
908+
909+
# GH34673: if MultiIndex names present and not defined in the header,
910+
# offset needs to be incremented so that forward filling starts
911+
# from the first MI value instead of the name
912+
if has_index_names:
913+
offset += 1
914+
915+
# Check if we have an empty dataset
916+
# before trying to collect data.
917+
if offset < len(data):
918+
assert isinstance(index_col, Sequence)
919+
920+
for col in index_col:
921+
last = data[offset][col]
922+
923+
for row in range(offset + 1, len(data)):
924+
if data[row][col] == "" or data[row][col] is None:
925+
data[row][col] = last
926+
else:
927+
last = data[row][col]
928+
929+
# GH 12292 : error when read one empty column from excel file
930+
try:
931+
parser = TextParser(
932+
data,
933+
names=names,
934+
header=header,
935+
index_col=index_col,
936+
has_index_names=has_index_names,
937+
dtype=dtype,
938+
true_values=true_values,
939+
false_values=false_values,
940+
skiprows=skiprows,
941+
nrows=nrows,
942+
na_values=na_values,
943+
skip_blank_lines=False, # GH 39808
944+
parse_dates=parse_dates,
945+
date_parser=date_parser,
946+
date_format=date_format,
947+
thousands=thousands,
948+
decimal=decimal,
949+
comment=comment,
950+
skipfooter=skipfooter,
951+
usecols=usecols,
952+
dtype_backend=dtype_backend,
953+
**kwds,
954+
)
899955

900-
if header_names:
901-
output[asheetname].columns = output[asheetname].columns.set_names(
902-
header_names
903-
)
956+
output[asheetname] = parser.read(nrows=nrows)
904957

905-
except EmptyDataError:
906-
# No Data, return an empty DataFrame
907-
output[asheetname] = DataFrame()
958+
if header_names:
959+
output[asheetname].columns = output[asheetname].columns.set_names(
960+
header_names
961+
)
908962

909-
except Exception as err:
910-
err.args = (f"{err.args[0]} (sheet: {asheetname})", *err.args[1:])
911-
raise err
963+
except EmptyDataError:
964+
# No Data, return an empty DataFrame
965+
output[asheetname] = DataFrame()
912966

913-
if last_sheetname is None:
914-
raise ValueError("Sheet name is an empty list")
967+
except Exception as err:
968+
err.args = (f"{err.args[0]} (sheet: {asheetname})", *err.args[1:])
969+
raise err
915970

916-
if ret_dict:
917-
return output
918-
else:
919-
return output[last_sheetname]
971+
return output
920972

921973

922974
@doc(storage_options=_shared_docs["storage_options"])

0 commit comments

Comments
 (0)