Skip to content

REF: Read excel parse refactor #58497

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 28 commits into from
May 1, 2024
Merged
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
c14b536
Initial commit for PR
iangainey Apr 28, 2024
396d0a6
Added what's new enhancement
iangainey Apr 28, 2024
d8f8bdf
Merge branch 'main' into main
iangainey Apr 28, 2024
ff5b722
Resolving commit checks
iangainey Apr 28, 2024
6c948cb
Merge branch 'main' of https://github.com/iangainey/pandas
iangainey Apr 28, 2024
182627e
Removing local testing comments
iangainey Apr 28, 2024
6c17196
Attempting to resolve ruff-format failure that is not occurring on my …
iangainey Apr 28, 2024
d036bb5
Resolving typing and docstring manual pre-commit errors
iangainey Apr 28, 2024
ea15154
Removing type hints considering typing checks doesn't like it
iangainey Apr 28, 2024
748d90f
These errors make no sense and contradict what I am seeing
iangainey Apr 28, 2024
1e2b0f5
Refactor of parse for precursor
iangainey Apr 30, 2024
266b66d
Cleaning up
iangainey Apr 30, 2024
7615ddc
Fixing type errors
iangainey Apr 30, 2024
0961828
Trying to fix type errors again
iangainey Apr 30, 2024
1eeecc1
Trying to fix type errors again
iangainey Apr 30, 2024
67a5733
Trying to fix type errors again
iangainey Apr 30, 2024
7c9b927
Trying to fix type errors again
iangainey Apr 30, 2024
5ec999e
Finally cleaning up
iangainey Apr 30, 2024
24993be
Cleaning up for final commit
iangainey Apr 30, 2024
351d875
Merge branch 'main' into read-excel-parse-refactor
iangainey Apr 30, 2024
a86abb4
Modified naming to mark as internal to class
iangainey Apr 30, 2024
0b8ed90
Merge branch 'read-excel-parse-refactor' of https://github.com/iangai…
iangainey Apr 30, 2024
0e53a4d
Restructured based on refactor and solved some typing errors
iangainey Apr 30, 2024
a55f77b
Fixing what's new changes
iangainey May 1, 2024
1713918
Cleaning up changes to tests
iangainey May 1, 2024
121ecd2
Merge branch 'main' into main
iangainey May 1, 2024
fa0782f
Branched table parameter feature and merged in parse refactor
iangainey May 1, 2024
9ec25f2
Accidentally changed what's new
iangainey May 1, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
304 changes: 178 additions & 126 deletions pandas/io/excel/_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -780,143 +780,195 @@ def parse(
output[asheetname] = DataFrame()
continue

is_list_header = False
is_len_one_list_header = False
if is_list_like(header):
assert isinstance(header, Sequence)
is_list_header = True
if len(header) == 1:
is_len_one_list_header = True

if is_len_one_list_header:
header = cast(Sequence[int], header)[0]

# forward fill and pull out names for MultiIndex column
header_names = None
if header is not None and is_list_like(header):
assert isinstance(header, Sequence)

header_names = []
control_row = [True] * len(data[0])

for row in header:
if is_integer(skiprows):
assert isinstance(skiprows, int)
row += skiprows

if row > len(data) - 1:
raise ValueError(
f"header index {row} exceeds maximum index "
f"{len(data) - 1} of data.",
)

data[row], control_row = fill_mi_header(data[row], control_row)

if index_col is not None:
header_name, _ = pop_header_name(data[row], index_col)
header_names.append(header_name)

# If there is a MultiIndex header and an index then there is also
# a row containing just the index name(s)
has_index_names = False
if is_list_header and not is_len_one_list_header and index_col is not None:
index_col_list: Sequence[int]
if isinstance(index_col, int):
index_col_list = [index_col]
else:
assert isinstance(index_col, Sequence)
index_col_list = index_col

# We have to handle mi without names. If any of the entries in the data
# columns are not empty, this is a regular row
assert isinstance(header, Sequence)
if len(header) < len(data):
potential_index_names = data[len(header)]
potential_data = [
x
for i, x in enumerate(potential_index_names)
if not control_row[i] and i not in index_col_list
]
has_index_names = all(x == "" or x is None for x in potential_data)

if is_list_like(index_col):
# Forward fill values for MultiIndex index.
if header is None:
offset = 0
elif isinstance(header, int):
offset = 1 + header
else:
offset = 1 + max(header)
output = self._parse_sheet(
data=data,
output=output,
asheetname=asheetname,
header=header,
names=names,
index_col=index_col,
usecols=usecols,
dtype=dtype,
skiprows=skiprows,
nrows=nrows,
true_values=true_values,
false_values=false_values,
na_values=na_values,
parse_dates=parse_dates,
date_parser=date_parser,
date_format=date_format,
thousands=thousands,
decimal=decimal,
comment=comment,
skipfooter=skipfooter,
dtype_backend=dtype_backend,
**kwds,
)

# GH34673: if MultiIndex names present and not defined in the header,
# offset needs to be incremented so that forward filling starts
# from the first MI value instead of the name
if has_index_names:
offset += 1
if last_sheetname is None:
raise ValueError("Sheet name is an empty list")

# Check if we have an empty dataset
# before trying to collect data.
if offset < len(data):
assert isinstance(index_col, Sequence)
if ret_dict:
return output
else:
return output[last_sheetname]

for col in index_col:
last = data[offset][col]
def _parse_sheet(
self,
data: list,
output: dict,
asheetname: str | int | None = None,
header: int | Sequence[int] | None = 0,
names: SequenceNotStr[Hashable] | range | None = None,
index_col: int | Sequence[int] | None = None,
usecols=None,
dtype: DtypeArg | None = None,
skiprows: Sequence[int] | int | Callable[[int], object] | None = None,
nrows: int | None = None,
true_values: Iterable[Hashable] | None = None,
false_values: Iterable[Hashable] | None = None,
na_values=None,
parse_dates: list | dict | bool = False,
date_parser: Callable | lib.NoDefault = lib.no_default,
date_format: dict[Hashable, str] | str | None = None,
thousands: str | None = None,
decimal: str = ".",
comment: str | None = None,
skipfooter: int = 0,
dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
**kwds,
):
is_list_header = False
is_len_one_list_header = False
if is_list_like(header):
assert isinstance(header, Sequence)
is_list_header = True
if len(header) == 1:
is_len_one_list_header = True

if is_len_one_list_header:
header = cast(Sequence[int], header)[0]

# forward fill and pull out names for MultiIndex column
header_names = None
if header is not None and is_list_like(header):
assert isinstance(header, Sequence)

header_names = []
control_row = [True] * len(data[0])

for row in header:
if is_integer(skiprows):
assert isinstance(skiprows, int)
row += skiprows

if row > len(data) - 1:
raise ValueError(
f"header index {row} exceeds maximum index "
f"{len(data) - 1} of data.",
)

for row in range(offset + 1, len(data)):
if data[row][col] == "" or data[row][col] is None:
data[row][col] = last
else:
last = data[row][col]
data[row], control_row = fill_mi_header(data[row], control_row)

# GH 12292 : error when read one empty column from excel file
try:
parser = TextParser(
data,
names=names,
header=header,
index_col=index_col,
has_index_names=has_index_names,
dtype=dtype,
true_values=true_values,
false_values=false_values,
skiprows=skiprows,
nrows=nrows,
na_values=na_values,
skip_blank_lines=False, # GH 39808
parse_dates=parse_dates,
date_parser=date_parser,
date_format=date_format,
thousands=thousands,
decimal=decimal,
comment=comment,
skipfooter=skipfooter,
usecols=usecols,
dtype_backend=dtype_backend,
**kwds,
)
if index_col is not None:
header_name, _ = pop_header_name(data[row], index_col)
header_names.append(header_name)

output[asheetname] = parser.read(nrows=nrows)
# If there is a MultiIndex header and an index then there is also
# a row containing just the index name(s)
has_index_names = False
if is_list_header and not is_len_one_list_header and index_col is not None:
index_col_list: Sequence[int]
if isinstance(index_col, int):
index_col_list = [index_col]
else:
assert isinstance(index_col, Sequence)
index_col_list = index_col

# We have to handle mi without names. If any of the entries in the data
# columns are not empty, this is a regular row
assert isinstance(header, Sequence)
if len(header) < len(data):
potential_index_names = data[len(header)]
potential_data = [
x
for i, x in enumerate(potential_index_names)
if not control_row[i] and i not in index_col_list
]
has_index_names = all(x == "" or x is None for x in potential_data)

if is_list_like(index_col):
# Forward fill values for MultiIndex index.
if header is None:
offset = 0
elif isinstance(header, int):
offset = 1 + header
else:
offset = 1 + max(header)

# GH34673: if MultiIndex names present and not defined in the header,
# offset needs to be incremented so that forward filling starts
# from the first MI value instead of the name
if has_index_names:
offset += 1

# Check if we have an empty dataset
# before trying to collect data.
if offset < len(data):
assert isinstance(index_col, Sequence)

for col in index_col:
last = data[offset][col]

for row in range(offset + 1, len(data)):
if data[row][col] == "" or data[row][col] is None:
data[row][col] = last
else:
last = data[row][col]

# GH 12292 : error when read one empty column from excel file
try:
parser = TextParser(
data,
names=names,
header=header,
index_col=index_col,
has_index_names=has_index_names,
dtype=dtype,
true_values=true_values,
false_values=false_values,
skiprows=skiprows,
nrows=nrows,
na_values=na_values,
skip_blank_lines=False, # GH 39808
parse_dates=parse_dates,
date_parser=date_parser,
date_format=date_format,
thousands=thousands,
decimal=decimal,
comment=comment,
skipfooter=skipfooter,
usecols=usecols,
dtype_backend=dtype_backend,
**kwds,
)

if header_names:
output[asheetname].columns = output[asheetname].columns.set_names(
header_names
)
output[asheetname] = parser.read(nrows=nrows)

except EmptyDataError:
# No Data, return an empty DataFrame
output[asheetname] = DataFrame()
if header_names:
output[asheetname].columns = output[asheetname].columns.set_names(
header_names
)

except Exception as err:
err.args = (f"{err.args[0]} (sheet: {asheetname})", *err.args[1:])
raise err
except EmptyDataError:
# No Data, return an empty DataFrame
output[asheetname] = DataFrame()

if last_sheetname is None:
raise ValueError("Sheet name is an empty list")
except Exception as err:
err.args = (f"{err.args[0]} (sheet: {asheetname})", *err.args[1:])
raise err

if ret_dict:
return output
else:
return output[last_sheetname]
return output


@doc(storage_options=_shared_docs["storage_options"])
Expand Down