Skip to content

Commit 2f2d3a0

Browse files
ENH: Add support to read excel notes (xlrd engine) (pandas-dev#58070)
Co-authored-by: diogomsmiranda <[email protected]>
1 parent b22fda2 commit 2f2d3a0

File tree

2 files changed

+43
-3
lines changed

2 files changed

+43
-3
lines changed

pandas/io/excel/_base.py

+9-2
Original file line numberDiff line numberDiff line change
@@ -391,6 +391,7 @@ def read_excel(
391391
skipfooter: int = ...,
392392
storage_options: StorageOptions = ...,
393393
dtype_backend: DtypeBackend | lib.NoDefault = ...,
394+
notes: DataFrame | None = None,
394395
) -> DataFrame: ...
395396

396397

@@ -428,6 +429,7 @@ def read_excel(
428429
skipfooter: int = ...,
429430
storage_options: StorageOptions = ...,
430431
dtype_backend: DtypeBackend | lib.NoDefault = ...,
432+
notes: DataFrame | None = None,
431433
) -> dict[IntStrT, DataFrame]: ...
432434

433435

@@ -466,6 +468,7 @@ def read_excel(
466468
storage_options: StorageOptions | None = None,
467469
dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
468470
engine_kwargs: dict | None = None,
471+
notes: DataFrame | None = None,
469472
) -> DataFrame | dict[IntStrT, DataFrame]:
470473
check_dtype_backend(dtype_backend)
471474
should_close = False
@@ -510,6 +513,7 @@ def read_excel(
510513
comment=comment,
511514
skipfooter=skipfooter,
512515
dtype_backend=dtype_backend,
516+
notes=notes,
513517
)
514518
finally:
515519
# make sure to close opened file handles
@@ -586,7 +590,7 @@ def get_sheet_by_name(self, name: str):
586590
def get_sheet_by_index(self, index: int):
587591
raise NotImplementedError
588592

589-
def get_sheet_data(self, sheet, rows: int | None = None):
593+
def get_sheet_data(self, sheet, rows: int | None = None, notes: DataFrame | None = None):
590594
raise NotImplementedError
591595

592596
def raise_if_bad_sheet_by_index(self, index: int) -> None:
@@ -714,6 +718,7 @@ def parse(
714718
comment: str | None = None,
715719
skipfooter: int = 0,
716720
dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
721+
notes: DataFrame | None = None,
717722
**kwds,
718723
):
719724
validate_header_arg(header)
@@ -751,7 +756,7 @@ def parse(
751756
sheet = self.get_sheet_by_index(asheetname)
752757

753758
file_rows_needed = self._calc_rows(header, index_col, skiprows, nrows)
754-
data = self.get_sheet_data(sheet, file_rows_needed)
759+
data = self.get_sheet_data(sheet, file_rows_needed, notes=notes)
755760
if hasattr(sheet, "close"):
756761
# pyxlsb opens two TemporaryFiles
757762
sheet.close()
@@ -1634,6 +1639,7 @@ def parse(
16341639
comment: str | None = None,
16351640
skipfooter: int = 0,
16361641
dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
1642+
notes: DataFrame | None = None,
16371643
**kwds,
16381644
) -> DataFrame | dict[str, DataFrame] | dict[int, DataFrame]:
16391645
"""
@@ -1781,6 +1787,7 @@ def parse(
17811787
comment=comment,
17821788
skipfooter=skipfooter,
17831789
dtype_backend=dtype_backend,
1790+
notes=notes,
17841791
**kwds,
17851792
)
17861793

pandas/io/excel/_xlrd.py

+34-1
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,8 @@
2121
StorageOptions,
2222
)
2323

24+
from pandas.core.frame import DataFrame
25+
2426

2527
class XlrdReader(BaseExcelReader["Book"]):
2628
@doc(storage_options=_shared_docs["storage_options"])
@@ -77,7 +79,7 @@ def get_sheet_by_index(self, index):
7779
return self.book.sheet_by_index(index)
7880

7981
def get_sheet_data(
80-
self, sheet, file_rows_needed: int | None = None
82+
self, sheet, file_rows_needed: int | None = None, notes: DataFrame | None = None
8183
) -> list[list[Scalar]]:
8284
from xlrd import (
8385
XL_CELL_BOOLEAN,
@@ -133,6 +135,36 @@ def _parse_cell(cell_contents, cell_typ):
133135
nrows = sheet.nrows
134136
if file_rows_needed is not None:
135137
nrows = min(nrows, file_rows_needed)
138+
139+
if notes is not None:
140+
notes_locations = dict(sheet.cell_note_map.items())
141+
142+
min_y = min(location[0] for location in notes_locations.keys())
143+
max_y = max(location[0] for location in notes_locations.keys())
144+
min_x = min(location[1] for location in notes_locations.keys())
145+
max_x = max(location[1] for location in notes_locations.keys())
146+
147+
# Column labels are the stringified x indices from min_x through max_x
148+
columns = [str(i) for i in range(min_x, max_x + 1)]
149+
150+
# Create empty rows
151+
data_notes = []
152+
for y in range(min_y, max_y + 1):
153+
row = []
154+
for x in range(min_x, max_x + 1):
155+
if (y, x) in notes_locations:
156+
row.append(str(notes_locations[(y, x)].text))
157+
else:
158+
row.append("")
159+
data_notes.append(row)
160+
161+
# Convert data_notes to DataFrame and set columns
162+
notes_df = DataFrame(data_notes, columns=columns)
163+
164+
# Copy each column into the caller-supplied notes DataFrame, mutating it in place
165+
for col in notes_df.columns:
166+
notes[col] = notes_df[col]
167+
136168
for i in range(nrows):
137169
row = [
138170
_parse_cell(value, typ)
@@ -141,3 +173,4 @@ def _parse_cell(cell_contents, cell_typ):
141173
data.append(row)
142174

143175
return data
176+

0 commit comments

Comments
 (0)