ENH: Add support to read excel notes (xlrd engine) (pandas-dev#58070)

Dacops · diogomsmiranda · diogomsmiranda · commit 2f2d3a053a07 · 2024-05-24T10:26:46.000+01:00
Co-authored-by: diogomsmiranda <diogomsmiranda@tecnico.ulisboa.pt>
diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py
@@ -391,6 +391,7 @@ def read_excel(
     skipfooter: int = ...,
     storage_options: StorageOptions = ...,
     dtype_backend: DtypeBackend | lib.NoDefault = ...,
+    notes: DataFrame | None = None,
 ) -> DataFrame: ...
 
 
@@ -428,6 +429,7 @@ def read_excel(
     skipfooter: int = ...,
     storage_options: StorageOptions = ...,
     dtype_backend: DtypeBackend | lib.NoDefault = ...,
+    notes: DataFrame | None = None,
 ) -> dict[IntStrT, DataFrame]: ...
 
 
@@ -466,6 +468,7 @@ def read_excel(
     storage_options: StorageOptions | None = None,
     dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
     engine_kwargs: dict | None = None,
+    notes: DataFrame | None = None,
 ) -> DataFrame | dict[IntStrT, DataFrame]:
     check_dtype_backend(dtype_backend)
     should_close = False
@@ -510,6 +513,7 @@ def read_excel(
             comment=comment,
             skipfooter=skipfooter,
             dtype_backend=dtype_backend,
+            notes=notes,
         )
     finally:
         # make sure to close opened file handles
@@ -586,7 +590,7 @@ def get_sheet_by_name(self, name: str):
     def get_sheet_by_index(self, index: int):
         raise NotImplementedError
 
-    def get_sheet_data(self, sheet, rows: int | None = None):
+    def get_sheet_data(self, sheet, rows: int | None = None, notes: DataFrame | None = None):
         raise NotImplementedError
 
     def raise_if_bad_sheet_by_index(self, index: int) -> None:
@@ -714,6 +718,7 @@ def parse(
         comment: str | None = None,
         skipfooter: int = 0,
         dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
+        notes: DataFrame | None = None,
         **kwds,
     ):
         validate_header_arg(header)
@@ -751,7 +756,7 @@ def parse(
                 sheet = self.get_sheet_by_index(asheetname)
 
             file_rows_needed = self._calc_rows(header, index_col, skiprows, nrows)
-            data = self.get_sheet_data(sheet, file_rows_needed)
+            data = self.get_sheet_data(sheet, file_rows_needed, notes=notes)
             if hasattr(sheet, "close"):
                 # pyxlsb opens two TemporaryFiles
                 sheet.close()
@@ -1634,6 +1639,7 @@ def parse(
         comment: str | None = None,
         skipfooter: int = 0,
         dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
+        notes: DataFrame | None = None,
         **kwds,
     ) -> DataFrame | dict[str, DataFrame] | dict[int, DataFrame]:
         """
@@ -1781,6 +1787,7 @@ def parse(
             comment=comment,
             skipfooter=skipfooter,
             dtype_backend=dtype_backend,
+            notes=notes,
             **kwds,
         )
 
diff --git a/pandas/io/excel/_xlrd.py b/pandas/io/excel/_xlrd.py
@@ -21,6 +21,8 @@
         StorageOptions,
     )
 
+from pandas.core.frame import DataFrame
+
 
 class XlrdReader(BaseExcelReader["Book"]):
     @doc(storage_options=_shared_docs["storage_options"])
@@ -77,7 +79,7 @@ def get_sheet_by_index(self, index):
         return self.book.sheet_by_index(index)
 
     def get_sheet_data(
-        self, sheet, file_rows_needed: int | None = None
+        self, sheet, file_rows_needed: int | None = None, notes: DataFrame | None = None
     ) -> list[list[Scalar]]:
         from xlrd import (
             XL_CELL_BOOLEAN,
@@ -133,6 +135,36 @@ def _parse_cell(cell_contents, cell_typ):
         nrows = sheet.nrows
         if file_rows_needed is not None:
             nrows = min(nrows, file_rows_needed)
+                
+        if notes is not None:
+            notes_locations = dict(sheet.cell_note_map.items())
+
+            min_y = min(location[0] for location in notes_locations.keys())
+            max_y = max(location[0] for location in notes_locations.keys())
+            min_x = min(location[1] for location in notes_locations.keys())
+            max_x = max(location[1] for location in notes_locations.keys())
+
+            # Create column headers
+            columns = [str(i) for i in range(min_x, max_x + 1)]
+
+            # Create empty rows
+            data_notes = []
+            for y in range(min_y, max_y + 1):
+                row = []
+                for x in range(min_x, max_x + 1):
+                    if (y, x) in notes_locations:
+                        row.append(str(notes_locations[(y, x)].text))
+                    else:
+                        row.append("")
+                data_notes.append(row)
+
+            # Convert data_notes to DataFrame and set columns
+            notes_df = DataFrame(data_notes, columns=columns)
+
+            # Update the notes DataFrame with the new data
+            for col in notes_df.columns:
+                notes[col] = notes_df[col]
+
         for i in range(nrows):
             row = [
                 _parse_cell(value, typ)
@@ -141,3 +173,4 @@ def _parse_cell(cell_contents, cell_typ):
             data.append(row)
 
         return data
+