 import pickle
 import sys
 import time
+from collections import defaultdict
 from concurrent.futures import ProcessPoolExecutor
 from datetime import datetime, timedelta
 from pathlib import Path
@@ -160,7 +161,7 @@ def convert_xl_to_times(
         lambda config, tables, model: dump_tables(
             tables, os.path.join(output_dir, "merged_tables.txt")
         ),
-        lambda config, tables, model: produce_times_tables(config, tables),
+        lambda config, tables, model: produce_times_tables(config, tables, model),
     ]

     input = raw_tables
@@ -287,12 +288,34 @@ def compare(


 def produce_times_tables(
-    config: Config, input: dict[str, DataFrame]
+    config: Config, input: dict[str, DataFrame], model: TimesModel
 ) -> dict[str, DataFrame]:
     logger.info(
         f"produce_times_tables: {len(input)} tables incoming,"
         f" {sum(len(value) for (_, value) in input.items())} rows"
     )
+    file_order = defaultdict(lambda: -1)
+    for i, f in enumerate(model.files):
+        file_order[f] = i
+
+    def keep_last_by_file_order(df):
+        """Drop duplicate rows, keeping the last duplicate row (including value) as per
+        input file order, and remove the `source_filename` column from the DataFrame.
+
+        Note: we do not remove duplicate values for the same query columns for parameters
+        here, because in the future we might want to re-use the processed tables and
+        select the rows coming from different scenarios/files after processing just once.
+        If so, at that point we can use the info in the `source_filename` column to do
+        this.
+        """
+        if "source_filename" in df.columns:
+            df["file_order"] = df["source_filename"].map(file_order)
+            df = df.sort_values(by="file_order", kind="stable")
+            df = df.drop(columns=["source_filename", "file_order"])
+        df = df.drop_duplicates(keep="last")
+        df.reset_index(drop=True, inplace=True)
+        return df
+
     result = {}
     used_tables = set()
     for mapping in config.times_xl_maps:
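For illustration only (not part of the diff): a minimal, self-contained sketch of what the new `keep_last_by_file_order` helper above does, assuming pandas DataFrames that carry a `source_filename` column. The file names and values below are invented.

```python
from collections import defaultdict

import pandas as pd

# Hypothetical input file order: "base" is read before "scen_high".
file_order = defaultdict(lambda: -1)
for i, f in enumerate(["base", "scen_high"]):
    file_order[f] = i

def keep_last_by_file_order(df: pd.DataFrame) -> pd.DataFrame:
    # Same steps as the helper in the diff: stable-sort rows by the order of the
    # file they came from, drop the bookkeeping columns, then drop rows that are
    # identical in every remaining column, keeping the last occurrence.
    if "source_filename" in df.columns:
        df["file_order"] = df["source_filename"].map(file_order)
        df = df.sort_values(by="file_order", kind="stable")
        df = df.drop(columns=["source_filename", "file_order"])
    return df.drop_duplicates(keep="last").reset_index(drop=True)

df = pd.DataFrame(
    {
        "region": ["REG1", "REG1", "REG1"],
        "year": [2030, 2030, 2030],
        "value": [100.0, 100.0, 120.0],
        "source_filename": ["scen_high", "base", "scen_high"],
    }
)

print(keep_last_by_file_order(df))
# The two fully identical rows (value 100.0) collapse into one; the row with a
# different value is kept, and rows from later files end up last in the frame.
```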
@@ -326,10 +349,13 @@ def produce_times_tables(
             # Excel columns can be duplicated into multiple Times columns
             for times_col, xl_col in mapping.col_map.items():
                 df[times_col] = df[xl_col]
-            cols_to_drop = [x for x in df.columns if x not in mapping.times_cols]
+            # Keep only the required columns
+            cols_to_keep = set(mapping.times_cols).union({"source_filename"})
+            cols_to_drop = [x for x in df.columns if x not in cols_to_keep]
             df.drop(columns=cols_to_drop, inplace=True)
-            df.drop_duplicates(inplace=True)
-            df.reset_index(drop=True, inplace=True)
+            # Drop duplicates, keeping last seen rows as per file order
+            df = keep_last_by_file_order(df)
+            # Drop rows with missing values
             # TODO this is a hack. Use pd.StringDtype() so that notna() is sufficient
             i = (
                 df[mapping.times_cols[-1]].notna()
@@ -486,7 +512,7 @@ def run(args: argparse.Namespace) -> str | None:
     else:
         input_files = args.input

-    model.files.update([Path(path).stem for path in input_files])
+    model.files = [Path(path).stem for path in input_files]

     processing_order = ["base", "subres", "trade", "demand", "scen", "syssettings"]
    for data_module in processing_order:
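One more illustration-only note on the last hunk: `model.files` changes from a set updated in place to a plain list of input file stems, since the "keep last by file order" rule needs a well-defined order, which a set does not provide. A tiny sketch (file names invented) of how `file_order` then resolves:

```python
from collections import defaultdict

# Hypothetical input file stems, in the order the files were supplied.
files = ["base", "trade", "scen_a"]

file_order = defaultdict(lambda: -1)
for i, f in enumerate(files):
    file_order[f] = i

print(file_order["scen_a"])    # 2: sorts after "base" (0) and "trade" (1)
print(file_order["unlisted"])  # -1: rows whose source file is not in the list
                               # sort before every listed file
```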