Commit be9a145

Add benchmark to test I/E overwriting order, and fix (#300)

Fixes #269

Co-authored-by: Siddharth Krishna <[email protected]>

Parent: e2faee2

5 files changed (+52, -10)

benchmarks.yml (14 additions, 0 deletions)

```diff
@@ -54,6 +54,20 @@ benchmarks:
       - "syssettings"
       - "peak_rsv"
     include_dummy_imports: true
+  - name: DemoS_004a-ie-test
+    input_folder: DemoS_004-test-ie
+    inputs:
+      - "Sets-DemoModels.xlsx"
+      - "VT_REG_PRI_V04.xlsx"
+      - "BY_Trans.xlsx"
+      - "SysSettings.xlsx"
+      - "SuppXLS/Scen_Peak_RSV.xlsx"
+    dd_folder: DemoS_004a-test-ie
+    dd_files:
+      - "base"
+      - "syssettings"
+      - "peak_rsv"
+    include_dummy_imports: true
   - name: DemoS_004b
     input_folder: DemoS_004
     inputs:
```
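In effect this adds a variant of the DemoS_004a benchmark whose workbooks live in a separate input folder, DemoS_004-test-ie. Given the commit title, these inputs presumably set conflicting interpolation/extrapolation (I/E) options in different files, so the benchmark catches regressions in the order in which values from later files overwrite those from earlier ones.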

setup-benchmarks.sh (2 additions, 2 deletions)

```diff
@@ -6,8 +6,8 @@ set -eo pipefail

 # Commit SHA for each repository:
 REF_TIMES_model="b488fb07f0899ee8b7e710c230b1a9414fa06f7d"
-REF_demos_xlsx="6daaa6ab2fa9bf4428417eef8172836aee7a9290"
-REF_demos_dd="e744f53cb16a4ec230c21583f9404d8cfac9eb50"
+REF_demos_xlsx="7d2e8e0c44eae22e28a14837cd0c031b61eea3ff"
+REF_demos_dd="c3b53ceae1de72965559f95016171ec67f333fc7"
 REF_tim_xlsx="e820d8002adc6b1526a3bffcc439219b28d0eed5"
 REF_tim_gams="cfe2628dbb5974b99c8a5664a9358849324e31ac"
 REF_TIMES_NZ="4170d720e1c5cb0e31537a3168188169209ceb4d"
```
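The REF_demos_xlsx and REF_demos_dd pins are advanced, presumably to commits in the demos repositories that add the new DemoS_004-test-ie inputs and the matching ground-truth DD files.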

xl2times/__main__.py (32 additions, 6 deletions)

```diff
@@ -4,6 +4,7 @@
 import pickle
 import sys
 import time
+from collections import defaultdict
 from concurrent.futures import ProcessPoolExecutor
 from datetime import datetime, timedelta
 from pathlib import Path
@@ -160,7 +161,7 @@ def convert_xl_to_times(
         lambda config, tables, model: dump_tables(
             tables, os.path.join(output_dir, "merged_tables.txt")
         ),
-        lambda config, tables, model: produce_times_tables(config, tables),
+        lambda config, tables, model: produce_times_tables(config, tables, model),
     ]

     input = raw_tables
@@ -287,12 +288,34 @@ def compare(


 def produce_times_tables(
-    config: Config, input: dict[str, DataFrame]
+    config: Config, input: dict[str, DataFrame], model: TimesModel
 ) -> dict[str, DataFrame]:
     logger.info(
         f"produce_times_tables: {len(input)} tables incoming,"
         f" {sum(len(value) for (_, value) in input.items())} rows"
     )
+    file_order = defaultdict(lambda: -1)
+    for i, f in enumerate(model.files):
+        file_order[f] = i
+
+    def keep_last_by_file_order(df):
+        """Drop duplicate rows, keeping the last duplicate row (including value) as per
+        input file order, and remove the `source_filename` column from the DataFrame.
+
+        Note: we do not remove duplicate values for the same query columns for
+        parameters here, because in the future we might want to re-use the processed
+        tables and select the rows coming from different scenarios/files after
+        processing just once. If so, at that point we can use the info in the
+        `source_filename` column to do this.
+        """
+        if "source_filename" in df.columns:
+            df["file_order"] = df["source_filename"].map(file_order)
+            df = df.sort_values(by="file_order", kind="stable")
+            df = df.drop(columns=["source_filename", "file_order"])
+        df = df.drop_duplicates(keep="last")
+        df.reset_index(drop=True, inplace=True)
+        return df
+
     result = {}
     used_tables = set()
     for mapping in config.times_xl_maps:
@@ -326,10 +349,13 @@ def produce_times_tables(
         # Excel columns can be duplicated into multiple Times columns
         for times_col, xl_col in mapping.col_map.items():
             df[times_col] = df[xl_col]
-        cols_to_drop = [x for x in df.columns if x not in mapping.times_cols]
+        # Keep only the required columns
+        cols_to_keep = set(mapping.times_cols).union({"source_filename"})
+        cols_to_drop = [x for x in df.columns if x not in cols_to_keep]
         df.drop(columns=cols_to_drop, inplace=True)
-        df.drop_duplicates(inplace=True)
-        df.reset_index(drop=True, inplace=True)
+        # Drop duplicates, keeping last seen rows as per file order
+        df = keep_last_by_file_order(df)
+        # Drop rows with missing values
         # TODO this is a hack. Use pd.StringDtype() so that notna() is sufficient
         i = (
             df[mapping.times_cols[-1]].notna()
@@ -486,7 +512,7 @@ def run(args: argparse.Namespace) -> str | None:
     else:
         input_files = args.input

-    model.files.update([Path(path).stem for path in input_files])
+    model.files = [Path(path).stem for path in input_files]

     processing_order = ["base", "subres", "trade", "demand", "scen", "syssettings"]
     for data_module in processing_order:
```
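The heart of the fix is keep_last_by_file_order. Below is a minimal standalone sketch of the same technique; model.files and keep_last_by_file_order are the real names from the diff, while the file names, processes, and values are invented for illustration. Rank each row by its source file's position in the input order, stable-sort on that rank, then drop duplicates keeping the last occurrence:

```python
from collections import defaultdict

import pandas as pd

# Hypothetical input order, standing in for model.files.
files = ["base", "syssettings", "peak_rsv"]
# Files not in the list map to -1 and so sort first, as in the diff above.
file_order = defaultdict(lambda: -1, {f: i for i, f in enumerate(files)})

# Invented rows: "ELCTE" gets conflicting values from two files, and the
# "COALPP" row is an exact duplicate arriving from two files.
df = pd.DataFrame(
    {
        "process": ["ELCTE", "ELCTE", "COALPP", "COALPP"],
        "value": [0.8, 0.9, 1.0, 1.0],
        "source_filename": ["syssettings", "base", "base", "peak_rsv"],
    }
)

# Rank rows by input-file order; the stable sort keeps rows from the same
# file in their original relative order.
df["file_order"] = df["source_filename"].map(file_order)
df = df.sort_values(by="file_order", kind="stable")
df = df.drop(columns=["source_filename", "file_order"])

# Exact duplicates collapse to a single row; keep="last" retains the
# occurrence from the latest file.
df = df.drop_duplicates(keep="last").reset_index(drop=True)
print(df)
#   process  value
# 0   ELCTE    0.9   <- from "base"
# 1   ELCTE    0.8   <- from "syssettings", now ordered after "base"
# 2  COALPP    1.0   <- exact duplicate collapsed
```

After this pass, rows reach the output in input-file order, so a downstream "last value wins" consumer (such as GAMS reading the generated DD files) picks up the later file's value, which is what the new I/E overwriting-order benchmark exercises.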

xl2times/datatypes.py (1 addition, 1 deletion)

```diff
@@ -248,7 +248,7 @@ class TimesModel:
     time_periods: DataFrame = field(default_factory=DataFrame)
     units: DataFrame = field(default_factory=DataFrame)
     start_year: int = field(default_factory=int)
-    files: set[str] = field(default_factory=set)
+    files: list[str] = field(default_factory=list)
     data_modules: list[str] = field(default_factory=list)
     custom_sets: DataFrame = field(default_factory=DataFrame)
```
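Changing TimesModel.files from set[str] to list[str] is what makes the file ranking well defined: a set discards insertion order, whereas the list preserves the order in which run() registers the input files, which keep_last_by_file_order then sorts on.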

xl2times/transforms.py (3 additions, 1 deletion)

```diff
@@ -3384,13 +3384,15 @@ def apply_final_fixup(
         "side",
         "module_name",
         "module_type",
+        "source_filename",
     }
     keep_cols = cols_to_keep.intersection(df.columns)
     df.dropna(subset="value", inplace=True)
     drop_cols = [col for col in df.columns if col != "value" and col not in keep_cols]
     df.drop(columns=drop_cols, inplace=True)
     df = df.drop_duplicates(
-        subset=list(keep_cols.intersection(df.columns)), keep="last"
+        subset=list(keep_cols.intersection(df.columns).difference({"source_filename"})),
+        keep="last",
     )

     # Control application of i/e rules from syssettings
```
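To see why source_filename must be excluded from the drop_duplicates subset now that it is carried in keep_cols, consider a toy version of the situation (the column names mirror the real tables, but the rows and the NCAP_AF attribute are invented): with source_filename in the subset, two rows that differ only in their file of origin are not duplicates, so both survive and the stale value leaks through.

```python
import pandas as pd

# Same query column, different values, from two different files; the rows
# are assumed to already be in input-file order, so keep="last" favours
# the later file.
df = pd.DataFrame(
    {
        "attribute": ["NCAP_AF", "NCAP_AF"],
        "value": [0.8, 0.9],
        "source_filename": ["base", "syssettings"],
    }
)

keep_cols = {"attribute", "source_filename"}

# Before the fix: source_filename is part of the subset, so neither row is
# considered a duplicate and both survive.
before = df.drop_duplicates(subset=list(keep_cols), keep="last")
assert len(before) == 2

# After the fix: dedup on the query columns only; the later file's row wins.
after = df.drop_duplicates(
    subset=list(keep_cols.difference({"source_filename"})), keep="last"
)
assert after["value"].tolist() == [0.9]
```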
