 import pickle
 import sys
 import time
+from collections import defaultdict
 from concurrent.futures import ProcessPoolExecutor
 from datetime import datetime, timedelta
 from pathlib import Path
@@ -160,7 +161,7 @@ def convert_xl_to_times(
         lambda config, tables, model: dump_tables(
             tables, os.path.join(output_dir, "merged_tables.txt")
         ),
-        lambda config, tables, model: produce_times_tables(config, tables),
+        lambda config, tables, model: produce_times_tables(config, tables, model),
     ]

     input = raw_tables
@@ -287,12 +288,34 @@ def compare(


 def produce_times_tables(
-    config: Config, input: dict[str, DataFrame]
+    config: Config, input: dict[str, DataFrame], model: TimesModel
 ) -> dict[str, DataFrame]:
     logger.info(
         f"produce_times_tables: {len(input)} tables incoming,"
         f" {sum(len(value) for (_, value) in input.items())} rows"
     )
+    file_order = defaultdict(lambda: -1)
+    for i, f in enumerate(model.files):
+        file_order[f] = i
+
+    def keep_last_by_file_order(df):
+        """Drop duplicate rows, keeping the last duplicate row (including value) as per
+        input file order, and remove the `source_filename` column from the DataFrame.
+
+        Note: we do not remove duplicate values for the same query columns for parameters
+        here, because in the future we might want to re-use the processed tables and
+        select the rows coming from different scenarios/files after processing just once.
+        If so, at that point we can use the info in the `source_filename` column to do
+        this.
+        """
+        if "source_filename" in df.columns:
+            df["file_order"] = df["source_filename"].map(file_order)
+            df = df.sort_values(by="file_order", kind="stable")
+            df = df.drop(columns=["source_filename", "file_order"])
+        df = df.drop_duplicates(keep="last")
+        df.reset_index(drop=True, inplace=True)
+        return df
+
     result = {}
     used_tables = set()
     for mapping in config.times_xl_maps:
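For illustration only (not part of the diff): a minimal, self-contained sketch of what the new `keep_last_by_file_order` helper above does, assuming pandas DataFrames that carry a `source_filename` column. The file names and values below are invented.

```python
from collections import defaultdict

import pandas as pd

# Hypothetical input file order: "base" is read before "scen_high".
file_order = defaultdict(lambda: -1)
for i, f in enumerate(["base", "scen_high"]):
    file_order[f] = i

def keep_last_by_file_order(df: pd.DataFrame) -> pd.DataFrame:
    # Same steps as the helper in the diff: stable-sort rows by the order of the
    # file they came from, drop the bookkeeping columns, then drop rows that are
    # identical in every remaining column, keeping the last occurrence.
    if "source_filename" in df.columns:
        df["file_order"] = df["source_filename"].map(file_order)
        df = df.sort_values(by="file_order", kind="stable")
        df = df.drop(columns=["source_filename", "file_order"])
    return df.drop_duplicates(keep="last").reset_index(drop=True)

df = pd.DataFrame(
    {
        "region": ["REG1", "REG1", "REG1"],
        "year": [2030, 2030, 2030],
        "value": [100.0, 100.0, 120.0],
        "source_filename": ["scen_high", "base", "scen_high"],
    }
)

print(keep_last_by_file_order(df))
# The two fully identical rows (value 100.0) collapse into one; the row with a
# different value is kept, and rows from later files end up last in the frame.
```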
@@ -326,10 +349,13 @@ def produce_times_tables(
             # Excel columns can be duplicated into multiple Times columns
             for times_col, xl_col in mapping.col_map.items():
                 df[times_col] = df[xl_col]
-            cols_to_drop = [x for x in df.columns if x not in mapping.times_cols]
+            # Keep only the required columns
+            cols_to_keep = set(mapping.times_cols).union({"source_filename"})
+            cols_to_drop = [x for x in df.columns if x not in cols_to_keep]
             df.drop(columns=cols_to_drop, inplace=True)
-            df.drop_duplicates(inplace=True)
-            df.reset_index(drop=True, inplace=True)
+            # Drop duplicates, keeping last seen rows as per file order
+            df = keep_last_by_file_order(df)
+            # Drop rows with missing values
             # TODO this is a hack. Use pd.StringDtype() so that notna() is sufficient
             i = (
                 df[mapping.times_cols[-1]].notna()
@@ -486,7 +512,7 @@ def run(args: argparse.Namespace) -> str | None:
     else:
         input_files = args.input

-    model.files.update([Path(path).stem for path in input_files])
+    model.files = [Path(path).stem for path in input_files]

     processing_order = ["base", "subres", "trade", "demand", "scen", "syssettings"]
    for data_module in processing_order:
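One more illustration-only note on the last hunk: `model.files` changes from a set updated in place to a plain list of input file stems, since the "keep last by file order" rule needs a well-defined order, which a set does not provide. A tiny sketch (file names invented) of how `file_order` then resolves:

```python
from collections import defaultdict

# Hypothetical input file stems, in the order the files were supplied.
files = ["base", "trade", "scen_a"]

file_order = defaultdict(lambda: -1)
for i, f in enumerate(files):
    file_order[f] = i

print(file_order["scen_a"])    # 2: sorts after "base" (0) and "trade" (1)
print(file_order["unlisted"])  # -1: rows whose source file is not in the list
                               # sort before every listed file
```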