cmu-delphi
diff --git a/‎covid_hosp_explore/data_targets.R
Lines changed: 1 addition & 1 deletion b/‎covid_hosp_explore/data_targets.R
Lines changed: 1 addition & 1 deletion
diff --git a/‎flu_hosp_explore.R
Lines changed: 121 additions & 163 deletions b/‎flu_hosp_explore.R
Lines changed: 121 additions & 163 deletions
@@ -3,7 +3,7 @@ data_targets <- local({
   time_type <- "day"
   geo_values <- "*"
   time_values <- epidatr::epirange(from = "2020-01-01", to = "2024-01-01")
-  fetch_args <- fetch_args_list(return_empty = TRUE, timeout_seconds = 200)
+  fetch_args <- epidatr::fetch_args_list(return_empty = TRUE, timeout_seconds = 200)
   issues <- "*"
 
   list(
 
@@ -12,9 +12,21 @@ suppressPackageStartupMessages({
   library(tibble)
   library(tidyr)
   library(rlang)
-  library(epidatr)
 })
 
+# The external scores processing causes the pipeline to exit with an error,
+# apparently due to running out of memory. Set up a non-parallel `crew`
+# controller to avoid.
+# https://books.ropensci.org/targets/crew.html#heterogeneous-workers
+main_controller <- crew_controller_local(
+  name = "main_controller",
+  workers = parallel::detectCores() - 5
+)
+serial_controller <- crew_controller_local(
+  name = "serial_controller",
+  workers = 1L
+)
+
 tar_option_set(
   packages = c(
     "assertthat",
@@ -30,159 +42,29 @@ tar_option_set(
   ), # packages that your targets need to run
   imports = c("epieval", "parsnip"),
   format = "qs", # Optionally set the default storage format. qs is fast.
-  controller = crew::crew_controller_local(workers = parallel::detectCores() - 5),
-  )
-# Run the R scripts in the R/ folder with your custom functions:
-# tar_source()
-linreg <- parsnip::linear_reg()
-quantreg <- epipredict::quantile_reg()
-
-grids <- list(
-  tidyr::expand_grid(
-    forecaster = "scaled_pop",
-    trainer = c("linreg", "quantreg"),
-    ahead = 1:4,
-    pop_scaling = c(TRUE, FALSE)
-  ),
-  tidyr::expand_grid(
-    forecaster = "scaled_pop",
-    trainer = c("linreg", "quantreg"),
-    ahead = 5:7,
-    lags = list(c(0, 3, 5, 7, 14), c(0, 7, 14)),
-    pop_scaling = c(TRUE, FALSE)
-  )
-)
-# bind them together and give static ids; if you add a new field to a given
-# expand_grid, everything will get a new id, so it's better to add a new
-# expand_grid instead
-param_grid <- bind_rows(map(grids, add_id)) %>% relocate(id, .after = last_col())
-
-forecaster_param_grids <- make_target_param_grid(param_grid) %>%
-  ## TODO This forecaster is hanging. Filter it out for now.
-  filter(id != "necessary endless 5")
-
-# not actually used downstream, this is for lookup during plotting and human evaluation
-forecasters <- list(
-  tar_target(
-    name = forecasters,
-    command = {
-      param_grid
-    }
-  )
-)
-
-response_signal <- "confirmed_admissions_influenza_1d_prop_7dav"
-target_range <- epirange(from = "20211001", to = "20220401")
-data <- list(
-  tar_target(
-    name = hhs_evaluation_data,
-    command = {
-      epidatr::pub_covidcast(
-        source = "hhs",
-        signals = response_signal,
-        geo_type = "state",
-        time_type = "day",
-        geo_values = "*",
-        time_values = epirange(from = "2020-01-01", to = "2024-01-01"),
-      ) %>%
-        rename(
-          actual = value,
-          target_end_date = time_value
-        )
-    }
-  ),
-  tar_target(
-    name = hhs_archive_data_2022,
-    command = {
-      epidatr::pub_covidcast(
-        source = "hhs",
-        signals = response_signal,
-        geo_type = "state",
-        time_type = "day",
-        geo_values = "*",
-        time_values = target_range,
-        issues = "*",
-        fetch_params = fetch_params_list(return_empty = TRUE, timeout_seconds = 100)
-      )
-    }
-  ),
-  tar_target(
-    name = chng_archive_data_2022,
-    command = {
-      epidatr::pub_covidcast(
-        source = "chng",
-        signals = "smoothed_adj_outpatient_flu",
-        geo_type = "state",
-        time_type = "day",
-        geo_values = "*",
-        time_values = target_range,
-        issues = "*",
-        fetch_params = fetch_params_list(return_empty = TRUE, timeout_seconds = 100)
-      )
-    }
-  ),
-  tar_target(
-    name = joined_archive_data_2022,
-    command = {
-      hhs_archive_data_2022 %<>%
-        select(geo_value, time_value, value, issue) %>%
-        rename("hhs" := value) %>%
-        rename(version = issue) %>%
-        as_epi_archive(
-          geo_type = "state",
-          time_type = "day",
-          compactify = TRUE
-        )
-      chng_archive_data_2022 %<>%
-        select(geo_value, time_value, value, issue) %>%
-        rename("chng" := value) %>%
-        rename(version = issue) %>%
-        as_epi_archive(
-          geo_type = "state",
-          time_type = "day",
-          compactify = TRUE
-        )
-      epix_merge(hhs_archive_data_2022, chng_archive_data_2022, sync = "locf")
-    }
-  ),
-  tar_target(
-    name = hhs_latest_data_2022,
-    command = {
-      epidatr::pub_covidcast(
-        source = "hhs",
-        signals = "confirmed_admissions_covid_1d",
-        geo_type = "state",
-        time_type = "day",
-        geo_values = "*",
-        time_values = epirange(from = "20220101", to = "20220401"),
-        fetch_params = fetch_params_list(return_empty = TRUE, timeout_seconds = 100)
-      )
-    }
-  ),
-  tar_target(
-    name = chng_latest_data_2022,
-    command = {
-      epidatr::pub_covidcast(
-        source = "chng",
-        signals = "smoothed_adj_outpatient_covid",
-        geo_type = "state",
-        time_type = "day",
-        geo_values = "*",
-        time_values = epirange(from = "20220101", to = "20220401"),
-        fetch_params = fetch_params_list(return_empty = TRUE, timeout_seconds = 100)
-      )
-    }
+  controller = crew_controller_group(main_controller, serial_controller),
+  # Set default crew controller.
+  # https://books.ropensci.org/targets/crew.html#heterogeneous-workers
+  resources = tar_resources(
+    crew = tar_resources_crew(controller = "main_controller")
   )
 )
 
+# Run the R scripts in the R/ folder with your custom functions:
+# tar_source()
+# where the forecasters and parameters are joined; see either the variable param_grid or `tar_read(forecasters)`
+source("flu_hosp_explore/forecaster_instantiation.R")
+source("flu_hosp_explore/data_targets.R")
+# Auto-generated in run.R
+source("flu_hosp_explore/dynamic_constants.R")
 
-forecasts_and_scores <- tar_map(
+forecasts_and_scores_by_ahead <- tar_map(
   values = forecaster_param_grids,
   names = id,
   unlist = FALSE,
-  tar_target(
-    name = forecast,
-    command = {
+  tar_target_raw(
+    name = ONE_AHEAD_FORECAST_NAME,
+    command = expression(
       forecaster_pred(
         data = joined_archive_data_2022,
         outcome = "hhs",
@@ -192,34 +74,51 @@ forecasts_and_scores <- tar_map(
         forecaster_args = params,
         forecaster_args_names = param_names
       )
-    }
+    )
   ),
-  tar_target(
-    name = score,
-    command = {
+  tar_target_raw(
+    name = ONE_AHEAD_SCORE_NAME,
+    command = expression(
       run_evaluation_measure(
-        data = forecast,
+        data = forecast_by_ahead,
         evaluation_data = hhs_evaluation_data,
         measure = list(
           wis = weighted_interval_score,
           ae = absolute_error,
           cov_80 = interval_coverage(0.8)
         )
       )
-    }
+    )
   )
 )
 
-ensemble_keys <- list(a = c(300, 15))
-ensembles <- list(
+forecasts_and_scores <- tar_map(
+  values = forecaster_parent_id_map,
+  names = parent_id,
+  tar_target(
+    name = forecast,
+    command = {
+      bind_rows(forecast_component_ids) %>%
+        mutate(parent_forecaster = parent_id)
+    }
+  ),
   tar_target(
-    name = ensembles,
+    name = score,
     command = {
-      ensemble_keys
+      bind_rows(score_component_ids) %>%
+        mutate(parent_forecaster = parent_id)
     }
   )
 )
 
+ensemble_keys <- list(a = c(300, 15))
+ensemble_targets <- tar_target(
+  name = ensembles,
+  command = {
+    ensemble_keys
+  }
+)
+
 # The combine approach below is taken from the manual:
 #   https://books.ropensci.org/targets/static.html#combine
 # The key is that the map above has unlist = FALSE.
@@ -229,8 +128,8 @@ ensemble_forecast <- tar_map(
     name = ensemble_forecast,
     # TODO: Needs a lookup table to select the right forecasters
     list(
-      forecasts_and_scores[["forecast"]][[1]],
-      forecasts_and_scores[["forecast"]][[2]]
+      forecasts_and_scores_by_ahead[["forecast_by_ahead"]][[1]],
+      forecasts_and_scores_by_ahead[["forecast_by_ahead"]][[2]]
     ),
     command = {
       bind_rows(!!!.x, .id = "forecaster") %>%
@@ -261,10 +160,69 @@ ensemble_forecast <- tar_map(
   )
 )
 
+if (LOAD_EXTERNAL_SCORES) {
+  external_names_and_scores <- list(
+    tar_target(
+      name = external_scores_df,
+      command = {
+        readRDS(external_scores_path) %>%
+          group_by(forecaster) %>%
+          targets::tar_group()
+      },
+      iteration = "group",
+      garbage_collection = TRUE
+    ),
+    tar_target(
+      name = external_names,
+      command = {
+        external_scores_df %>%
+          group_by(forecaster) %>%
+          group_keys() %>%
+          pull(forecaster)
+      },
+      garbage_collection = TRUE
+    ),
+    tar_target(
+      name = external_scores,
+      pattern = map(external_scores_df),
+      command = {
+        external_scores_df
+      },
+      # This step causes the pipeline to exit with an error, apparently due to
+      # running out of memory. Run this in series on a non-parallel `crew`
+      # controller to avoid.
+      # https://books.ropensci.org/targets/crew.html#heterogeneous-workers
+      resources = tar_resources(
+        crew = tar_resources_crew(controller = "serial_controller")
+      ),
+      memory = "transient",
+      garbage_collection = TRUE
+    )
+  )
+} else {
+  external_names_and_scores <- list(
+    tar_target(
+      name = external_names,
+      command = {
+        c()
+      }
+    ),
+    tar_target(
+      name = external_scores,
+      command = {
+        data.frame()
+      }
+    )
+  )
+}
+
+
 list(
-  data,
-  forecasters,
+  data_targets,
+  forecaster_targets,
+  forecasts_and_scores_by_ahead,
   forecasts_and_scores,
-  ensembles,
-  ensemble_forecast
+  ensemble_targets,
+  ensemble_forecast,
+  external_names_and_scores
 )