Commit 99ed809

refactor: big reduction in redundant pipeline code
* make targets factories in targets_utils.R
* simplify covid_hosp_explore and flu_hosp_explore
Parent: 70680df

14 files changed: +517 −798 lines
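The covid_hosp_explore.R and flu_hosp_explore.R diffs are not reproduced below, but the intent of the refactor is that each exploration script shrinks to a thin composition of the new factory functions exported from targets_utils.R. A rough sketch of what such a script might now look like (hypothetical structure; not the actual file contents, and the globals the factories read would be defined where the comment indicates):

library(targets)
library(tarchetypes)

# Globals read by the factories (hhs_signal, chng_signal, geo_type, time_type,
# geo_values, time_values, issues, fetch_args, targets_param_grid, ...) would
# be defined here, as documented in make_data_targets() below.

list(
  make_data_targets(),
  make_forecasts_and_scores_by_ahead(),
  make_forecasts_and_scores(),
  make_ensemble_targets(),
  make_external_names_and_scores()
)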

NAMESPACE

Lines changed: 6 additions & 0 deletions
@@ -15,6 +15,12 @@ export(format_storage)
 export(id_ahead_ensemble_grid)
 export(interval_coverage)
 export(lookup_ids)
+export(make_data_targets)
+export(make_ensemble_targets)
+export(make_external_names_and_scores)
+export(make_forecasts_and_scores)
+export(make_forecasts_and_scores_by_ahead)
+export(make_shared_grids)
 export(make_target_param_grid)
 export(manage_S3_forecast_cache)
 export(overprediction)

R/forecaster.R

Lines changed: 11 additions & 6 deletions
@@ -45,23 +45,26 @@ perform_sanity_checks <- function(epi_data,
 #' epipredict is a little bit fragile about having enough data to train; we want
 #' to be able to return a null result rather than error out.
 #' @param epi_data the input data
-#' @param buffer how many training data to insist on having (e.g. if `buffer=1`,
-#'   this trains on one sample; the default is set so that `linear_reg` isn't
-#'   rank deficient)
 #' @param ahead the effective ahead; may be infinite if there isn't enough data.
 #' @param args_input the input as supplied to `forecaster_pred`; lags is the
 #'   important argument, which may or may not be defined, with the default
 #'   coming from `arx_args_list`
-#'
-#' # TODO: Buffer should probably be 2 * n(lags) * n(predictors).
+#' @param buffer how many training data to insist on having (e.g. if `buffer=1`,
+#'   this trains on one sample; the default is set so that `linear_reg` isn't
+#'   rank deficient)
 #'
 #' @export
-confirm_sufficient_data <- function(epi_data, ahead, args_input, buffer = 15) {
+confirm_sufficient_data <- function(epi_data, ahead, args_input, buffer = 20) {
   if (!is.null(args_input$lags)) {
     lag_max <- max(args_input$lags)
   } else {
     lag_max <- 14 # default value of 2 weeks
   }
+
+  # TODO: Buffer should probably be 2 * n(lags) * n(predictors). But honestly,
+  # this needs to be fixed in epipredict itself, see
+  # https://github.com/cmu-delphi/epipredict/issues/106.
+
   return(
     !is.infinite(ahead) &&
       epi_data %>%
@@ -233,6 +236,7 @@ forecaster_pred <- function(data,
     function(data, gk, rtv, ...) {
       # TODO: Can we get rid of this tryCatch and instead hook it up to targets
       # error handling or something else?
+      # https://github.com/cmu-delphi/exploration-tooling/issues/41
      tryCatch(
        {
          do.call(
@@ -259,6 +263,7 @@ forecaster_pred <- function(data,
            e = e
          )
          saveRDS(dump_vars, "forecaster_pred_error.rds")
+          e
        }
      }
    )
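The TODO added above suggests tying the buffer to the model's width rather than to a constant (15 before this commit, 20 after). A hypothetical sketch of that heuristic, not part of this commit and using a helper name invented here for illustration:

# Hypothetical helper: 2 * n(lags) * n(predictors), as the TODO suggests.
suggested_buffer <- function(args_input, n_predictors = 1) {
  lags <- if (!is.null(args_input$lags)) args_input$lags else c(0, 7, 14)
  2 * length(unlist(lags)) * n_predictors
}
# e.g. suggested_buffer(list(lags = c(0, 3, 5, 7, 14)), n_predictors = 1) returns 10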

R/forecaster_scaled_pop.R

Lines changed: 1 addition & 1 deletion
@@ -73,7 +73,7 @@ scaled_pop <- function(epi_data,
   args_list <- do.call(arx_args_list, args_input)
   # if you want to ignore extra_sources, setting predictors is the way to do it
   predictors <- c(outcome, extra_sources)
-  # TODO: Partial match quantile_level coming from here
+  # TODO: Partial match quantile_level coming from here (on Dmitry's machine)
   argsPredictorsTrainer <- perform_sanity_checks(epi_data, outcome, predictors, trainer, args_list)
   args_list <- argsPredictorsTrainer[[1]]
   predictors <- argsPredictorsTrainer[[2]]

R/targets_utils.R

Lines changed: 281 additions & 2 deletions
@@ -6,8 +6,10 @@
 #' @export
 #' @importFrom rlang syms
 make_target_param_grid <- function(param_grid) {
-  param_grid %<>% mutate(forecaster = syms(forecaster))
-  param_grid %<>% mutate(trainer = syms(trainer))
+  param_grid %>%
+    select(-any_of("parent_id")) %>%
+    mutate(forecaster = syms(forecaster)) %>%
+    mutate(trainer = syms(trainer))
   list_of_params <- lists_of_real_values(param_grid)
   list_names <- map(list_of_params, names)
   tibble(
@@ -27,3 +29,280 @@ lists_of_real_values <- function(param_grid) {
   }
   map(full_lists, filter_nonvalues)
 }
+
+#' Make common targets for fetching data
+#'
+#' Relies on the following globals:
+#' - `hhs_signal`
+#' - `chng_signal`
+#' - `geo_type`
+#' - `time_type`
+#' - `geo_values`
+#' - `time_values`
+#' - `issues`
+#' - `fetch_args`
+#'
+#' @export
+make_data_targets <- function() {
+  list(
+    tar_target(
+      name = hhs_latest_data,
+      command = {
+        epidatr::pub_covidcast(
+          source = "hhs",
+          signals = hhs_signal,
+          geo_type = geo_type,
+          time_type = time_type,
+          geo_values = geo_values,
+          time_values = time_values,
+          fetch_args = fetch_args
+        )
+      }
+    ),
+    tar_target(
+      name = chng_latest_data,
+      command = {
+        epidatr::pub_covidcast(
+          source = "chng",
+          signals = chng_signal,
+          geo_type = geo_type,
+          time_type = time_type,
+          geo_values = geo_values,
+          time_values = time_values,
+          fetch_args = fetch_args
+        )
+      }
+    ),
+    tar_target(
+      name = hhs_evaluation_data,
+      command = {
+        hhs_latest_data %>%
+          rename(
+            actual = value,
+            target_end_date = time_value
+          )
+      }
+    ),
+    tar_target(
+      name = hhs_latest_data_2022,
+      command = {
+        hhs_latest_data %>%
+          filter(time_value >= "2022-01-01")
+      }
+    ),
+    tar_target(
+      name = chng_latest_data_2022,
+      command = {
+        chng_latest_data %>%
+          filter(time_value >= "2022-01-01")
+      }
+    ),
+    tar_target(
+      name = hhs_archive_data_2022,
+      command = {
+        epidatr::pub_covidcast(
+          source = "hhs",
+          signals = hhs_signal,
+          geo_type = geo_type,
+          time_type = time_type,
+          geo_values = geo_values,
+          time_values = time_values,
+          issues = issues,
+          fetch_args = fetch_args
+        )
+      }
+    ),
+    tar_target(
+      name = chng_archive_data_2022,
+      command = {
+        epidatr::pub_covidcast(
+          source = "chng",
+          signals = chng_signal,
+          geo_type = geo_type,
+          time_type = time_type,
+          geo_values = geo_values,
+          time_values = time_values,
+          issues = issues,
+          fetch_args = fetch_args
+        )
+      }
+    ),
+    tar_target(
+      name = joined_archive_data_2022,
+      command = {
+        hhs_archive_data_2022 %<>%
+          select(geo_value, time_value, value, issue) %>%
+          rename("hhs" := value) %>%
+          rename(version = issue) %>%
+          as_epi_archive(
+            geo_type = geo_type,
+            time_type = time_type,
+            compactify = TRUE
+          )
+        chng_archive_data_2022 %<>%
+          select(geo_value, time_value, value, issue) %>%
+          rename("chng" := value) %>%
+          rename(version = issue) %>%
+          as_epi_archive(
+            geo_type = geo_type,
+            time_type = time_type,
+            compactify = TRUE
+          )
+        epix_merge(hhs_archive_data_2022, chng_archive_data_2022, sync = "locf")$DT %>%
+          drop_na() %>%
+          filter(!geo_value %in% c("as", "pr", "vi", "gu", "mp")) %>%
+          epiprocess::as_epi_archive()
+      }
+    )
+  )
+}
+
+#' Make common targets for forecasting experiments
+#' @export
+make_shared_grids <- function() {
+  list(
+    tidyr::expand_grid(
+      forecaster = "scaled_pop",
+      trainer = c("linreg", "quantreg"),
+      ahead = 1:4,
+      pop_scaling = c(FALSE)
+    ),
+    tidyr::expand_grid(
+      forecaster = "scaled_pop",
+      trainer = c("linreg", "quantreg"),
+      ahead = 5:7,
+      lags = list(c(0, 3, 5, 7, 14), c(0, 7, 14)),
+      pop_scaling = c(FALSE)
+    )
+  )
+}
+
+#' Make forecasts and scores by ahead targets
+#' @export
+make_forecasts_and_scores_by_ahead <- function() {
+  tar_map(
+    values = targets_param_grid,
+    names = id,
+    unlist = FALSE,
+    tar_target_raw(
+      name = ONE_AHEAD_FORECAST_NAME,
+      command = expression(
+        forecaster_pred(
+          data = joined_archive_data_2022,
+          outcome = "hhs",
+          extra_sources = "",
+          forecaster = forecaster,
+          n_training_pad = 30L,
+          forecaster_args = params,
+          forecaster_args_names = param_names,
+          date_range_step_size = 7L
+        )
+      )
+    ),
+    tar_target_raw(
+      name = ONE_AHEAD_SCORE_NAME,
+      command = expression(
+        run_evaluation_measure(
+          data = forecast_by_ahead,
+          evaluation_data = hhs_evaluation_data,
+          measure = list(
+            wis = weighted_interval_score,
+            ae = absolute_error,
+            cov_80 = interval_coverage(0.8)
+          )
+        )
+      )
+    )
+  )
+}
+
+#' Make forecasts and scores targets
+#' @export
+make_forecasts_and_scores <- function() {
+  tar_map(
+    values = forecaster_parent_id_map,
+    names = parent_id,
+    tar_target(
+      name = forecast,
+      command = {
+        bind_rows(forecast_component_ids) %>%
+          mutate(parent_forecaster = parent_id)
+      }
+    ),
+    tar_target(
+      name = score,
+      command = {
+        bind_rows(score_component_ids) %>%
+          mutate(parent_forecaster = parent_id)
+      }
+    )
+  )
+}
+
+#' Make ensemble targets
+#' @export
+make_ensemble_targets <- function() {
+  list()
+}
+
+
+#' Make external names and scores targets
+#' @export
+make_external_names_and_scores <- function() {
+  external_scores_path <- Sys.getenv("EXTERNAL_SCORES_PATH", "")
+  if (external_scores_path != "") {
+    external_names_and_scores <- list(
+      tar_target(
+        name = external_scores_df,
+        command = {
+          readRDS(external_scores_path) %>%
+            group_by(forecaster) %>%
+            targets::tar_group()
+        },
+        iteration = "group",
+        garbage_collection = TRUE
+      ),
+      tar_target(
+        name = external_names,
+        command = {
+          external_scores_df %>%
+            group_by(forecaster) %>%
+            group_keys() %>%
+            pull(forecaster)
+        },
+        garbage_collection = TRUE
+      ),
+      tar_target(
+        name = external_scores,
+        pattern = map(external_scores_df),
+        command = {
+          external_scores_df
+        },
+        # This step causes the pipeline to exit with an error, apparently due to
+        # running out of memory. Run this in series on a non-parallel `crew`
+        # controller to avoid.
+        # https://books.ropensci.org/targets/crew.html#heterogeneous-workers
+        resources = tar_resources(
+          crew = tar_resources_crew(controller = "serial_controller")
+        ),
+        memory = "transient",
+        garbage_collection = TRUE
+      )
+    )
+  } else {
+    external_names_and_scores <- list(
+      tar_target(
+        name = external_names,
+        command = {
+          c()
+        }
+      ),
+      tar_target(
+        name = external_scores,
+        command = {
+          data.frame()
+        }
+      )
+    )
+  }
+}
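make_data_targets() and make_forecasts_and_scores_by_ahead() read their configuration from globals rather than from arguments, so a calling pipeline has to define those names before the factories run. A hedged sketch of that setup (the global names come from the roxygen block above; the concrete signal names and date ranges are illustrative only, and the id-generation step for the parameter grid is elided):

library(dplyr)
library(epidatr)

# Illustrative values; the real pipelines define their own.
hhs_signal <- "confirmed_admissions_covid_1d"
chng_signal <- "smoothed_adj_outpatient_covid"
geo_type <- "state"
time_type <- "day"
geo_values <- "*"
time_values <- epidatr::epirange(from = "2022-01-01", to = "2023-06-01")
issues <- epidatr::epirange(from = "2022-01-01", to = "2023-06-01")
fetch_args <- epidatr::fetch_args_list()

# The shared grids then become the `targets_param_grid` global that
# make_forecasts_and_scores_by_ahead() maps over (id columns elided here).
param_grid <- bind_rows(make_shared_grids())
targets_param_grid <- make_target_param_grid(param_grid)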
File renamed without changes.

_targets.yaml

Lines changed: 12 additions & 12 deletions
@@ -1,16 +1,16 @@
 covid_hosp_explore:
-  script: covid_hosp_explore.R
-  store: covid_hosp_explore
-  use_crew: yes
+  script: covid_hosp_explore.R
+  store: covid_hosp_explore
+  use_crew: no
 flu_hosp_explore:
-  script: flu_hosp_explore.R
-  store: flu_hosp_explore
-  use_crew: yes
+  script: flu_hosp_explore.R
+  store: flu_hosp_explore
+  use_crew: no
 flu_hosp_prod:
-  script: flu_hosp_prod.R
-  store: flu_hosp_prod
-  use_crew: yes
+  script: flu_hosp_prod.R
+  store: flu_hosp_prod
+  use_crew: no
 covid_hosp_prod:
-  script: covid_hosp_prod.R
-  store: covid_hosp_prod
-  use_crew: yes
+  script: covid_hosp_prod.R
+  store: covid_hosp_prod
+  use_crew: no
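These are per-project settings in the targets multi-project config; flipping use_crew to no makes the explore and prod pipelines run without parallel crew workers. A brief sketch of how one project is selected and the flag read back (assuming use_crew is consumed through targets' config API, which is how the field is normally used):

library(targets)

Sys.setenv(TAR_PROJECT = "covid_hosp_explore")  # pick one block from _targets.yaml
tar_config_get("script")    # "covid_hosp_explore.R"
tar_config_get("use_crew")  # now FALSE, so tar_make() ignores any crew controller
tar_make()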
