refactor: small rename, no extra negatives

dshemetov · dshemetov · commit 275b7af3db2b · 2023-11-06T18:18:53.000-08:00
diff --git a/NAMESPACE b/NAMESPACE
@@ -5,7 +5,7 @@ export(add_id)
 export(arx_postprocess)
 export(arx_preprocess)
 export(collapse_cards)
-export(confirm_insufficient_data)
+export(confirm_sufficient_data)
 export(covidhub_probs)
 export(evaluate_predictions)
 export(extend_ahead)
diff --git a/R/forecaster.R b/R/forecaster.R
@@ -43,8 +43,7 @@ perform_sanity_checks <- function(epi_data,
 #' confirm that there's enough data to run this model
 #' @description
 #' epipredict is a little bit fragile about having enough data to train; we want
-#'   to be able to return a null result rather than error out; this check say to
-#'   return a null
+#'   to be able to return a null result rather than error out.
 #' @param epi_data the input data
 #' @param buffer how many training data to insist on having (e.g. if `buffer=1`,
 #'   this trains on one sample; the default is set so that `linear_reg` isn't
@@ -53,19 +52,30 @@ perform_sanity_checks <- function(epi_data,
 #' @param args_input the input as supplied to `forecaster_pred`; lags is the
 #'   important argument, which may or may not be defined, with the default
 #'   coming from `arx_args_list`
+#'
+#' # TODO: Buffer should probably be 2 * n(lags) * n(predictors).
+#'
 #' @export
-confirm_insufficient_data <- function(epi_data, ahead, args_input, buffer = 9) {
+confirm_sufficient_data <- function(epi_data, ahead, args_input, buffer = 15) {
   if (!is.null(args_input$lags)) {
     lag_max <- max(args_input$lags)
   } else {
     lag_max <- 14 # default value of 2 weeks
   }
+
   return(
-    is.infinite(ahead) ||
-      as.integer(max(epi_data$time_value) - min(epi_data$time_value)) <=
-        lag_max + ahead + buffer
+    !is.infinite(ahead) &&
+      epi_data %>%
+        # TODO: This isn't generalizable to other signals.
+        filter(!is.na(hhs) & !is.na(chng)) %>%
+        # TODO: Quitting forecasting because of one geo_value is bad.
+        group_by(geo_value) %>%
+        summarise(has_enough_data = n_distinct(time_value) >= lag_max + ahead + buffer) %>%
+        pull(has_enough_data) %>%
+        any()
   )
 }
+
 # TODO replace with `step_arx_forecaster`
 #' add the default steps for arx_forecaster
 #' @description
@@ -187,7 +197,8 @@ forecaster_pred <- function(data,
                             slide_training = 0,
                             n_training_pad = 5,
                             forecaster_args = list(),
-                            forecaster_args_names = list()) {
+                            forecaster_args_names = list(),
+                            date_range_step_size = 1L) {
   archive <- data
   if (length(forecaster_args) > 0) {
     names(forecaster_args) <- forecaster_args_names
@@ -210,25 +221,45 @@ forecaster_pred <- function(data,
   # restrict the dataset to areas where training is possible
   start_date <- min(archive$DT$time_value) + net_slide_training
   end_date <- max(archive$DT$time_value) - forecaster_args$ahead
-  valid_predict_dates <- seq.Date(from = start_date, to = end_date, by = 1)
+  valid_predict_dates <- seq.Date(from = start_date, to = end_date, by = date_range_step_size)
 
   # first generate the forecasts
   before <- n_training + n_training_pad - 1
-  ## TODO epix_slide doesn't support infinite `before`
+  ## TODO: epix_slide doesn't support infinite `before`
   ## https://github.com/cmu-delphi/epiprocess/issues/219
   if (before == Inf) before <- 365L * 10000
   res <- epix_slide(archive,
     function(data, gk, rtv, ...) {
-      do.call(
-        forecaster,
-        append(
-          list(
-            epi_data = data,
-            outcome = outcome,
-            extra_sources = extra_sources
-          ),
-          forecaster_args
-        )
+      # TODO: Can we get rid of this tryCatch and instead hook it up to targets
+      #       error handling or something else?
+      tryCatch(
+        {
+          do.call(
+            forecaster,
+            append(
+              list(
+                epi_data = data,
+                outcome = outcome,
+                extra_sources = extra_sources
+              ),
+              forecaster_args
+            )
+          )
+        },
+        error = function(e) {
+          if (interactive()) {
+            browser()
+          } else {
+            dump_vars <- list(
+              data = data,
+              rtv = rtv,
+              forecaster = forecaster,
+              forecaster_args = forecaster_args,
+              e = e
+            )
+            saveRDS(dump_vars, "forecaster_pred_error.rds")
+          }
+        }
       )
     },
     before = before,
diff --git a/R/forecaster_flatline.R b/R/forecaster_flatline.R
@@ -23,7 +23,7 @@ flatline_fc <- function(epi_data,
   effective_ahead <- epidataAhead[[2]]
   args_input <- list(...)
   # edge case where there is no data or less data than the lags; eventually epipredict will handle this
-  if (confirm_insufficient_data(epi_data, effective_ahead, args_input)) {
+  if (!confirm_sufficient_data(epi_data, effective_ahead, args_input)) {
     null_result <- tibble(
       geo_value = character(),
       forecast_date = lubridate::Date(),
diff --git a/R/forecaster_scaled_pop.R b/R/forecaster_scaled_pop.R
@@ -58,7 +58,7 @@ scaled_pop <- function(epi_data,
   effective_ahead <- epidataAhead[[2]]
   args_input <- list(...)
   # edge case where there is no data or less data than the lags; eventually epipredict will handle this
-  if (confirm_insufficient_data(epi_data, effective_ahead, args_input)) {
+  if (!confirm_sufficient_data(epi_data, effective_ahead, args_input)) {
     null_result <- tibble(
       geo_value = character(),
       forecast_date = lubridate::Date(),
@@ -73,6 +73,7 @@ scaled_pop <- function(epi_data,
   args_list <- do.call(arx_args_list, args_input)
   # if you want to ignore extra_sources, setting predictors is the way to do it
   predictors <- c(outcome, extra_sources)
+  # TODO: Partial match quantile_level coming from here
   argsPredictorsTrainer <- perform_sanity_checks(epi_data, outcome, predictors, trainer, args_list)
   args_list <- argsPredictorsTrainer[[1]]
   predictors <- argsPredictorsTrainer[[2]]
@@ -98,7 +99,6 @@ scaled_pop <- function(epi_data,
   # postprocessing supported by epipredict
   postproc <- frosting()
   postproc %<>% arx_postprocess(trainer, args_list)
-  postproc
   if (pop_scaling) {
     postproc %<>% layer_population_scaling(
       .pred, .pred_distn,
diff --git a/flu_hosp_explore.R b/flu_hosp_explore.R
@@ -72,7 +72,8 @@ forecasts_and_scores_by_ahead <- tar_map(
         forecaster = forecaster,
         n_training_pad = 30L,
         forecaster_args = params,
-        forecaster_args_names = param_names
+        forecaster_args_names = param_names,
+        date_range_step_size = 7L
       )
     )
   ),
diff --git a/flu_hosp_explore/data_targets.R b/flu_hosp_explore/data_targets.R
@@ -1,7 +1,7 @@
 geo_type <- "state"
 time_type <- "day"
 geo_values <- "*"
-time_values <- epidatr::epirange(from = "2020-01-01", to = "2024-01-01")
+time_values <- epidatr::epirange(from = "2022-01-01", to = "2024-01-01")
 fetch_args <- epidatr::fetch_args_list(return_empty = TRUE, timeout_seconds = 200)
 issues <- "*"
 
@@ -76,6 +76,7 @@ data_targets <- list(
   tar_target(
     name = chng_archive_data_2022,
     command = {
+      # TODO: Filter out unused columns like missing, direction, etc.
       epidatr::pub_covidcast(
         source = "chng",
         signals = "smoothed_adj_outpatient_flu",
@@ -109,7 +110,10 @@ data_targets <- list(
           time_type = time_type,
           compactify = TRUE
         )
-      epix_merge(hhs_archive_data_2022, chng_archive_data_2022, sync = "locf")
+      epix_merge(hhs_archive_data_2022, chng_archive_data_2022, sync = "locf")$DT %>%
+        filter(!is.na(hhs) & !is.na(chng)) %>%
+        filter(!geo_value %in% c("as", "pr", "vi", "gu", "mp")) %>%
+        epiprocess::as_epi_archive()
     }
   )
 )
diff --git a/man/confirm_sufficient_data.Rd b/man/confirm_sufficient_data.Rd
diff --git a/man/forecaster_pred.Rd b/man/forecaster_pred.Rd
diff --git a/run.R b/run.R
@@ -42,6 +42,8 @@ input: ") {
 project_selection <- readline_wrapper()
 external_scores_path <- readline_wrapper("path to RDS file containing external forecast scores, if desired:")
 
+debug_mode <- readline_wrapper("Would you like to run debug mode? (y/[N]): ")
+
 suppressPackageStartupMessages({
   library(targets)
   library(shiny)
@@ -79,7 +81,11 @@ tar_helper(
 )
 
 tar_manifest()
-tar_make()
+if (debug_mode == "y") {
+  tar_make(callr_function = NULL)
+} else {
+  tar_make()
+}
 # tar_make_clustermq(workers = 2) # nolint
 # tar_make_future(workers = 2) # nolint
 
diff --git a/tests/testthat/test-forecasters-basics.R b/tests/testthat/test-forecasters-basics.R
@@ -26,7 +26,7 @@ for (forecaster in forecasters) {
         "case_rate",
         c("death_rate"),
         -2L,
-        pop_scaling = FALSE
+        pop_scaling = FALSE,
       )
       expect_false(res_unscaled %>%
         full_join(res,

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
|  |  | `@@ -72,7 +72,8 @@ forecasts_and_scores_by_ahead <- tar_map(` |
| `72` | `72` | `forecaster = forecaster,` |
| `73` | `73` | `n_training_pad = 30L,` |
| `74` | `74` | `forecaster_args = params,` |
| `75` |  | `- forecaster_args_names = param_names` |
|  | `75` | `+ forecaster_args_names = param_names,` |
|  | `76` | `+ date_range_step_size = 7L` |
| `76` | `77` | `)` |
| `77` | `78` | `)` |
| `78` | `79` | `),` |

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
|  |  | `@@ -26,7 +26,7 @@ for (forecaster in forecasters) {` |
| `26` | `26` | `"case_rate",` |
| `27` | `27` | `c("death_rate"),` |
| `28` | `28` | `-2L,` |
| `29` |  | `- pop_scaling = FALSE` |
|  | `29` | `+ pop_scaling = FALSE,` |
| `30` | `30` | `)` |
| `31` | `31` | `expect_false(res_unscaled %>%` |
| `32` | `32` | `full_join(res,` |