cmu-delphi
diff --git a/‎NAMESPACE
Lines changed: 1 addition & 5 deletions b/‎NAMESPACE
Lines changed: 1 addition & 5 deletions
diff --git a/‎R/data_validation.R
Lines changed: 77 additions & 0 deletions b/‎R/data_validation.R
Lines changed: 77 additions & 0 deletions
diff --git a/‎R/epipredict_utilities.R
Lines changed: 90 additions & 0 deletions b/‎R/epipredict_utilities.R
Lines changed: 90 additions & 0 deletions
@@ -15,7 +15,6 @@ export(evaluate_predictions)
 export(extend_ahead)
 export(flatline_fc)
 export(forecaster_lookup)
-export(forecaster_pred)
 export(format_storage)
 export(id_ahead_ensemble_grid)
 export(interval_coverage)
@@ -28,7 +27,6 @@ export(make_shared_ensembles)
 export(make_shared_grids)
 export(make_target_ensemble_grid)
 export(make_target_param_grid)
-export(manage_S3_forecast_cache)
 export(overprediction)
 export(perform_sanity_checks)
 export(read_external_predictions_data)
@@ -37,11 +35,10 @@ export(run_workflow_and_format)
 export(scaled_pop)
 export(sharpness)
 export(single_id)
+export(slide_forecaster)
 export(underprediction)
 export(weighted_interval_score)
 importFrom(assertthat,assert_that)
-importFrom(aws.s3,get_bucket)
-importFrom(aws.s3,s3sync)
 importFrom(cli,cli_abort)
 importFrom(cli,hash_animal)
 importFrom(dplyr,across)
@@ -85,7 +82,6 @@ importFrom(epipredict,step_population_scaling)
 importFrom(epipredict,step_training_window)
 importFrom(epiprocess,as_epi_df)
 importFrom(epiprocess,epix_slide)
-importFrom(here,here)
 importFrom(magrittr,"%<>%")
 importFrom(magrittr,"%>%")
 importFrom(purrr,imap)
 
@@ -0,0 +1,77 @@
+#' helper function for those writing forecasters
+#' @description
+#' a smorgasbord of checks that any epipredict-based forecaster should do:
+#' 1. check that the args list is created correctly,
+#' 2. drop empty strings from the predictors (an empty extra sources list
+#'    arrives as an empty string),
+#' 3. validate the outcome and predictors as present,
+#' 4. make sure the trainer is a `regression` model from `parsnip`
+#' 5. adjust the trainer's quantiles based on those in args_list if it's a
+#'    quantile trainer
+#' 6. remake the lags to match the number of predictors
+#' @inheritParams scaled_pop
+#' @param predictors the full list of predictors including the outcome. can
+#'   include empty strings
+#' @param args_list the args list created by [`epipredict::arx_args_list`]
+#' @return an unnamed list of length three holding, in order, the updated
+#'   `args_list`, the `predictors` with empty strings removed, and the
+#'   (possibly updated) `trainer`
+#' @export
+perform_sanity_checks <- function(epi_data,
+                                  outcome,
+                                  predictors,
+                                  trainer,
+                                  args_list) {
+  if (!inherits(args_list, c("arx_fcast", "alist"))) {
+    cli::cli_abort("args_list was not created using `arx_args_list()`.")
+  }
+
+  # empty strings are placeholders, not real column names
+  predictors <- predictors[predictors != ""]
+  epipredict:::validate_forecaster_inputs(epi_data, outcome, predictors)
+
+  if (!is.null(trainer) && !epipredict:::is_regression(trainer)) {
+    # `trainer` is written literally and parsnip uses cli's `.pkg` markup;
+    # bare braces would be evaluated as R expressions by cli
+    cli::cli_abort(
+      "`trainer` must be a {.pkg parsnip} model of mode 'regression'."
+    )
+  } else if (inherits(trainer, "quantile_reg")) {
+    # add all quantile_levels to the trainer and update args list
+    quantile_levels <- sort(epipredict:::compare_quantile_args(
+      args_list$quantile_levels,
+      rlang::eval_tidy(trainer$args$quantile_levels)
+    ))
+    args_list$quantile_levels <- quantile_levels
+    trainer$args$quantile_levels <- rlang::enquo(quantile_levels)
+  }
+  args_list$lags <- epipredict:::arx_lags_validator(predictors, args_list$lags)
+  return(list(args_list, predictors, trainer))
+}
+
+#' confirm that there's enough data to run this model
+#' @description
+#' epipredict is a little bit fragile about having enough data to train; we want
+#'   to be able to return a null result rather than error out.
+#' @param epi_data the input data
+#' @param ahead the effective ahead; may be infinite if there isn't enough data.
+#' @param args_input the input as supplied to `slide_forecaster`; lags is the
+#'   important argument, which may or may not be defined, with the default
+#'   coming from `arx_args_list`. `lags` may be a single vector or a list of
+#'   vectors; the largest lag over all of them is used
+#' @param buffer how many training data to insist on having (e.g. if `buffer=1`,
+#'   this trains on one sample; the default is set so that `linear_reg` isn't
+#'   rank deficient)
+#' @return `TRUE` if `ahead` is finite and at least one `geo_value` has, after
+#'   dropping rows containing `NA`, at least `max lag + ahead + buffer` distinct
+#'   `time_value`s; `FALSE` otherwise
+#' @importFrom tidyr drop_na
+#' @export
+confirm_sufficient_data <- function(epi_data, ahead, args_input, buffer = 9) {
+  if (!is.null(args_input$lags)) {
+    # flatten first: `max()` errors on a list of per-predictor lag vectors
+    lag_max <- max(unlist(args_input$lags))
+  } else {
+    lag_max <- 14 # default value of 2 weeks
+  }
+
+  # TODO: Buffer should probably be 2 * n(lags) * n(predictors). But honestly,
+  # this needs to be fixed in epipredict itself, see
+  # https://github.com/cmu-delphi/epipredict/issues/106.
+
+  return(
+    !is.infinite(ahead) &&
+      epi_data %>%
+        drop_na() %>%
+        group_by(geo_value) %>%
+        summarise(
+          has_enough_data = n_distinct(time_value) >= lag_max + ahead + buffer
+        ) %>%
+        pull(has_enough_data) %>%
+        any()
+  )
+}
@@ -0,0 +1,90 @@
+# TODO replace with `step_arx_forecaster`
+#' add the default steps for arx_forecaster
+#' @description
+#' appends, in order: one lag step per entry of `args_list$lags` (paired by
+#' position with `predictors`), an ahead step for the outcome, an NA-omitting
+#' step, and a training-window step.
+#' @param rec an [`epipredict::epi_recipe`]
+#' @param outcome a character of the column to be predicted
+#' @param predictors a character vector of the columns used as predictors
+#' @param args_list an [`epipredict::arx_args_list`]; `lags`, `ahead` and
+#'   `n_training` are the entries read here
+#' @return the recipe `rec` with the steps above added
+#' @seealso [arx_postprocess] for the layer equivalent
+#' @importFrom epipredict step_epi_lag step_epi_ahead step_epi_naomit step_training_window
+#' @export
+arx_preprocess <- function(rec, outcome, predictors, args_list) {
+  # no validation here; the original author notes inputs are already validated
+  lag_sets <- args_list$lags
+  for (idx in seq_along(lag_sets)) {
+    predictor <- predictors[idx]
+    rec <- step_epi_lag(rec, !!predictor, lag = lag_sets[[idx]])
+  }
+  rec <- step_epi_ahead(rec, !!outcome, ahead = args_list$ahead)
+  rec <- step_epi_naomit(rec)
+  rec <- step_training_window(rec, n_recent = args_list$n_training)
+  rec
+}
+
+# TODO replace with `layer_arx_forecaster`
+#' add the default layers for arx_forecaster
+#' @description
+#' appends, in order: a prediction layer; quantile layers chosen by trainer
+#' type (quantile distribution plus point forecast for a `quantile_reg`
+#' trainer, residual quantiles otherwise); an optional threshold layer when
+#' `args_list$nonneg` is true; an NA-omitting layer; and a target date layer.
+#' @param postproc an [`epipredict::frosting`]
+#' @param trainer the trainer used (e.g. linear_reg() or quantile_reg())
+#' @param args_list an [`epipredict::arx_args_list`]
+#' @param forecast_date the date from which the forecast was made. NOTE: this
+#'   argument is accepted but is not referenced anywhere in the function body
+#' @param target_date the date about which the forecast was made. defaults to
+#'   the default of `layer_add_target_date`, which is either
+#'   `forecast_date+ahead`, or the `max time_value + ahead`
+#' @return the frosting `postproc` with the layers above added
+#' @seealso [arx_preprocess] for the step equivalent
+#' @importFrom epipredict layer_predict layer_quantile_distn layer_point_from_distn layer_residual_quantiles layer_threshold layer_naomit layer_add_target_date
+#' @export
+arx_postprocess <- function(postproc,
+                            trainer,
+                            args_list,
+                            forecast_date = NULL,
+                            target_date = NULL) {
+  postproc <- layer_predict(postproc)
+  if (!inherits(trainer, "quantile_reg")) {
+    # any other trainer: build quantiles from residuals
+    postproc <- layer_residual_quantiles(
+      postproc,
+      quantile_levels = args_list$quantile_levels,
+      symmetrize = args_list$symmetrize,
+      by_key = args_list$quantile_by_key
+    )
+  } else {
+    # quantile trainer: quantile distribution first, then a point forecast
+    postproc <- layer_quantile_distn(
+      postproc,
+      quantile_levels = args_list$quantile_levels
+    )
+    postproc <- layer_point_from_distn(postproc)
+  }
+  if (args_list$nonneg) {
+    postproc <- layer_threshold(postproc, dplyr::starts_with(".pred"))
+  }
+
+  postproc <- layer_naomit(postproc, dplyr::starts_with(".pred"))
+  postproc <- layer_add_target_date(postproc, target_date = target_date)
+  postproc
+}
+
+#' helper function to run a epipredict model and reformat to hub format
+#' @description
+#' fits an `epi_workflow` built from `preproc` and `trainer` on `epi_data`,
+#' attaches `postproc`, predicts on the test data derived from `epi_data`, and
+#' passes the predictions to `format_storage` together with a forecast date.
+#' the forecast date is the `as_of` entry of the data's metadata when present,
+#' and otherwise the largest `time_value` in `epi_data`.
+#' @param preproc the preprocessing steps
+#' @param postproc the postprocessing frosting
+#' @param trainer the parsnip trainer
+#' @param epi_data the actual epi_df to train on
+#' @return the result of `format_storage` applied to the predictions
+#' @export
+#' @importFrom epipredict epi_workflow fit add_frosting get_test_data
+run_workflow_and_format <- function(preproc, postproc, trainer, epi_data) {
+  fitted_workflow <- fit(epi_workflow(preproc, trainer), epi_data)
+  fitted_workflow <- add_frosting(fitted_workflow, postproc)
+  test_data <- get_test_data(recipe = preproc, x = epi_data)
+  predictions <- predict(fitted_workflow, test_data)
+  # the forecast_date may currently be the max time_value
+  true_forecast_date <- attributes(epi_data)$metadata$as_of
+  if (is.null(true_forecast_date)) {
+    true_forecast_date <- max(epi_data$time_value)
+  }
+  format_storage(predictions, true_forecast_date)
+}