cmu-delphi
diff --git a/‎R/arx_forecaster.R
Lines changed: 69 additions & 50 deletions b/‎R/arx_forecaster.R
Lines changed: 69 additions & 50 deletions
diff --git a/‎R/arx_forecaster_mod.R
Lines changed: 0 additions & 83 deletions b/‎R/arx_forecaster_mod.R
Lines changed: 0 additions & 83 deletions
diff --git a/‎musings/arx_forecaster_old.R
Lines changed: 65 additions & 0 deletions b/‎musings/arx_forecaster_old.R
Lines changed: 65 additions & 0 deletions
diff --git a/‎R/assign_arg_list.R renamed to ‎musings/assign_arg_list.R b/‎R/assign_arg_list.R renamed to ‎musings/assign_arg_list.R
diff --git a/‎R/create_lags_and_leads.R renamed to ‎musings/create_lags_and_leads.R b/‎R/create_lags_and_leads.R renamed to ‎musings/create_lags_and_leads.R
diff --git a/‎R/knn_iterative_ar_forecaster.R renamed to ‎musings/knn_iterative_ar_forecaster.R b/‎R/knn_iterative_ar_forecaster.R renamed to ‎musings/knn_iterative_ar_forecaster.R
diff --git a/‎R/knnarx_forecaster.R renamed to ‎musings/knnarx_forecaster.R b/‎R/knnarx_forecaster.R renamed to ‎musings/knnarx_forecaster.R
diff --git a/‎R/make_predictions.R renamed to ‎musings/make_predictions.R b/‎R/make_predictions.R renamed to ‎musings/make_predictions.R
diff --git a/‎R/probs_to_string.R renamed to ‎musings/probs_to_string.R b/‎R/probs_to_string.R renamed to ‎musings/probs_to_string.R
diff --git a/‎R/smooth_and_fit.R renamed to ‎musings/smooth_and_fit.R b/‎R/smooth_and_fit.R renamed to ‎musings/smooth_and_fit.R
diff --git a/‎R/smooth_arx_forecaster.R renamed to ‎musings/smooth_arx_forecaster.R b/‎R/smooth_arx_forecaster.R renamed to ‎musings/smooth_arx_forecaster.R
diff --git a/‎vignettes/epipredict.Rmd
Lines changed: 6 additions & 6 deletions b/‎vignettes/epipredict.Rmd
Lines changed: 6 additions & 6 deletions
@@ -1,68 +1,87 @@
-#' AR forecaster with optional covariates
+#' Direct autoregressive forecaster with covariates
 #'
-#' @param x Covariates. Allowed to be missing (resulting in AR on `y`).
-#' @param y Response.
-#' @param key_vars Factor(s). A prediction will be made for each unique
-#'   combination.
-#' @param time_value the time value associated with each row of measurements.
-#' @param args Additional arguments specifying the forecasting task. Created
-#'   by calling `arx_args_list()`.
+#' This is an autoregressive forecasting model for
+#' [epiprocess::epi_df] data. It does "direct" forecasting, meaning
+#' that it estimates a model for a particular target horizon.
 #'
-#' @return A data frame of point (and optionally interval) forecasts at a single
-#'   ahead (unique horizon) for each unique combination of `key_vars`.
+#'
+#' @param epi_data An `epi_df` object
+#' @param outcome A character (scalar) specifying the outcome (in the
+#'   `epi_df`).
+#' @param predictors A character vector giving column(s) of predictor
+#'   variables.
+#' @param trainer A `{parsnip}` model describing the type of estimation.
+#'   For now, we enforce `mode = "regression"`.
+#' @param args_list A list of customization arguments to determine
+#'   the type of forecasting model. See [arx_args_list()].
+#'
+#' @return A list with (1) `predictions`, an `epi_df` of predicted values,
+#'   and (2) `epi_workflow`, a list that encapsulates the entire estimation
+#'   workflow.
 #' @export
-arx_forecaster <- function(x, y, key_vars, time_value,
-                           args = arx_args_list()) {
+#'
+#' @examples
+#' jhu <- case_death_rate_subset %>%
+#'   dplyr::filter(time_value >= as.Date("2021-12-01"))
+#'
+#' out <- arx_forecaster(jhu, "death_rate",
+#'   c("case_rate", "death_rate"))
+arx_forecaster <- function(epi_data,
+                           outcome,
+                           predictors,
+                           trainer = parsnip::linear_reg(),
+                           args_list = arx_args_list()) {
 
-  # TODO: function to verify standard forecaster signature inputs
+  validate_forecaster_inputs(epi_data, outcome, predictors)
+  if (!is.list(trainer) || trainer$mode != "regression") {
+    # Refer to the argument by name rather than interpolating the object.
+    cli_stop("`trainer` must be a `parsnip` method of mode 'regression'.")
+  }
+  lags <- arx_lags_validator(predictors, args_list$lags)
 
-  assign_arg_list(args)
-  if (is.null(key_vars)) { # this is annoying/repetitive, seemingly necessary?
-    keys <- NULL
-    distinct_keys <- tibble(.dump = NA)
-  } else {
-    keys <- tibble::tibble(key_vars)
-    distinct_keys <- dplyr::distinct(keys)
+  r <- epi_recipe(epi_data)
+  for (l in seq_along(lags)) {
+    p <- predictors[l]
+    r <- step_epi_lag(r, !!p, lag = lags[[l]])
   }
+  r <- r %>%
+    step_epi_ahead(dplyr::all_of(!!outcome), ahead = args_list$ahead) %>%
+    step_epi_naomit()
+  # should limit the training window here (in an open PR)
+  # What to do if insufficient training data? Add issue.
 
-  # Return NA if insufficient training data
-  if (length(y) < min_train_window + max_lags + ahead) {
-    qnames <- probs_to_string(levels)
-    out <- dplyr::bind_cols(distinct_keys, point = NA) %>%
-      dplyr::select(!dplyr::any_of(".dump"))
-    return(enframer(out, qnames))
-  }
+  forecast_date <- args_list$forecast_date %||% max(epi_data$time_value)
+  # `%||%` binds tighter than `+`, so the default must be parenthesized;
+  # otherwise `ahead` is also added to a user-supplied `target_date`.
+  target_date <- args_list$target_date %||% (forecast_date + args_list$ahead)
+  f <- frosting() %>%
+    layer_predict() %>%
+    # layer_naomit(.pred) %>%
+    layer_residual_quantiles(
+      probs = args_list$levels,
+      symmetrize = args_list$symmetrize) %>%
+    layer_add_forecast_date(forecast_date = forecast_date) %>%
+    layer_add_target_date(target_date = target_date)
+  if (args_list$nonneg) f <- layer_threshold(f, dplyr::starts_with(".pred"))
 
-  dat <- create_lags_and_leads(x, y, lags, ahead, time_value, keys)
-  dat$x0 <- 1
+  latest <- get_test_data(r, epi_data)
 
-  obj <- stats::lm(
-    y1 ~ . + 0,
-    data = dat %>% dplyr::select(starts_with(c("x", "y")))
+  wf <- epi_workflow(r, trainer, f) %>% generics::fit(epi_data)
+  list(
+    predictions = predict(wf, new_data = latest),
+    epi_workflow = wf
   )
+}
 
-  point <- make_predictions(obj, dat, time_value, keys)
-
-  # Residuals, simplest case, requires
-  # 1. same quantiles for all keys
-  # 2. `residuals(obj)` works
-  r <- residuals(obj)
-  q <- residual_quantiles(r, point, levels, symmetrize)
 
-  # Harder case requires handling failures of 1 and or 2, neither implemented
-  # 1. different quantiles by key, need to bind the keys, then group_modify
-  # 2 fails. need to bind the keys, grab, y and yhat, subtract
-  if (nonneg) {
-    q <- dplyr::mutate(q, dplyr::across(dplyr::everything(), ~ pmax(.x, 0)))
+arx_lags_validator <- function(predictors, lags) {
+  # Returns `lags` as a list with one entry per predictor. A non-list or a
+  # length-one list is recycled to all predictors; any other length must
+  # equal the number of predictors (callers index `predictors` by position).
+  p <- length(predictors)
+  if (!is.list(lags)) lags <- list(lags)
+  if (length(lags) == 1) lags <- rep(lags, p)
+  else if (length(lags) != p) {
+    cli_stop(
+      "You have requested {p} predictors but lags cannot be recycled to match."
+    )
   }
-
-  return(
-    dplyr::bind_cols(distinct_keys, q) %>%
-      dplyr::select(!dplyr::any_of(".dump"))
-  )
+  lags
 }
 
-
 #' ARX forecaster argument constructor
 #'
 #' Constructs a list of arguments for [arx_forecaster()].
 
@@ -0,0 +1,65 @@
+#' AR forecaster with optional covariates
+#'
+#' @param x Covariates. Allowed to be missing (resulting in AR on `y`).
+#' @param y Response.
+#' @param key_vars Factor(s), or `NULL`. A prediction will be made for each
+#'   unique combination.
+#' @param time_value The time value associated with each row of measurements.
+#' @param args Additional arguments specifying the forecasting task. Created
+#'   by calling `arx_args_list()`.
+#'
+#' @return A data frame of point (and optionally interval) forecasts at a single
+#'   ahead (unique horizon) for each unique combination of `key_vars`.
+#' @export
+arx_forecaster <- function(x, y, key_vars, time_value,
+                           args = arx_args_list()) {
+
+  # TODO: add a function that verifies the standard forecaster inputs
+
+  assign_arg_list(args) # presumably binds `args` entries as locals; verify
+  if (is.null(key_vars)) { # no keys: use a `.dump` placeholder, dropped later
+    keys <- NULL
+    distinct_keys <- tibble(.dump = NA)
+  } else {
+    keys <- tibble::tibble(key_vars)
+    distinct_keys <- dplyr::distinct(keys)
+  }
+
+  # Return NA forecasts when there is insufficient training data
+  if (length(y) < min_train_window + max_lags + ahead) {
+    qnames <- probs_to_string(levels)
+    out <- dplyr::bind_cols(distinct_keys, point = NA) %>%
+      dplyr::select(!dplyr::any_of(".dump"))
+    return(enframer(out, qnames))
+  }
+
+  dat <- create_lags_and_leads(x, y, lags, ahead, time_value, keys)
+  dat$x0 <- 1
+
+  obj <- stats::lm(
+    y1 ~ . + 0,
+    data = dat %>% dplyr::select(starts_with(c("x", "y")))
+  )
+
+  point <- make_predictions(obj, dat, time_value, keys)
+
+  # Residual-based quantiles. This simplest case requires that
+  # 1. the same quantiles are used for all keys, and
+  # 2. `residuals(obj)` works.
+  r <- residuals(obj)
+  q <- residual_quantiles(r, point, levels, symmetrize)
+
+  # The harder cases, where 1 and/or 2 fail, are not implemented:
+  # if 1 fails (quantiles differ by key), bind the keys, then `group_modify`;
+  # if 2 fails, bind the keys, grab y and yhat, and subtract.
+  if (nonneg) {
+    q <- dplyr::mutate(q, dplyr::across(dplyr::everything(), ~ pmax(.x, 0)))
+  }
+
+  return(
+    dplyr::bind_cols(distinct_keys, q) %>%
+      dplyr::select(!dplyr::any_of(".dump"))
+  )
+}
+
+
@@ -103,7 +103,7 @@ We'll estimate the model jointly across all locations using only the most recent
 
 ```{r demo-workflow}
 jhu <- jhu %>% filter(time_value >= max(time_value) - 30)
-out <- arx_epi_forecaster(jhu, outcome = "death_rate",
+out <- arx_forecaster(jhu, outcome = "death_rate",
   predictors = c("case_rate", "death_rate")
 )
 ```
@@ -131,7 +131,7 @@ knitr::opts_chunk$set(warning = FALSE, message = FALSE)
 ```
 
 ```{r differential-lags}
-out2week <- arx_epi_forecaster(jhu, "death_rate", c("case_rate", "death_rate"),
+out2week <- arx_forecaster(jhu, "death_rate", c("case_rate", "death_rate"),
   args_list = arx_args_list(
     lags = list(c(0,1,2,3,7,14), c(0,7,14)),
     ahead = 14)
@@ -145,7 +145,7 @@ Here, we've used different lags on the `case_rate` and are now predicting 2 week
 Another property of the basic model is the predictive interval. We describe this in more detail in a different vignette, but it is easy to request multiple quantiles.
 
 ```{r differential-levels}
-out_q <- arx_epi_forecaster(jhu, "death_rate", c("case_rate", "death_rate"),
+out_q <- arx_forecaster(jhu, "death_rate", c("case_rate", "death_rate"),
   args_list = arx_args_list(
     levels = c(.01,.025, seq(.05,.95, by=.05), .975,.99))
   )
@@ -183,14 +183,14 @@ The `trainer` argument determines the type of model we want.
 This takes a [`{parsnip}`](https://parsnip.tidymodels.org) model. The default is linear regression, but we could instead use a random forest with the `{ranger}` package:
 
 ```{r ranger, warning = FALSE}
-out_rf <- arx_epi_forecaster(jhu, "death_rate", c("case_rate", "death_rate"),
+out_rf <- arx_forecaster(jhu, "death_rate", c("case_rate", "death_rate"),
   rand_forest(mode = "regression"))
 ```
 
 Or boosted regression trees with `{xgboost}`:
 
 ```{r xgboost, warning = FALSE}
-out_gb <- arx_epi_forecaster(jhu, "death_rate", c("case_rate", "death_rate"),
+out_gb <- arx_forecaster(jhu, "death_rate", c("case_rate", "death_rate"),
   boost_tree(mode = "regression", trees = 20))
 ```
 
@@ -290,7 +290,7 @@ To stretch the metaphor of preparing a cake to its natural limits, we have
 created postprocessing functionality called "frosting". Much like the recipe,
 each postprocessing operation is a "layer" and we "slather" these onto our 
 baked cake. To fix ideas, below is the postprocessing `frosting` for 
-`arx_epi_forecaster()`
+`arx_forecaster()`
 
 ```{r}
 extract_frosting(out_q$epi_workflow)