From 8a1e2d624adbaed5c031b00261f317c0f93f92b4 Mon Sep 17 00:00:00 2001 From: "Daniel J. McDonald" Date: Thu, 14 Sep 2023 16:18:30 -0700 Subject: [PATCH 01/22] start CDC baseline layer --- R/layer_cdc_flatline_quantiles.R | 102 +++++++++++++++++++++++++++++++ 1 file changed, 102 insertions(+) create mode 100644 R/layer_cdc_flatline_quantiles.R diff --git a/R/layer_cdc_flatline_quantiles.R b/R/layer_cdc_flatline_quantiles.R new file mode 100644 index 000000000..0e2fa4f1b --- /dev/null +++ b/R/layer_cdc_flatline_quantiles.R @@ -0,0 +1,102 @@ +layer_cdc_flatline_quantiles <- function( + frosting, + ..., + aheads = 1:4, + quantiles = c(.01, .025, 1:19 / 20, .975, .99), + nsims = 1e5, + by_key = "geo_value", + symmetrize = FALSE, + id = rand_id("cdc_baseline_bands")) { + + rlang::check_dots_empty() + + arg_is_int(aheads) + arg_is_probabilities(quantiles) + arg_is_pos_int(nsims) + arg_is_scalar(nsims) + arg_is_chr_scalar(id) + arg_is_lgl_scalar(symmetrize) + arg_is_chr(by_key, allow_null = TRUE, allow_na = TRUE, allow_empty = TRUE) + + add_layer( + frosting, + layer_cdc_flatline_quantiles_new( + aheads = aheads, + quantiles = quantiles, + nsims = nsims, + by_key = by_key, + symmetrize = symmetrize, + id = id + ) + ) +} + +layer_cdc_flatline_quantiles_new <- function( + aheads, + quantiles, + nsims, + by_key, + symmetrize, + id +) { + layer( + "cdc_flatline_quantiles", + aheads, + quantiles, + nsims, + by_key, + symmetrize, + id + ) +} + +#' @export +slather.layer_cdc_flatline_quantiles <- + function(object, components, workflow, new_data, ...) { + the_fit <- workflows::extract_fit_parsnip(workflow) + s <- ifelse(object$symmetrize, -1, NA) + r <- grab_residuals(the_fit, components) + + ## Handle any grouping requests + if (length(object$by_key) > 0L) { + key_cols <- dplyr::bind_cols( + geo_value = components$mold$extras$roles$geo_value, + components$mold$extras$roles$key + ) + common <- intersect(object$by_key, names(key_cols)) + excess <- setdiff(object$by_key, names(key_cols)) + if (length(excess) > 0L) { + rlang::warn( + "Requested residual grouping key(s) {excess} are unavailable ", + "in the original data. Grouping by the remainder: {common}." + ) + } + if (length(common) > 0L) { + r <- r %>% dplyr::select(tidyselect::any_of(c(common, ".resid"))) + common_in_r <- common[common %in% names(r)] + if (length(common_in_r) != length(common)) { + rlang::warn( + "Some grouping keys are not in data.frame returned by the", + "`residuals()` method. Groupings may not be correct." + ) + } + r <- dplyr::bind_cols(key_cols, r) %>% + dplyr::group_by(!!!rlang::syms(common)) + } + } + + + + + + + # always return components + components + } + +propogate_samples <- function(x, p, horizon, nsim, symmetrize) { + samp <- quantile(x, probs = c(0, seq_len(nsim)) / nsim) + + for (iter in seq(horizon)) {} +} + From cea1599dfad47e77c8a8026a66b16963ed7c86dd Mon Sep 17 00:00:00 2001 From: "Daniel J. McDonald" Date: Fri, 22 Sep 2023 10:33:18 -0700 Subject: [PATCH 02/22] upgrade enframer --- R/utils-enframer.R | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/R/utils-enframer.R b/R/utils-enframer.R index 387d04356..0a8152906 100644 --- a/R/utils-enframer.R +++ b/R/utils-enframer.R @@ -13,10 +13,11 @@ enframer <- function(df, x, fill = NA) { } enlist <- function(...) { - # in epiprocess - x <- list(...) - n <- as.character(sys.call())[-1] - if (!is.null(n0 <- names(x))) n[n0 != ""] <- n0[n0 != ""] - names(x) <- n - x + # converted to thin wrapper around + rlang::dots_list( + ..., + .homonyms = "error", + .named = TRUE, + .check_assign = TRUE + ) } From d606741cb05d8546f82ca057e5515a53b19b2a05 Mon Sep 17 00:00:00 2001 From: "Daniel J. McDonald" Date: Sat, 23 Sep 2023 16:33:50 -0700 Subject: [PATCH 03/22] functions, remains to check validity --- R/compat-purrr.R | 5 + R/layer_cdc_flatline_quantiles.R | 144 +++++++++++++++++------- R/layer_residual_quantiles.R | 8 +- tests/testthat/test-propogate_samples.R | 8 ++ tests/testthat/test-shuffle.R | 5 + 5 files changed, 128 insertions(+), 42 deletions(-) create mode 100644 tests/testthat/test-propogate_samples.R create mode 100644 tests/testthat/test-shuffle.R diff --git a/R/compat-purrr.R b/R/compat-purrr.R index 7e28bd630..712926f73 100644 --- a/R/compat-purrr.R +++ b/R/compat-purrr.R @@ -32,6 +32,11 @@ map_chr <- function(.x, .f, ...) { .rlang_purrr_map_mold(.x, .f, character(1), ...) } +map_vec <- function(.x, .f, ...) { + out <- map(.x, .f, ...) + vctrs::list_unchop(out) +} + map_dfr <- function(.x, .f, ..., .id = NULL) { .f <- rlang::as_function(.f, env = rlang::global_env()) res <- map(.x, .f, ...) diff --git a/R/layer_cdc_flatline_quantiles.R b/R/layer_cdc_flatline_quantiles.R index 0e2fa4f1b..c64b96c2d 100644 --- a/R/layer_cdc_flatline_quantiles.R +++ b/R/layer_cdc_flatline_quantiles.R @@ -6,6 +6,7 @@ layer_cdc_flatline_quantiles <- function( nsims = 1e5, by_key = "geo_value", symmetrize = FALSE, + nonneg = TRUE, id = rand_id("cdc_baseline_bands")) { rlang::check_dots_empty() @@ -15,7 +16,7 @@ layer_cdc_flatline_quantiles <- function( arg_is_pos_int(nsims) arg_is_scalar(nsims) arg_is_chr_scalar(id) - arg_is_lgl_scalar(symmetrize) + arg_is_lgl_scalar(symmetrize, nonneg) arg_is_chr(by_key, allow_null = TRUE, allow_na = TRUE, allow_empty = TRUE) add_layer( @@ -26,6 +27,7 @@ layer_cdc_flatline_quantiles <- function( nsims = nsims, by_key = by_key, symmetrize = symmetrize, + nonneg = nonneg, id = id ) ) @@ -37,16 +39,18 @@ layer_cdc_flatline_quantiles_new <- function( nsims, by_key, symmetrize, + nonneg, id ) { layer( "cdc_flatline_quantiles", - aheads, - quantiles, - nsims, - by_key, - symmetrize, - id + aheads = aheads, + quantiles = quantiles, + nsims = nsims, + by_key = by_key, + symmetrize = symmetrize, + nonneg = nonneg, + id = id ) } @@ -54,49 +58,113 @@ layer_cdc_flatline_quantiles_new <- function( slather.layer_cdc_flatline_quantiles <- function(object, components, workflow, new_data, ...) { the_fit <- workflows::extract_fit_parsnip(workflow) - s <- ifelse(object$symmetrize, -1, NA) + if (!inherits(the_fit, "_flatline")) { + cli::cli_warn( + c("Predictions for this workflow were not produced by the {.cls flatline}", + "{.pkg parsnip} engine. Results may be unexpected. See {.fn epipredict::flatline}.") + ) + } + p <- components$predictions + ek <- kill_time_value(epi_keys_mold(components$mold)) r <- grab_residuals(the_fit, components) - ## Handle any grouping requests + avail_grps <- character(0L) if (length(object$by_key) > 0L) { - key_cols <- dplyr::bind_cols( - geo_value = components$mold$extras$roles$geo_value, - components$mold$extras$roles$key - ) - common <- intersect(object$by_key, names(key_cols)) - excess <- setdiff(object$by_key, names(key_cols)) - if (length(excess) > 0L) { - rlang::warn( - "Requested residual grouping key(s) {excess} are unavailable ", - "in the original data. Grouping by the remainder: {common}." - ) + cols_in_preds <- hardhat::check_column_names(p, object$by_key) + if (!cols_in_preds$ok) { + cli::cli_warn(c( + "Predicted values are missing key columns: {.var cols_in_preds$missing_names}.", + "Ignoring these." + )) } - if (length(common) > 0L) { - r <- r %>% dplyr::select(tidyselect::any_of(c(common, ".resid"))) - common_in_r <- common[common %in% names(r)] - if (length(common_in_r) != length(common)) { - rlang::warn( - "Some grouping keys are not in data.frame returned by the", - "`residuals()` method. Groupings may not be correct." - ) + if (inherits(the_fit, "_flatline")) { + cols_in_resids <- hardhat::check_column_names(r, object$by_key) + if (!cols_in_resids$ok) { + cli::cli_warn(c( + "Existing residuals are missing key columns: {.var cols_in_resids$missing_names}.", + "Ignoring these." + )) + } + # use only the keys that are in the predictions and requested. + avail_grps <- intersect(ek, setdiff( + object$by_key, + c(cols_in_preds$missing_names, cols_in_resids$missing_names) + )) + } else { # not flatline, but we'll try + key_cols <- dplyr::bind_cols( + geo_value = components$mold$extras$roles$geo_value, + components$mold$extras$roles$key + ) + cols_in_resids <- hardhat::check_column_names(key_cols, object$by_key) + if (!cols_in_resids$ok) { + cli::cli_warn(c( + "Requested residuals are missing key columns: {.var cols_in_resids$missing_names}.", + "Ignoring these." + )) } - r <- dplyr::bind_cols(key_cols, r) %>% - dplyr::group_by(!!!rlang::syms(common)) + avail_grps <- intersect(names(key_cols), setdiff( + object$by_key, + c(cols_in_preds$missing_names, cols_in_resids$missing_names) + )) + r <- dplyr::bind_cols(key_cols, r) } } + r <- r %>% + dplyr::select(tidyselect::all_of(c(avail_grps, ".resid"))) %>% + dplyr::group_by(!!!rlang::syms(avail_grps)) %>% + dplyr::summarise(.resid = list(.resid), .groups = "drop") + res <- dplyr::left_join(p, r, by = avail_grps) %>% + dplyr::rowwise() %>% + dplyr::mutate( + .pred_distn_all = propogate_samples( + .resid, .pred, object$quantiles, + object$aheads, object$nsim, object$symmetrize, object$nonneg + ) + ) %>% + dplyr::select(tidyselect::all_of(c(avail_grps, ".pred_distn_all"))) - - - - - # always return components + # res <- check_pname(res, components$predictions, object) + components$predictions <- dplyr::left_join( + components$predictions, + res, + by = avail_grps + ) components } -propogate_samples <- function(x, p, horizon, nsim, symmetrize) { - samp <- quantile(x, probs = c(0, seq_len(nsim)) / nsim) +propogate_samples <- function( + r, p, quantiles, aheads, nsim, symmetrize, nonneg) { + max_ahead <- max(aheads) + samp <- quantile(r, probs = c(0, seq_len(nsim - 1)) / (nsim - 1), na.rm = TRUE) + res <- list() - for (iter in seq(horizon)) {} + # p should be all the same + p <- max(p, na.rm = TRUE) + + raw <- samp + p + if (nonneg) raw <- pmax(0, raw) + res[[1]] <- raw + if (max_ahead > 1L) { + for (iter in 2:max_ahead) { + samp <- shuffle(samp) + raw <- raw + samp + if (symmetrize) symmetric <- raw - (median(raw) + p) + else symmetric <- raw + if (nonneg) symmetric <- pmax(0, symmetric) + res[[iter]] <- symmetric + } + } + res <- res[aheads] + list(tibble::tibble( + aheads = aheads, + .pred_distn = map_vec( + res, ~ dist_quantiles(quantile(.x, quantiles), tau = quantiles) + ) + )) } +shuffle <- function(x) { + stopifnot(is.vector(x)) + sample(x, length(x), replace = FALSE) +} diff --git a/R/layer_residual_quantiles.R b/R/layer_residual_quantiles.R index a9a8cab24..b9a71e265 100644 --- a/R/layer_residual_quantiles.R +++ b/R/layer_residual_quantiles.R @@ -141,8 +141,8 @@ grab_residuals <- function(the_fit, components) { if (".resid" %in% names(r)) { # success return(r) } else { # failure - rlang::warn(c( - "The `residuals()` method for objects of class {cl} results in", + cli::cli_warn(c( + "The `residuals()` method for {.cls cl} objects results in", "a data frame without a column named `.resid`.", i = "Residual quantiles will be calculated directly from the", i = "difference between predictions and observations.", @@ -152,8 +152,8 @@ grab_residuals <- function(the_fit, components) { } else if (is.vector(drop(r))) { # also success return(tibble(.resid = drop(r))) } else { # failure - rlang::warn(c( - "The `residuals()` method for objects of class {cl} results in an", + cli::cli_warn(c( + "The `residuals()` method for {.cls cl} objects results in an", "object that is neither a data frame with a column named `.resid`,", "nor something coercible to a vector.", i = "Residual quantiles will be calculated directly from the", diff --git a/tests/testthat/test-propogate_samples.R b/tests/testthat/test-propogate_samples.R new file mode 100644 index 000000000..3b02404b6 --- /dev/null +++ b/tests/testthat/test-propogate_samples.R @@ -0,0 +1,8 @@ +test_that("propogate_samples", { + r <- -30:50 + p <- 40 + quantiles <- 1:9 / 10 + aheads <- c(2, 4, 7) + nsim <- 100 + +}) diff --git a/tests/testthat/test-shuffle.R b/tests/testthat/test-shuffle.R new file mode 100644 index 000000000..94bc1aa3b --- /dev/null +++ b/tests/testthat/test-shuffle.R @@ -0,0 +1,5 @@ +test_that("shuffle works", { + expect_error(shuffle(matrix(NA, 2, 2))) + expect_length(shuffle(1:10), 10L) + expect_identical(sort(shuffle(1:10)), 1:10) +}) From 7294c000fba0fc2a3dc2298ce654a70879a58627 Mon Sep 17 00:00:00 2001 From: "Daniel J. McDonald" Date: Sun, 24 Sep 2023 11:19:09 -0700 Subject: [PATCH 04/22] correct symmetrization, enhance documentation of the "ahead" param in `flatline_forecaster()`. --- R/flatline_forecaster.R | 6 ++++++ R/layer_cdc_flatline_quantiles.R | 4 ++-- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/R/flatline_forecaster.R b/R/flatline_forecaster.R index e437f50ea..197c8cca5 100644 --- a/R/flatline_forecaster.R +++ b/R/flatline_forecaster.R @@ -94,6 +94,12 @@ flatline_forecaster <- function( #' Constructs a list of arguments for [flatline_forecaster()]. #' #' @inheritParams arx_args_list +#' @param ahead Integer. Unlike [arx_forecaster()], this doesn't have any effect +#' on the predicted values. Predictions are always the most recent observation. +#' However, this _does_ impact the residuals stored in the object. Residuals +#' are calculated based on this number to mimic how badly you would have done. +#' So for example, `ahead = 7` will create residuals by comparing values +#' 7 days apart. #' #' @return A list containing updated parameter choices with class `flatline_alist`. #' @export diff --git a/R/layer_cdc_flatline_quantiles.R b/R/layer_cdc_flatline_quantiles.R index c64b96c2d..3f178f6da 100644 --- a/R/layer_cdc_flatline_quantiles.R +++ b/R/layer_cdc_flatline_quantiles.R @@ -149,7 +149,7 @@ propogate_samples <- function( for (iter in 2:max_ahead) { samp <- shuffle(samp) raw <- raw + samp - if (symmetrize) symmetric <- raw - (median(raw) + p) + if (symmetrize) symmetric <- raw - (median(raw) - p) else symmetric <- raw if (nonneg) symmetric <- pmax(0, symmetric) res[[iter]] <- symmetric @@ -157,7 +157,7 @@ propogate_samples <- function( } res <- res[aheads] list(tibble::tibble( - aheads = aheads, + ahead = aheads, .pred_distn = map_vec( res, ~ dist_quantiles(quantile(.x, quantiles), tau = quantiles) ) From f18e88f7b45602c4bbdf4a26d32252f2203485b2 Mon Sep 17 00:00:00 2001 From: "Daniel J. McDonald" Date: Sun, 24 Sep 2023 11:29:14 -0700 Subject: [PATCH 05/22] better defaults, cli, pred is scalar in propagate_samples --- R/layer_cdc_flatline_quantiles.R | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/R/layer_cdc_flatline_quantiles.R b/R/layer_cdc_flatline_quantiles.R index 3f178f6da..f2b55e5ec 100644 --- a/R/layer_cdc_flatline_quantiles.R +++ b/R/layer_cdc_flatline_quantiles.R @@ -3,7 +3,7 @@ layer_cdc_flatline_quantiles <- function( ..., aheads = 1:4, quantiles = c(.01, .025, 1:19 / 20, .975, .99), - nsims = 1e5, + nsims = 1e3, by_key = "geo_value", symmetrize = FALSE, nonneg = TRUE, @@ -73,7 +73,7 @@ slather.layer_cdc_flatline_quantiles <- cols_in_preds <- hardhat::check_column_names(p, object$by_key) if (!cols_in_preds$ok) { cli::cli_warn(c( - "Predicted values are missing key columns: {.var cols_in_preds$missing_names}.", + "Predicted values are missing key columns: {.val {cols_in_preds$missing_names}}.", "Ignoring these." )) } @@ -81,7 +81,7 @@ slather.layer_cdc_flatline_quantiles <- cols_in_resids <- hardhat::check_column_names(r, object$by_key) if (!cols_in_resids$ok) { cli::cli_warn(c( - "Existing residuals are missing key columns: {.var cols_in_resids$missing_names}.", + "Existing residuals are missing key columns: {.val {cols_in_resids$missing_names}}.", "Ignoring these." )) } @@ -98,7 +98,7 @@ slather.layer_cdc_flatline_quantiles <- cols_in_resids <- hardhat::check_column_names(key_cols, object$by_key) if (!cols_in_resids$ok) { cli::cli_warn(c( - "Requested residuals are missing key columns: {.var cols_in_resids$missing_names}.", + "Requested residuals are missing key columns: {.val {cols_in_resids$missing_names}}.", "Ignoring these." )) } @@ -139,9 +139,6 @@ propogate_samples <- function( samp <- quantile(r, probs = c(0, seq_len(nsim - 1)) / (nsim - 1), na.rm = TRUE) res <- list() - # p should be all the same - p <- max(p, na.rm = TRUE) - raw <- samp + p if (nonneg) raw <- pmax(0, raw) res[[1]] <- raw From d6a28f371d0f8436b436da2b511a8e02816094c6 Mon Sep 17 00:00:00 2001 From: "Daniel J. McDonald" Date: Sun, 24 Sep 2023 16:40:31 -0700 Subject: [PATCH 06/22] redocument --- NAMESPACE | 2 + R/layer_cdc_flatline_quantiles.R | 94 +++++++++++++++++++++ man/add_frosting.Rd | 4 +- man/arx_fcast_epi_workflow.Rd | 12 ++- man/arx_forecaster.Rd | 12 ++- man/create_layer.Rd | 6 +- man/dist_quantiles.Rd | 2 +- man/extrapolate_quantiles.Rd | 8 +- man/fit-epi_workflow.Rd | 2 +- man/flatline.Rd | 6 +- man/flatline_args_list.Rd | 8 +- man/frosting.Rd | 4 +- man/layer_add_forecast_date.Rd | 12 +-- man/layer_add_target_date.Rd | 6 +- man/layer_cdc_flatline_quantiles.Rd | 125 ++++++++++++++++++++++++++++ man/layer_population_scaling.Rd | 29 ++++--- man/layer_predict.Rd | 6 +- man/nested_quantiles.Rd | 4 +- man/smooth_quantile_reg.Rd | 27 +++--- man/step_epi_shift.Rd | 2 +- man/step_growth_rate.Rd | 4 +- man/step_lag_difference.Rd | 4 +- man/step_population_scaling.Rd | 26 +++--- man/step_training_window.Rd | 9 +- 24 files changed, 340 insertions(+), 74 deletions(-) create mode 100644 man/layer_cdc_flatline_quantiles.Rd diff --git a/NAMESPACE b/NAMESPACE index b22ec53a5..e361dec00 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -79,6 +79,7 @@ S3method(residuals,flatline) S3method(run_mold,default_epi_recipe_blueprint) S3method(slather,layer_add_forecast_date) S3method(slather,layer_add_target_date) +S3method(slather,layer_cdc_flatline_quantiles) S3method(slather,layer_naomit) S3method(slather,layer_point_from_distn) S3method(slather,layer_population_scaling) @@ -131,6 +132,7 @@ export(is_layer) export(layer) export(layer_add_forecast_date) export(layer_add_target_date) +export(layer_cdc_flatline_quantiles) export(layer_naomit) export(layer_point_from_distn) export(layer_population_scaling) diff --git a/R/layer_cdc_flatline_quantiles.R b/R/layer_cdc_flatline_quantiles.R index f2b55e5ec..afee37577 100644 --- a/R/layer_cdc_flatline_quantiles.R +++ b/R/layer_cdc_flatline_quantiles.R @@ -1,3 +1,97 @@ +#' CDC Flatline Forecast Quantiles +#' +#' This layer creates quantile forecasts by taking a sample from the +#' interpolated CDF of the flatline residuals, and shuffling them. +#' These are then added on to the point prediction. +#' +#' @details +#' This layer is intended to be used in concert with [flatline()]. But it can +#' also be used with anything else. As long as residuals are available in the +#' the fitted model, this layer could be useful. Like +#' [layer_residual_quantiles()] it only uses the residuals for the fitted model +#' object. However, it propagates these forward for *all* aheads, by +#' iteratively shuffling them (randomly), and then adding them to the previous +#' set. This is in contrast to what happens with the [flatline_forecaster()]. +#' When using [flatline()] as the underlying engine (here), both will result in the +#' same predictions (the most recent observed value), but that model calculates +#' separate residuals for each `ahead` by comparing to observations further into +#' the future. This version continues to use the same set of residuals, and +#' adds them on to produce wider intervals as `ahead` increases. +#' +#' +#' @inheritParams layer_residual_quantiles +#' @param aheads Numeric vector of desired forecast horizons. These should be +#' given in the "units of the training data". So, for example, for data +#' typically observed daily (possibly with missing values), but +#' with weekly forecast targets, you would use `c(7, 14, 21, 28)`. But with +#' weekly data, you would use `1:4`. +#' @param quantiles Numeric vector of probabilities with values in (0,1) +#' referring to the desired predictive intervals. The default is the standard +#' set for the COVID Forecast Hub. +#' @param nsims Positive integer. The number of draws from the empirical CDF. +#' These samples are spaced evenly on the (0, 1) scale, F_X(x) resulting +#' in linear interpolation on the X scale. This is achieved with +#' [stats::quantile()] Type 7 (the default for that function). +#' @param nonneg Logical. Force all predictive intervals be non-negative. +#' Because non-negativity is forced _before_ propagating forward, this +#' has slightly different behaviour than would occur if using +#' [layer_threshold_preds()]. +#' +#' @return an updated `frosting` postprocessor. Calling [predict()] will result +#' in an additional `` named `.pred_distn_all` containing 2-column +#' [tibble::tibble()]'s. For each +#' desired combination of `key`'s, the tibble will contain one row per ahead +#' with the associated [dist_quantiles()]. +#' @export +#' +#' @examples +#' r <- epi_recipe(case_death_rate_subset) %>% +#' # data is "daily", so we fit this to 1 ahead, the result will contain +#' # 1 day ahead residuals +#' step_epi_ahead(death_rate, ahead = 1L, skip = TRUE) %>% +#' recipes::update_role(death_rate, new_role = "predictor") %>% +#' recipes::add_role(time_value, geo_value, new_role = "predictor") +#' +#' forecast_date <- max(case_death_rate_subset$time_value) +#' +#' latest <- get_test_data( +#' epi_recipe(case_death_rate_subset), case_death_rate_subset +#' ) +#' +#' f <- frosting() %>% +#' layer_predict() %>% +#' layer_cdc_flatline_quantiles(aheads = c(7, 14, 21, 28), symmetrize = TRUE) +#' +#' eng <- parsnip::linear_reg() %>% parsnip::set_engine("flatline") +#' +#' wf <- epi_workflow(r, eng, f) %>% fit(case_death_rate_subset) +#' preds <- suppressWarnings(predict(wf, new_data = latest)) %>% +#' dplyr::select(-time_value) %>% +#' dplyr::mutate(forecast_date = forecast_date) +#' preds +#' +#' preds <- preds %>% +#' unnest(.pred_distn_all) %>% +#' pivot_quantiles(.pred_distn) %>% +#' mutate(target_date = forecast_date + ahead) +#' +#' library(ggplot2) +#' four_states <- c("ca", "pa", "wa", "ny") +#' preds %>% +#' filter(geo_value %in% four_states) %>% +#' ggplot(aes(target_date)) + +#' geom_ribbon(aes(ymin = `0.1`, ymax = `0.9`), fill = blues9[3]) + +#' geom_ribbon(aes(ymin = `0.25`, ymax = `0.75`), fill = blues9[6]) + +#' geom_line(aes(y = .pred), color = "orange") + +#' geom_line(data = case_death_rate_subset %>% filter(geo_value %in% four_states), +#' aes(x = time_value, y = death_rate)) + +#' scale_x_date(limits = c(forecast_date - 90, forecast_date + 30)) + +#' labs(x = "Date", y = "Death rate") + +#' facet_wrap(~geo_value, scales = "free_y") + +#' theme_bw() + +#' geom_vline(xintercept = forecast_date) +#' +#' layer_cdc_flatline_quantiles <- function( frosting, ..., diff --git a/man/add_frosting.Rd b/man/add_frosting.Rd index d7d217777..4d77572a1 100644 --- a/man/add_frosting.Rd +++ b/man/add_frosting.Rd @@ -35,7 +35,9 @@ latest <- jhu \%>\% dplyr::filter(time_value >= max(time_value) - 14) # Add frosting to a workflow and predict -f <- frosting() \%>\% layer_predict() \%>\% layer_naomit(.pred) +f <- frosting() \%>\% + layer_predict() \%>\% + layer_naomit(.pred) wf1 <- wf \%>\% add_frosting(f) p1 <- predict(wf1, latest) p1 diff --git a/man/arx_fcast_epi_workflow.Rd b/man/arx_fcast_epi_workflow.Rd index fdd309959..7a6b66305 100644 --- a/man/arx_fcast_epi_workflow.Rd +++ b/man/arx_fcast_epi_workflow.Rd @@ -41,12 +41,16 @@ use \code{\link[=quantile_reg]{quantile_reg()}}) but can be omitted. jhu <- case_death_rate_subset \%>\% dplyr::filter(time_value >= as.Date("2021-12-01")) -arx_fcast_epi_workflow(jhu, "death_rate", - c("case_rate", "death_rate")) +arx_fcast_epi_workflow( + jhu, "death_rate", + c("case_rate", "death_rate") +) arx_fcast_epi_workflow(jhu, "death_rate", - c("case_rate", "death_rate"), trainer = quantile_reg(), - args_list = arx_args_list(levels = 1:9 / 10)) + c("case_rate", "death_rate"), + trainer = quantile_reg(), + args_list = arx_args_list(levels = 1:9 / 10) +) } \seealso{ \code{\link[=arx_forecaster]{arx_forecaster()}} diff --git a/man/arx_forecaster.Rd b/man/arx_forecaster.Rd index d4866aa0e..e121f272c 100644 --- a/man/arx_forecaster.Rd +++ b/man/arx_forecaster.Rd @@ -41,12 +41,16 @@ that it estimates a model for a particular target horizon. jhu <- case_death_rate_subset \%>\% dplyr::filter(time_value >= as.Date("2021-12-01")) -out <- arx_forecaster(jhu, "death_rate", - c("case_rate", "death_rate")) +out <- arx_forecaster( + jhu, "death_rate", + c("case_rate", "death_rate") +) out <- arx_forecaster(jhu, "death_rate", - c("case_rate", "death_rate"), trainer = quantile_reg(), - args_list = arx_args_list(levels = 1:9 / 10)) + c("case_rate", "death_rate"), + trainer = quantile_reg(), + args_list = arx_args_list(levels = 1:9 / 10) +) } \seealso{ \code{\link[=arx_fcast_epi_workflow]{arx_fcast_epi_workflow()}}, \code{\link[=arx_args_list]{arx_args_list()}} diff --git a/man/create_layer.Rd b/man/create_layer.Rd index 399d62efa..d36385fb2 100644 --- a/man/create_layer.Rd +++ b/man/create_layer.Rd @@ -20,9 +20,9 @@ fill in the name of the layer, and open the file. \examples{ \dontrun{ - # Note: running this will write `layer_strawberry.R` to - # the `R/` directory of your current project - create_layer("strawberry") +# Note: running this will write `layer_strawberry.R` to +# the `R/` directory of your current project +create_layer("strawberry") } } diff --git a/man/dist_quantiles.Rd b/man/dist_quantiles.Rd index 50f00dc32..739bae5a8 100644 --- a/man/dist_quantiles.Rd +++ b/man/dist_quantiles.Rd @@ -15,7 +15,7 @@ dist_quantiles(x, tau) A distribution parameterized by a set of quantiles } \examples{ -dstn <- dist_quantiles(list(1:4, 8:11), list(c(.2,.4,.6,.8))) +dstn <- dist_quantiles(list(1:4, 8:11), list(c(.2, .4, .6, .8))) quantile(dstn, p = c(.1, .25, .5, .9)) median(dstn) diff --git a/man/extrapolate_quantiles.Rd b/man/extrapolate_quantiles.Rd index 985d7cae8..cc6cb2c3c 100644 --- a/man/extrapolate_quantiles.Rd +++ b/man/extrapolate_quantiles.Rd @@ -24,12 +24,14 @@ library(distributional) dstn <- dist_normal(c(10, 2), c(5, 10)) extrapolate_quantiles(dstn, p = c(.25, 0.5, .75)) -dstn <- dist_quantiles(list(1:4, 8:11), list(c(.2,.4,.6,.8))) +dstn <- dist_quantiles(list(1:4, 8:11), list(c(.2, .4, .6, .8))) # because this distribution is already quantiles, any extra quantiles are # appended extrapolate_quantiles(dstn, p = c(.25, 0.5, .75)) -dstn <- c(dist_normal(c(10, 2), c(5, 10)), - dist_quantiles(list(1:4, 8:11), list(c(.2,.4,.6,.8)))) +dstn <- c( + dist_normal(c(10, 2), c(5, 10)), + dist_quantiles(list(1:4, 8:11), list(c(.2, .4, .6, .8))) +) extrapolate_quantiles(dstn, p = c(.25, 0.5, .75)) } diff --git a/man/fit-epi_workflow.Rd b/man/fit-epi_workflow.Rd index fb1c3af28..3dfa0029a 100644 --- a/man/fit-epi_workflow.Rd +++ b/man/fit-epi_workflow.Rd @@ -29,7 +29,7 @@ preprocessing the data and fitting the underlying parsnip model. } \examples{ jhu <- case_death_rate_subset \%>\% -filter(time_value > "2021-11-01", geo_value \%in\% c("ak", "ca", "ny")) + filter(time_value > "2021-11-01", geo_value \%in\% c("ak", "ca", "ny")) r <- epi_recipe(jhu) \%>\% step_epi_lag(death_rate, lag = c(0, 7, 14)) \%>\% diff --git a/man/flatline.Rd b/man/flatline.Rd index a396cfeb9..c353ff163 100644 --- a/man/flatline.Rd +++ b/man/flatline.Rd @@ -38,8 +38,10 @@ This is an internal function that is used to create a \code{\link[parsnip:linear model. It has somewhat odd behaviour (see below). } \examples{ -tib <- data.frame(y = runif(100), - expand.grid(k = letters[1:4], j = letters[5:9], time_value = 1:5)) \%>\% +tib <- data.frame( + y = runif(100), + expand.grid(k = letters[1:4], j = letters[5:9], time_value = 1:5) +) \%>\% dplyr::group_by(k, j) \%>\% dplyr::mutate(y2 = dplyr::lead(y, 2)) # predict 2 steps ahead flat <- flatline(y2 ~ j + k + y, tib) # predictions for 20 locations diff --git a/man/flatline_args_list.Rd b/man/flatline_args_list.Rd index 55d93c1db..dcae448f1 100644 --- a/man/flatline_args_list.Rd +++ b/man/flatline_args_list.Rd @@ -17,8 +17,12 @@ flatline_args_list( ) } \arguments{ -\item{ahead}{Integer. Number of time steps ahead (in days) of the forecast -date for which forecasts should be produced.} +\item{ahead}{Integer. Unlike \code{\link[=arx_forecaster]{arx_forecaster()}}, this doesn't have any effect +on the predicted values. Predictions are always the most recent observation. +However, this \emph{does} impact the residuals stored in the object. Residuals +are calculated based on this number to mimic how badly you would have done. +So for example, \code{ahead = 7} will create residuals by comparing values +7 days apart.} \item{n_training}{Integer. An upper limit for the number of rows per key that are used for training diff --git a/man/frosting.Rd b/man/frosting.Rd index 83a8d6a9d..362c40a4f 100644 --- a/man/frosting.Rd +++ b/man/frosting.Rd @@ -24,8 +24,8 @@ The arguments are currently placeholders and must be NULL \examples{ # Toy example to show that frosting can be created and added for postprocessing - f <- frosting() - wf <- epi_workflow() \%>\% add_frosting(f) +f <- frosting() +wf <- epi_workflow() \%>\% add_frosting(f) # A more realistic example jhu <- case_death_rate_subset \%>\% diff --git a/man/layer_add_forecast_date.Rd b/man/layer_add_forecast_date.Rd index 421978eb5..4e173d662 100644 --- a/man/layer_add_forecast_date.Rd +++ b/man/layer_add_forecast_date.Rd @@ -46,15 +46,17 @@ wf <- epi_workflow(r, parsnip::linear_reg()) \%>\% fit(jhu) latest <- jhu \%>\% dplyr::filter(time_value >= max(time_value) - 14) - # Don't specify `forecast_date` (by default, this should be last date in latest) -f <- frosting() \%>\% layer_predict() \%>\% - layer_naomit(.pred) +# Don't specify `forecast_date` (by default, this should be last date in latest) +f <- frosting() \%>\% + layer_predict() \%>\% + layer_naomit(.pred) wf0 <- wf \%>\% add_frosting(f) p0 <- predict(wf0, latest) p0 # Specify a `forecast_date` that is greater than or equal to `as_of` date -f <- frosting() \%>\% layer_predict() \%>\% +f <- frosting() \%>\% + layer_predict() \%>\% layer_add_forecast_date(forecast_date = "2022-05-31") \%>\% layer_naomit(.pred) wf1 <- wf \%>\% add_frosting(f) @@ -73,7 +75,7 @@ p2 <- predict(wf2, latest) p2 # Do not specify a forecast_date - f3 <- frosting() \%>\% +f3 <- frosting() \%>\% layer_predict() \%>\% layer_add_forecast_date() \%>\% layer_naomit(.pred) diff --git a/man/layer_add_target_date.Rd b/man/layer_add_target_date.Rd index 58ff7770f..3c2884e10 100644 --- a/man/layer_add_target_date.Rd +++ b/man/layer_add_target_date.Rd @@ -48,7 +48,8 @@ wf <- epi_workflow(r, parsnip::linear_reg()) \%>\% fit(jhu) latest <- get_test_data(r, jhu) # Use ahead + forecast date -f <- frosting() \%>\% layer_predict() \%>\% +f <- frosting() \%>\% + layer_predict() \%>\% layer_add_forecast_date(forecast_date = "2022-05-31") \%>\% layer_add_target_date() \%>\% layer_naomit(.pred) @@ -59,7 +60,8 @@ p # Use ahead + max time value from pre, fit, post # which is the same if include `layer_add_forecast_date()` -f2 <- frosting() \%>\% layer_predict() \%>\% +f2 <- frosting() \%>\% + layer_predict() \%>\% layer_add_target_date() \%>\% layer_naomit(.pred) wf2 <- wf \%>\% add_frosting(f2) diff --git a/man/layer_cdc_flatline_quantiles.Rd b/man/layer_cdc_flatline_quantiles.Rd new file mode 100644 index 000000000..c5bb33d3b --- /dev/null +++ b/man/layer_cdc_flatline_quantiles.Rd @@ -0,0 +1,125 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/layer_cdc_flatline_quantiles.R +\name{layer_cdc_flatline_quantiles} +\alias{layer_cdc_flatline_quantiles} +\title{CDC Flatline Forecast Quantiles} +\usage{ +layer_cdc_flatline_quantiles( + frosting, + ..., + aheads = 1:4, + quantiles = c(0.01, 0.025, 1:19/20, 0.975, 0.99), + nsims = 1000, + by_key = "geo_value", + symmetrize = FALSE, + nonneg = TRUE, + id = rand_id("cdc_baseline_bands") +) +} +\arguments{ +\item{frosting}{a \code{frosting} postprocessor} + +\item{...}{Unused, include for consistency with other layers.} + +\item{aheads}{Numeric vector of desired forecast horizons. These should be +given in the "units of the training data". So, for example, for data +typically observed daily (possibly with missing values), but +with weekly forecast targets, you would use \code{c(7, 14, 21, 28)}. But with +weekly data, you would use \code{1:4}.} + +\item{quantiles}{Numeric vector of probabilities with values in (0,1) +referring to the desired predictive intervals. The default is the standard +set for the COVID Forecast Hub.} + +\item{nsims}{Positive integer. The number of draws from the empirical CDF. +These samples are spaced evenly on the (0, 1) scale, F_X(x) resulting +in linear interpolation on the X scale. This is achieved with +\code{\link[stats:quantile]{stats::quantile()}} Type 7 (the default for that function).} + +\item{by_key}{A character vector of keys to group the residuals by before +calculating quantiles. The default, \code{c()} performs no grouping.} + +\item{symmetrize}{logical. If \code{TRUE} then interval will be symmetric.} + +\item{nonneg}{Logical. Force all predictive intervals be non-negative. +Because non-negativity is forced \emph{before} propagating forward, this +has slightly different behaviour than would occur if using +\code{\link[=layer_threshold_preds]{layer_threshold_preds()}}.} + +\item{id}{a random id string} +} +\value{ +an updated \code{frosting} postprocessor. Calling \code{\link[=predict]{predict()}} will result +in an additional \verb{} named \code{.pred_distn_all} containing 2-column +\code{\link[tibble:tibble]{tibble::tibble()}}'s. For each +desired combination of \code{key}'s, the tibble will contain one row per ahead +with the associated \code{\link[=dist_quantiles]{dist_quantiles()}}. +} +\description{ +This layer creates quantile forecasts by taking a sample from the +interpolated CDF of the flatline residuals, and shuffling them. +These are then added on to the point prediction. +} +\details{ +This layer is intended to be used in concert with \code{\link[=flatline]{flatline()}}. But it can +also be used with anything else. As long as residuals are available in the +the fitted model, this layer could be useful. Like +\code{\link[=layer_residual_quantiles]{layer_residual_quantiles()}} it only uses the residuals for the fitted model +object. However, it propagates these forward for \emph{all} aheads, by +iteratively shuffling them (randomly), and then adding them to the previous +set. This is in contrast to what happens with the \code{\link[=flatline_forecaster]{flatline_forecaster()}}. +When using \code{\link[=flatline]{flatline()}} as the underlying engine (here), both will result in the +same predictions (the most recent observed value), but that model calculates +separate residuals for each \code{ahead} by comparing to observations further into +the future. This version continues to use the same set of residuals, and +adds them on to produce wider intervals as \code{ahead} increases. +} +\examples{ +r <- epi_recipe(case_death_rate_subset) \%>\% + # data is "daily", so we fit this to 1 ahead, the result will contain + # 1 day ahead residuals + step_epi_ahead(death_rate, ahead = 1L, skip = TRUE) \%>\% + recipes::update_role(death_rate, new_role = "predictor") \%>\% + recipes::add_role(time_value, geo_value, new_role = "predictor") + +forecast_date <- max(case_death_rate_subset$time_value) + +latest <- get_test_data( + epi_recipe(case_death_rate_subset), case_death_rate_subset +) + +f <- frosting() \%>\% + layer_predict() \%>\% + layer_cdc_flatline_quantiles(aheads = c(7, 14, 21, 28), symmetrize = TRUE) + +eng <- parsnip::linear_reg() \%>\% parsnip::set_engine("flatline") + +wf <- epi_workflow(r, eng, f) \%>\% fit(case_death_rate_subset) +preds <- suppressWarnings(predict(wf, new_data = latest)) \%>\% + dplyr::select(-time_value) \%>\% + dplyr::mutate(forecast_date = forecast_date) +preds + +preds <- preds \%>\% + unnest(.pred_distn_all) \%>\% + pivot_quantiles(.pred_distn) \%>\% + mutate(target_date = forecast_date + ahead) + +library(ggplot2) +four_states <- c("ca", "pa", "wa", "ny") +preds \%>\% + filter(geo_value \%in\% four_states) \%>\% + ggplot(aes(target_date)) + + geom_ribbon(aes(ymin = `0.1`, ymax = `0.9`), fill = blues9[3]) + + geom_ribbon(aes(ymin = `0.25`, ymax = `0.75`), fill = blues9[6]) + + geom_line(aes(y = .pred), color = "orange") + + geom_line(data = case_death_rate_subset \%>\% filter(geo_value \%in\% four_states), + aes(x = time_value, y = death_rate)) + + scale_x_date(limits = c(forecast_date - 90, forecast_date + 30)) + + labs(x = "Date", y = "Death rate") + + facet_wrap(~geo_value, scales = "free_y") + + theme_bw() + + geom_vline(xintercept = forecast_date) + + +} diff --git a/man/layer_population_scaling.Rd b/man/layer_population_scaling.Rd index e841e9a50..179d6862c 100644 --- a/man/layer_population_scaling.Rd +++ b/man/layer_population_scaling.Rd @@ -78,13 +78,15 @@ jhu <- epiprocess::jhu_csse_daily_subset \%>\% dplyr::filter(time_value > "2021-11-01", geo_value \%in\% c("ca", "ny")) \%>\% dplyr::select(geo_value, time_value, cases) -pop_data = data.frame(states = c("ca", "ny"), value = c(20000, 30000)) +pop_data <- data.frame(states = c("ca", "ny"), value = c(20000, 30000)) r <- epi_recipe(jhu) \%>\% - step_population_scaling(df = pop_data, - df_pop_col = "value", - by = c("geo_value" = "states"), - cases, suffix = "_scaled") \%>\% + step_population_scaling( + df = pop_data, + df_pop_col = "value", + by = c("geo_value" = "states"), + cases, suffix = "_scaled" + ) \%>\% step_epi_lag(cases_scaled, lag = c(0, 7, 14)) \%>\% step_epi_ahead(cases_scaled, ahead = 7, role = "outcome") \%>\% step_epi_naomit() @@ -93,9 +95,11 @@ f <- frosting() \%>\% layer_predict() \%>\% layer_threshold(.pred) \%>\% layer_naomit(.pred) \%>\% - layer_population_scaling(.pred, df = pop_data, - by = c("geo_value" = "states"), - df_pop_col = "value") + layer_population_scaling(.pred, + df = pop_data, + by = c("geo_value" = "states"), + df_pop_col = "value" + ) wf <- epi_workflow(r, parsnip::linear_reg()) \%>\% fit(jhu) \%>\% @@ -104,9 +108,12 @@ wf <- epi_workflow(r, parsnip::linear_reg()) \%>\% latest <- get_test_data( recipe = r, x = epiprocess::jhu_csse_daily_subset \%>\% - dplyr::filter(time_value > "2021-11-01", - geo_value \%in\% c("ca", "ny")) \%>\% - dplyr::select(geo_value, time_value, cases)) + dplyr::filter( + time_value > "2021-11-01", + geo_value \%in\% c("ca", "ny") + ) \%>\% + dplyr::select(geo_value, time_value, cases) +) predict(wf, latest) } diff --git a/man/layer_predict.Rd b/man/layer_predict.Rd index 1326dfe75..03473053f 100644 --- a/man/layer_predict.Rd +++ b/man/layer_predict.Rd @@ -62,9 +62,9 @@ jhu <- case_death_rate_subset \%>\% filter(time_value > "2021-11-01", geo_value \%in\% c("ak", "ca", "ny")) r <- epi_recipe(jhu) \%>\% - step_epi_lag(death_rate, lag = c(0, 7, 14)) \%>\% - step_epi_ahead(death_rate, ahead = 7) \%>\% - step_epi_naomit() + step_epi_lag(death_rate, lag = c(0, 7, 14)) \%>\% + step_epi_ahead(death_rate, ahead = 7) \%>\% + step_epi_naomit() wf <- epi_workflow(r, parsnip::linear_reg()) \%>\% fit(jhu) latest <- jhu \%>\% filter(time_value >= max(time_value) - 14) diff --git a/man/nested_quantiles.Rd b/man/nested_quantiles.Rd index 1a2824041..c4b578c1a 100644 --- a/man/nested_quantiles.Rd +++ b/man/nested_quantiles.Rd @@ -16,8 +16,8 @@ a list-col Turn a vector of quantile distributions into a list-col } \examples{ -edf <- case_death_rate_subset[1:3,] -edf$q <- dist_quantiles(list(1:5, 2:4, 3:10), list(1:5/6, 2:4/5, 3:10/11)) +edf <- case_death_rate_subset[1:3, ] +edf$q <- dist_quantiles(list(1:5, 2:4, 3:10), list(1:5 / 6, 2:4 / 5, 3:10 / 11)) edf_nested <- edf \%>\% dplyr::mutate(q = nested_quantiles(q)) edf_nested \%>\% tidyr::unnest(q) diff --git a/man/smooth_quantile_reg.Rd b/man/smooth_quantile_reg.Rd index 293999876..6cc2dfc82 100644 --- a/man/smooth_quantile_reg.Rd +++ b/man/smooth_quantile_reg.Rd @@ -39,9 +39,10 @@ only supported engine is \code{\link[smoothqr:smooth_qr]{smoothqr::smooth_qr()}} tib <- data.frame( y1 = rnorm(100), y2 = rnorm(100), y3 = rnorm(100), y4 = rnorm(100), y5 = rnorm(100), y6 = rnorm(100), - x1 = rnorm(100), x2 = rnorm(100)) + x1 = rnorm(100), x2 = rnorm(100) +) qr_spec <- smooth_quantile_reg(tau = c(.2, .5, .8), outcome_locations = 1:6) -ff <- qr_spec \%>\% fit(cbind(y1, y2 , y3 , y4 , y5 , y6) ~ ., data = tib) +ff <- qr_spec \%>\% fit(cbind(y1, y2, y3, y4, y5, y6) ~ ., data = tib) p <- predict(ff, new_data = tib) x <- -99:99 / 100 * 2 * pi @@ -50,21 +51,23 @@ fd <- x[length(x) - 20] XY <- smoothqr::lagmat(y[1:(length(y) - 20)], c(-20:20)) XY <- tibble::as_tibble(XY) qr_spec <- smooth_quantile_reg(tau = c(.2, .5, .8), outcome_locations = 20:1) -tt <- qr_spec \%>\% fit_xy(x = XY[,21:41], y = XY[,1:20]) +tt <- qr_spec \%>\% fit_xy(x = XY[, 21:41], y = XY[, 1:20]) library(tidyr) library(dplyr) pl <- predict( - object = tt, - new_data = XY[max(which(complete.cases(XY[,21:41]))), 21:41] - ) + object = tt, + new_data = XY[max(which(complete.cases(XY[, 21:41]))), 21:41] +) pl <- pl \%>\% - unnest(.pred) \%>\% - mutate(distn = nested_quantiles(distn)) \%>\% - unnest(distn) \%>\% - mutate(x = x[length(x) - 20] + ahead / 100 * 2 * pi, - ahead = NULL) \%>\% - pivot_wider(names_from = tau, values_from = q) + unnest(.pred) \%>\% + mutate(distn = nested_quantiles(distn)) \%>\% + unnest(distn) \%>\% + mutate( + x = x[length(x) - 20] + ahead / 100 * 2 * pi, + ahead = NULL + ) \%>\% + pivot_wider(names_from = tau, values_from = q) plot(x, y, pch = 16, xlim = c(pi, 2 * pi), col = "lightgrey") curve(sin(x), add = TRUE) abline(v = fd, lty = 2) diff --git a/man/step_epi_shift.Rd b/man/step_epi_shift.Rd index ca8609b1e..bf135346e 100644 --- a/man/step_epi_shift.Rd +++ b/man/step_epi_shift.Rd @@ -90,7 +90,7 @@ are always set to \code{"ahead_"} and \code{"epi_ahead"} respectively, while for \examples{ r <- epi_recipe(case_death_rate_subset) \%>\% step_epi_ahead(death_rate, ahead = 7) \%>\% - step_epi_lag(death_rate, lag = c(0,7,14)) + step_epi_lag(death_rate, lag = c(0, 7, 14)) r } \seealso{ diff --git a/man/step_growth_rate.Rd b/man/step_growth_rate.Rd index 0449b887c..b409135b1 100644 --- a/man/step_growth_rate.Rd +++ b/man/step_growth_rate.Rd @@ -87,7 +87,9 @@ r <- epi_recipe(case_death_rate_subset) \%>\% step_growth_rate(case_rate, death_rate) r -r \%>\% recipes::prep() \%>\% recipes::bake(case_death_rate_subset) +r \%>\% + recipes::prep() \%>\% + recipes::bake(case_death_rate_subset) } \seealso{ Other row operation steps: diff --git a/man/step_lag_difference.Rd b/man/step_lag_difference.Rd index d69c25faa..b06abe43c 100644 --- a/man/step_lag_difference.Rd +++ b/man/step_lag_difference.Rd @@ -59,7 +59,9 @@ r <- epi_recipe(case_death_rate_subset) \%>\% step_lag_difference(case_rate, death_rate, horizon = c(7, 14)) r -r \%>\% recipes::prep() \%>\% recipes::bake(case_death_rate_subset) +r \%>\% + recipes::prep() \%>\% + recipes::bake(case_death_rate_subset) } \seealso{ Other row operation steps: diff --git a/man/step_population_scaling.Rd b/man/step_population_scaling.Rd index 2964c6912..1a9564563 100644 --- a/man/step_population_scaling.Rd +++ b/man/step_population_scaling.Rd @@ -104,13 +104,15 @@ jhu <- epiprocess::jhu_csse_daily_subset \%>\% dplyr::filter(time_value > "2021-11-01", geo_value \%in\% c("ca", "ny")) \%>\% dplyr::select(geo_value, time_value, cases) -pop_data = data.frame(states = c("ca", "ny"), value = c(20000, 30000)) +pop_data <- data.frame(states = c("ca", "ny"), value = c(20000, 30000)) r <- epi_recipe(jhu) \%>\% - step_population_scaling(df = pop_data, - df_pop_col = "value", - by = c("geo_value" = "states"), - cases, suffix = "_scaled") \%>\% + step_population_scaling( + df = pop_data, + df_pop_col = "value", + by = c("geo_value" = "states"), + cases, suffix = "_scaled" + ) \%>\% step_epi_lag(cases_scaled, lag = c(0, 7, 14)) \%>\% step_epi_ahead(cases_scaled, ahead = 7, role = "outcome") \%>\% step_epi_naomit() @@ -119,9 +121,11 @@ f <- frosting() \%>\% layer_predict() \%>\% layer_threshold(.pred) \%>\% layer_naomit(.pred) \%>\% - layer_population_scaling(.pred, df = pop_data, - by = c("geo_value" = "states"), - df_pop_col = "value") + layer_population_scaling(.pred, + df = pop_data, + by = c("geo_value" = "states"), + df_pop_col = "value" + ) wf <- epi_workflow(r, parsnip::linear_reg()) \%>\% fit(jhu) \%>\% @@ -130,8 +134,10 @@ wf <- epi_workflow(r, parsnip::linear_reg()) \%>\% latest <- get_test_data( recipe = r, epiprocess::jhu_csse_daily_subset \%>\% - dplyr::filter(time_value > "2021-11-01", - geo_value \%in\% c("ca", "ny")) \%>\% + dplyr::filter( + time_value > "2021-11-01", + geo_value \%in\% c("ca", "ny") + ) \%>\% dplyr::select(geo_value, time_value, cases) ) diff --git a/man/step_training_window.Rd b/man/step_training_window.Rd index 7861f27ea..ce7c0fc74 100644 --- a/man/step_training_window.Rd +++ b/man/step_training_window.Rd @@ -50,9 +50,12 @@ after any filtering step. tib <- tibble::tibble( x = 1:10, y = 1:10, - time_value = rep(seq(as.Date("2020-01-01"), by = 1, - length.out = 5), times = 2), - geo_value = rep(c("ca", "hi"), each = 5)) \%>\% + time_value = rep(seq(as.Date("2020-01-01"), + by = 1, + length.out = 5 + ), times = 2), + geo_value = rep(c("ca", "hi"), each = 5) +) \%>\% as_epi_df() epi_recipe(y ~ x, data = tib) \%>\% From 237ec50ddd74d577184f68cad39a794e91468110 Mon Sep 17 00:00:00 2001 From: "Daniel J. McDonald" Date: Sun, 24 Sep 2023 16:42:59 -0700 Subject: [PATCH 07/22] run styler --- R/layer_cdc_flatline_quantiles.R | 24 +++++++++++++--------- R/step_growth_rate.R | 27 ++++++++++++------------- R/step_lag_difference.R | 19 +++++++++-------- tests/testthat/test-propogate_samples.R | 1 - 4 files changed, 36 insertions(+), 35 deletions(-) diff --git a/R/layer_cdc_flatline_quantiles.R b/R/layer_cdc_flatline_quantiles.R index afee37577..1881b8523 100644 --- a/R/layer_cdc_flatline_quantiles.R +++ b/R/layer_cdc_flatline_quantiles.R @@ -83,15 +83,16 @@ #' geom_ribbon(aes(ymin = `0.1`, ymax = `0.9`), fill = blues9[3]) + #' geom_ribbon(aes(ymin = `0.25`, ymax = `0.75`), fill = blues9[6]) + #' geom_line(aes(y = .pred), color = "orange") + -#' geom_line(data = case_death_rate_subset %>% filter(geo_value %in% four_states), -#' aes(x = time_value, y = death_rate)) + +#' geom_line( +#' data = case_death_rate_subset %>% filter(geo_value %in% four_states), +#' aes(x = time_value, y = death_rate) +#' ) + #' scale_x_date(limits = c(forecast_date - 90, forecast_date + 30)) + #' labs(x = "Date", y = "Death rate") + #' facet_wrap(~geo_value, scales = "free_y") + #' theme_bw() + #' geom_vline(xintercept = forecast_date) #' -#' layer_cdc_flatline_quantiles <- function( frosting, ..., @@ -102,7 +103,6 @@ layer_cdc_flatline_quantiles <- function( symmetrize = FALSE, nonneg = TRUE, id = rand_id("cdc_baseline_bands")) { - rlang::check_dots_empty() arg_is_int(aheads) @@ -134,8 +134,7 @@ layer_cdc_flatline_quantiles_new <- function( by_key, symmetrize, nonneg, - id -) { + id) { layer( "cdc_flatline_quantiles", aheads = aheads, @@ -154,8 +153,10 @@ slather.layer_cdc_flatline_quantiles <- the_fit <- workflows::extract_fit_parsnip(workflow) if (!inherits(the_fit, "_flatline")) { cli::cli_warn( - c("Predictions for this workflow were not produced by the {.cls flatline}", - "{.pkg parsnip} engine. Results may be unexpected. See {.fn epipredict::flatline}.") + c( + "Predictions for this workflow were not produced by the {.cls flatline}", + "{.pkg parsnip} engine. Results may be unexpected. See {.fn epipredict::flatline}." + ) ) } p <- components$predictions @@ -240,8 +241,11 @@ propogate_samples <- function( for (iter in 2:max_ahead) { samp <- shuffle(samp) raw <- raw + samp - if (symmetrize) symmetric <- raw - (median(raw) - p) - else symmetric <- raw + if (symmetrize) { + symmetric <- raw - (median(raw) - p) + } else { + symmetric <- raw + } if (nonneg) symmetric <- pmax(0, symmetric) res[[iter]] <- symmetric } diff --git a/R/step_growth_rate.R b/R/step_growth_rate.R index f6ad29a5b..74cfff284 100644 --- a/R/step_growth_rate.R +++ b/R/step_growth_rate.R @@ -42,20 +42,19 @@ #' recipes::prep() %>% #' recipes::bake(case_death_rate_subset) step_growth_rate <- - function( - recipe, - ..., - role = "predictor", - trained = FALSE, - horizon = 7, - method = c("rel_change", "linear_reg", "smooth_spline", "trend_filter"), - log_scale = FALSE, - replace_Inf = NA, - prefix = "gr_", - columns = NULL, - skip = FALSE, - id = rand_id("growth_rate"), - additional_gr_args_list = list()) { + function(recipe, + ..., + role = "predictor", + trained = FALSE, + horizon = 7, + method = c("rel_change", "linear_reg", "smooth_spline", "trend_filter"), + log_scale = FALSE, + replace_Inf = NA, + prefix = "gr_", + columns = NULL, + skip = FALSE, + id = rand_id("growth_rate"), + additional_gr_args_list = list()) { if (!is_epi_recipe(recipe)) { rlang::abort("This recipe step can only operate on an `epi_recipe`.") } diff --git a/R/step_lag_difference.R b/R/step_lag_difference.R index 2482be46a..21878eaa7 100644 --- a/R/step_lag_difference.R +++ b/R/step_lag_difference.R @@ -23,16 +23,15 @@ #' recipes::prep() %>% #' recipes::bake(case_death_rate_subset) step_lag_difference <- - function( - recipe, - ..., - role = "predictor", - trained = FALSE, - horizon = 7, - prefix = "lag_diff_", - columns = NULL, - skip = FALSE, - id = rand_id("lag_diff")) { + function(recipe, + ..., + role = "predictor", + trained = FALSE, + horizon = 7, + prefix = "lag_diff_", + columns = NULL, + skip = FALSE, + id = rand_id("lag_diff")) { if (!is_epi_recipe(recipe)) { rlang::abort("This recipe step can only operate on an `epi_recipe`.") } diff --git a/tests/testthat/test-propogate_samples.R b/tests/testthat/test-propogate_samples.R index 3b02404b6..b8a1ff82d 100644 --- a/tests/testthat/test-propogate_samples.R +++ b/tests/testthat/test-propogate_samples.R @@ -4,5 +4,4 @@ test_that("propogate_samples", { quantiles <- 1:9 / 10 aheads <- c(2, 4, 7) nsim <- 100 - }) From c13b83eb1371ebd0668ded2d6eb0495b17c4dd9e Mon Sep 17 00:00:00 2001 From: "Daniel J. McDonald" Date: Sun, 24 Sep 2023 16:43:26 -0700 Subject: [PATCH 08/22] redocument after styling --- man/layer_cdc_flatline_quantiles.Rd | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/man/layer_cdc_flatline_quantiles.Rd b/man/layer_cdc_flatline_quantiles.Rd index c5bb33d3b..4f151e854 100644 --- a/man/layer_cdc_flatline_quantiles.Rd +++ b/man/layer_cdc_flatline_quantiles.Rd @@ -113,13 +113,14 @@ preds \%>\% geom_ribbon(aes(ymin = `0.1`, ymax = `0.9`), fill = blues9[3]) + geom_ribbon(aes(ymin = `0.25`, ymax = `0.75`), fill = blues9[6]) + geom_line(aes(y = .pred), color = "orange") + - geom_line(data = case_death_rate_subset \%>\% filter(geo_value \%in\% four_states), - aes(x = time_value, y = death_rate)) + + geom_line( + data = case_death_rate_subset \%>\% filter(geo_value \%in\% four_states), + aes(x = time_value, y = death_rate) + ) + scale_x_date(limits = c(forecast_date - 90, forecast_date + 30)) + labs(x = "Date", y = "Death rate") + facet_wrap(~geo_value, scales = "free_y") + theme_bw() + geom_vline(xintercept = forecast_date) - } From 16f6c2c5da3fb906b0152348c29a328ada0ace55 Mon Sep 17 00:00:00 2001 From: "Daniel J. McDonald" Date: Mon, 25 Sep 2023 10:04:36 -0700 Subject: [PATCH 09/22] example plotting with ggplot2 handled correctly --- R/layer_cdc_flatline_quantiles.R | 4 ++-- R/make_smooth_quantile_reg.R | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/R/layer_cdc_flatline_quantiles.R b/R/layer_cdc_flatline_quantiles.R index 1881b8523..7ff224359 100644 --- a/R/layer_cdc_flatline_quantiles.R +++ b/R/layer_cdc_flatline_quantiles.R @@ -75,7 +75,7 @@ #' pivot_quantiles(.pred_distn) %>% #' mutate(target_date = forecast_date + ahead) #' -#' library(ggplot2) +#' if (require("ggplot2")) { #' four_states <- c("ca", "pa", "wa", "ny") #' preds %>% #' filter(geo_value %in% four_states) %>% @@ -92,7 +92,7 @@ #' facet_wrap(~geo_value, scales = "free_y") + #' theme_bw() + #' geom_vline(xintercept = forecast_date) -#' +#' } layer_cdc_flatline_quantiles <- function( frosting, ..., diff --git a/R/make_smooth_quantile_reg.R b/R/make_smooth_quantile_reg.R index 6eab2a132..cfb08a9c7 100644 --- a/R/make_smooth_quantile_reg.R +++ b/R/make_smooth_quantile_reg.R @@ -61,7 +61,8 @@ #' lines(pl$x, pl$`0.2`, col = "blue") #' lines(pl$x, pl$`0.8`, col = "blue") #' lines(pl$x, pl$`0.5`, col = "red") -#' \dontrun{ +#' +#' if (require("ggplot2")) { #' ggplot(data.frame(x = x, y = y), aes(x)) + #' geom_ribbon(data = pl, aes(ymin = `0.2`, ymax = `0.8`), fill = "lightblue") + #' geom_point(aes(y = y), colour = "grey") + # observed data From c9b4667ccecf9d36de71383ef94fda0fa84ba996 Mon Sep 17 00:00:00 2001 From: "Daniel J. McDonald" Date: Wed, 4 Oct 2023 12:30:41 -0700 Subject: [PATCH 10/22] working cdc baseline --- NAMESPACE | 3 + R/cdc_baseline_forecaster.R | 228 ++++++++++++++++++++++++++++ R/layer_cdc_flatline_quantiles.R | 17 ++- man/cdc_baseline_args_list.Rd | 85 +++++++++++ man/cdc_baseline_forecaster.Rd | 73 +++++++++ man/layer_cdc_flatline_quantiles.Rd | 14 +- man/smooth_quantile_reg.Rd | 3 +- tests/testthat/test-parse_period.R | 12 ++ 8 files changed, 419 insertions(+), 16 deletions(-) create mode 100644 R/cdc_baseline_forecaster.R create mode 100644 man/cdc_baseline_args_list.Rd create mode 100644 man/cdc_baseline_forecaster.Rd create mode 100644 tests/testthat/test-parse_period.R diff --git a/NAMESPACE b/NAMESPACE index e361dec00..d7e941b0f 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -52,6 +52,7 @@ S3method(print,alist) S3method(print,arx_class) S3method(print,arx_fcast) S3method(print,canned_epipred) +S3method(print,cdc_baseline_fcast) S3method(print,epi_workflow) S3method(print,flat_fcast) S3method(print,flatline) @@ -107,6 +108,8 @@ export(arx_classifier) export(arx_fcast_epi_workflow) export(arx_forecaster) export(bake) +export(cdc_baseline_args_list) +export(cdc_baseline_forecaster) export(create_layer) export(default_epi_recipe_blueprint) export(detect_layer) diff --git a/R/cdc_baseline_forecaster.R b/R/cdc_baseline_forecaster.R new file mode 100644 index 000000000..f5961431d --- /dev/null +++ b/R/cdc_baseline_forecaster.R @@ -0,0 +1,228 @@ +#' Predict the future with the most recent value +#' +#' This is a simple forecasting model for +#' [epiprocess::epi_df] data. It uses the most recent observation as the +#' forecast for any future date, and produces intervals by shuffling the quantiles +#' of the residuals of such a "flatline" forecast and incrementing these +#' forward over all available training data. +#' +#' By default, the predictive intervals are computed separately for each +#' combination of `geo_value` in the `epi_data` argument. +#' +#' This forecaster is meant to produce exactly the CDC Baseline used for +#' [COVID19ForecastHub](https://covid19forecasthub.org) +#' +#' @param epi_data An [epiprocess::epi_df] +#' @param outcome A scalar character for the column name we wish to predict. +#' @param args_list A list of additional arguments as created by the +#' [cdc_baseline_args_list()] constructor function. +#' +#' @return A data frame of point and interval forecasts at for all +#' aheads (unique horizons) for each unique combination of `key_vars`. +#' @export +#' +#' @examples +#' library(dplyr) +#' weekly_deaths <- case_death_rate_subset %>% +#' select(geo_value, time_value, death_rate) %>% +#' left_join(state_census %>% select(pop, abbr), by = c("geo_value" = "abbr")) %>% +#' mutate(deaths = pmax(death_rate / 1e5 * pop, 0)) %>% +#' select(-pop, -death_rate) %>% +#' group_by(geo_value) %>% +#' epi_slide(~ sum(.$deaths), before = 6, new_col_name = "deaths") %>% +#' ungroup() %>% +#' filter(weekdays(time_value) == "Saturday") +#' +#' cdc <- cdc_baseline_forecaster(deaths, "deaths") +#' preds <- pivot_quantiles(cdc$predictions, .pred_distn) +#' +#' if (require(ggplot2)) { +#' forecast_date <- unique(preds$forecast_date) +#' four_states <- c("ca", "pa", "wa", "ny") +#' preds %>% +#' filter(geo_value %in% four_states) %>% +#' ggplot(aes(target_date)) + +#' geom_ribbon(aes(ymin = `0.1`, ymax = `0.9`), fill = blues9[3]) + +#' geom_ribbon(aes(ymin = `0.25`, ymax = `0.75`), fill = blues9[6]) + +#' geom_line(aes(y = .pred), color = "orange") + +#' geom_line( +#' data = deaths %>% filter(geo_value %in% four_states), +#' aes(x = time_value, y = deaths) +#' ) + +#' scale_x_date(limits = c(forecast_date - 90, forecast_date + 30)) + +#' labs(x = "Date", y = "Weekly deaths") + +#' facet_wrap(~geo_value, scales = "free_y") + +#' theme_bw() + +#' geom_vline(xintercept = forecast_date) +#' } +cdc_baseline_forecaster <- function( + epi_data, + outcome, + args_list = cdc_baseline_args_list()) { + validate_forecaster_inputs(epi_data, outcome, "time_value") + if (!inherits(args_list, c("cdc_flat_fcast", "alist"))) { + cli_stop("args_list was not created using `cdc_baseline_args_list().") + } + keys <- epi_keys(epi_data) + ek <- kill_time_value(keys) + outcome <- rlang::sym(outcome) + + + r <- epi_recipe(epi_data) %>% + step_epi_ahead(!!outcome, ahead = args_list$data_frequency, skip = TRUE) %>% + recipes::update_role(!!outcome, new_role = "predictor") %>% + recipes::add_role(tidyselect::all_of(keys), new_role = "predictor") %>% + step_training_window(n_recent = args_list$n_training) + + forecast_date <- args_list$forecast_date %||% max(epi_data$time_value) + # target_date <- args_list$target_date %||% forecast_date + args_list$ahead + + + latest <- get_test_data( + epi_recipe(epi_data), epi_data, TRUE, args_list$nafill_buffer, + forecast_date + ) + + f <- frosting() %>% + layer_predict() %>% + layer_cdc_flatline_quantiles( + aheads = args_list$aheads, + quantile_levels = args_list$quantile_levels, + nsims = args_list$nsims, + by_key = args_list$quantile_by_key, + symmetrize = args_list$symmetrize, + nonneg = args_list$nonneg + ) %>% + layer_add_forecast_date(forecast_date = forecast_date) %>% + layer_unnest(.pred_distn_all) + # layer_add_target_date(target_date = target_date) + if (args_list$nonneg) f <- layer_threshold(f, ".pred") + + eng <- parsnip::linear_reg() %>% parsnip::set_engine("flatline") + + wf <- epi_workflow(r, eng, f) + wf <- generics::fit(wf, epi_data) + preds <- suppressWarnings(predict(wf, new_data = latest)) %>% + tibble::as_tibble() %>% + dplyr::select(-time_value) %>% + dplyr::mutate(target_date = forecast_date + ahead * args_list$data_frequency) + + structure( + list( + predictions = preds, + epi_workflow = wf, + metadata = list( + training = attr(epi_data, "metadata"), + forecast_created = Sys.time() + ) + ), + class = c("cdc_baseline_fcast", "canned_epipred") + ) +} + + + +#' CDC baseline forecaster argument constructor +#' +#' Constructs a list of arguments for [cdc_baseline_forecaster()]. +#' +#' @inheritParams arx_args_list +#' @param data_frequency Integer or string. This describes the frequency of the +#' input `epi_df`. For typical FluSight forecasts, this would be `"1 week"`. +#' Allowable arguments are integers (taken to mean numbers of days) or a +#' string like `"7 days"` or `"2 weeks"`. Currently, all other periods +#' (other than days or weeks) result in an error. +#' @param aheads Integer vector. Unlike [arx_forecaster()], this doesn't have +#' any effect on the predicted values. +#' Predictions are always the most recent observation. This determines the +#' set of prediction horizons for [layer_cdc_flatline_quantiles()]`. It interacts +#' with the `data_frequency` argument. So, for example, if the data is daily +#' and you want forecasts for 1:4 days ahead, then you would use `1:4`. However, +#' if you want one-week predictions, you would set this as `c(7, 14, 21, 28)`. +#' But if `data_frequency` is `"1 week"`, then you would set it as `1:4`. +#' @param quantile_levels Vector or `NULL`. A vector of probabilities to produce +#' prediction intervals. These are created by computing the quantiles of +#' training residuals. A `NULL` value will result in point forecasts only. +#' @param nsims Positive integer. The number of draws from the empirical CDF. +#' These samples are spaced evenly on the (0, 1) scale, F_X(x) resulting +#' in linear interpolation on the X scale. This is achieved with +#' [stats::quantile()] Type 7 (the default for that function). +#' @param nonneg Logical. Force all predictive intervals be non-negative. +#' Because non-negativity is forced _before_ propagating forward, this +#' has slightly different behaviour than would occur if using +#' [layer_threshold_preds()]. +#' +#' @return A list containing updated parameter choices with class `cdc_flat_fcast`. +#' @export +#' +#' @examples +#' cdc_baseline_args_list() +#' cdc_baseline_args_list(symmetrize = FALSE) +#' cdc_baseline_args_list(levels = c(.1, .3, .7, .9), n_training = 120) +cdc_baseline_args_list <- function( + data_frequency = "1 week", + aheads = 1:4, + n_training = Inf, + forecast_date = NULL, + quantile_levels = c(.01, .025, 1:19 / 20, .975, .99), + nsims = 1e3L, + symmetrize = TRUE, + nonneg = TRUE, + quantile_by_key = "geo_value", + nafill_buffer = Inf) { + arg_is_scalar(n_training, nsims, data_frequency) + data_frequency <- parse_period(data_frequency) + arg_is_pos_int(data_frequency) + arg_is_chr(quantile_by_key, allow_empty = TRUE) + arg_is_scalar(forecast_date, allow_null = TRUE) + arg_is_date(forecast_date, allow_null = TRUE) + arg_is_nonneg_int(aheads, nsims) + arg_is_lgl(symmetrize, nonneg) + arg_is_probabilities(quantile_levels, allow_null = TRUE) + arg_is_pos(n_training) + if (is.finite(n_training)) arg_is_pos_int(n_training) + if (is.finite(nafill_buffer)) arg_is_pos_int(nafill_buffer, allow_null = TRUE) + + structure( + enlist( + data_frequency, + aheads, + n_training, + forecast_date, + quantile_levels, + nsims, + symmetrize, + nonneg, + quantile_by_key, + nafill_buffer + ), + class = c("cdc_baseline_fcast", "alist") + ) +} + +#' @export +print.cdc_baseline_fcast <- function(x, ...) { + name <- "CDC Baseline" + NextMethod(name = name, ...) +} + +parse_period <- function(x) { + arg_is_scalar(x) + if (is.character(x)) { + x <- unlist(strsplit(x, " ")) + if (length(x) == 1L) x <- as.numeric(x) + if (length(x) == 2L) { + mult <- substr(x[2], 1, 3) + mult <- switch( + mult, + day = 1L, + wee = 7L, + cli::cli_abort("incompatible timespan in `aheads`.") + ) + x <- as.numeric(x[1]) * mult + } + if (length(x) > 2L) cli::cli_abort("incompatible timespan in `aheads`.") + } + stopifnot(rlang::is_integerish(x)) + as.integer(x) +} diff --git a/R/layer_cdc_flatline_quantiles.R b/R/layer_cdc_flatline_quantiles.R index 7ff224359..aa953ad45 100644 --- a/R/layer_cdc_flatline_quantiles.R +++ b/R/layer_cdc_flatline_quantiles.R @@ -97,7 +97,7 @@ layer_cdc_flatline_quantiles <- function( frosting, ..., aheads = 1:4, - quantiles = c(.01, .025, 1:19 / 20, .975, .99), + quantile_levels = c(.01, .025, 1:19 / 20, .975, .99), nsims = 1e3, by_key = "geo_value", symmetrize = FALSE, @@ -106,7 +106,7 @@ layer_cdc_flatline_quantiles <- function( rlang::check_dots_empty() arg_is_int(aheads) - arg_is_probabilities(quantiles) + arg_is_probabilities(quantile_levels, allow_null = TRUE) arg_is_pos_int(nsims) arg_is_scalar(nsims) arg_is_chr_scalar(id) @@ -117,7 +117,7 @@ layer_cdc_flatline_quantiles <- function( frosting, layer_cdc_flatline_quantiles_new( aheads = aheads, - quantiles = quantiles, + quantile_levels = quantile_levels, nsims = nsims, by_key = by_key, symmetrize = symmetrize, @@ -129,7 +129,7 @@ layer_cdc_flatline_quantiles <- function( layer_cdc_flatline_quantiles_new <- function( aheads, - quantiles, + quantile_levels, nsims, by_key, symmetrize, @@ -138,7 +138,7 @@ layer_cdc_flatline_quantiles_new <- function( layer( "cdc_flatline_quantiles", aheads = aheads, - quantiles = quantiles, + quantile_levels = quantile_levels, nsims = nsims, by_key = by_key, symmetrize = symmetrize, @@ -150,6 +150,7 @@ layer_cdc_flatline_quantiles_new <- function( #' @export slather.layer_cdc_flatline_quantiles <- function(object, components, workflow, new_data, ...) { + if (is.null(object$quantile_levels)) return(components) the_fit <- workflows::extract_fit_parsnip(workflow) if (!inherits(the_fit, "_flatline")) { cli::cli_warn( @@ -213,7 +214,7 @@ slather.layer_cdc_flatline_quantiles <- dplyr::rowwise() %>% dplyr::mutate( .pred_distn_all = propogate_samples( - .resid, .pred, object$quantiles, + .resid, .pred, object$quantile_levels, object$aheads, object$nsim, object$symmetrize, object$nonneg ) ) %>% @@ -229,7 +230,7 @@ slather.layer_cdc_flatline_quantiles <- } propogate_samples <- function( - r, p, quantiles, aheads, nsim, symmetrize, nonneg) { + r, p, quantile_levels, aheads, nsim, symmetrize, nonneg) { max_ahead <- max(aheads) samp <- quantile(r, probs = c(0, seq_len(nsim - 1)) / (nsim - 1), na.rm = TRUE) res <- list() @@ -254,7 +255,7 @@ propogate_samples <- function( list(tibble::tibble( ahead = aheads, .pred_distn = map_vec( - res, ~ dist_quantiles(quantile(.x, quantiles), tau = quantiles) + res, ~ dist_quantiles(quantile(.x, quantile_levels), quantile_levels) ) )) } diff --git a/man/cdc_baseline_args_list.Rd b/man/cdc_baseline_args_list.Rd new file mode 100644 index 000000000..37c326e9c --- /dev/null +++ b/man/cdc_baseline_args_list.Rd @@ -0,0 +1,85 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/cdc_baseline_forecaster.R +\name{cdc_baseline_args_list} +\alias{cdc_baseline_args_list} +\title{CDC baseline forecaster argument constructor} +\usage{ +cdc_baseline_args_list( + data_frequency = "1 week", + aheads = 1:4, + n_training = Inf, + forecast_date = NULL, + quantile_levels = c(0.01, 0.025, 1:19/20, 0.975, 0.99), + nsims = 1000L, + symmetrize = TRUE, + nonneg = TRUE, + quantile_by_key = "geo_value", + nafill_buffer = Inf +) +} +\arguments{ +\item{data_frequency}{Integer or string. This describes the frequency of the +input \code{epi_df}. For typical FluSight forecasts, this would be \code{"1 week"}. +Allowable arguments are integers (taken to mean numbers of days) or a +string like \code{"7 days"} or \code{"2 weeks"}. Currently, all other periods +(other than days or weeks) result in an error.} + +\item{aheads}{Integer vector. Unlike \code{\link[=arx_forecaster]{arx_forecaster()}}, this doesn't have +any effect on the predicted values. +Predictions are always the most recent observation. This determines the +set of prediction horizons for \code{\link[=layer_cdc_flatline_quantiles]{layer_cdc_flatline_quantiles()}}\verb{. It interacts with the }data_frequency\verb{argument. So, for example, if the data is daily and you want forecasts for 1:4 days ahead, then you would use}1:4\verb{. However, if you want one-week predictions, you would set this as }c(7, 14, 21, 28)\verb{. But if }data_frequency\code{is}"1 week"\verb{, then you would set it as }1:4`.} + +\item{n_training}{Integer. An upper limit for the number of rows per +key that are used for training +(in the time unit of the \code{epi_df}).} + +\item{forecast_date}{Date. The date on which the forecast is created. +The default \code{NULL} will attempt to determine this automatically.} + +\item{quantile_levels}{Vector or \code{NULL}. A vector of probabilities to produce +prediction intervals. These are created by computing the quantiles of +training residuals. A \code{NULL} value will result in point forecasts only.} + +\item{nsims}{Positive integer. The number of draws from the empirical CDF. +These samples are spaced evenly on the (0, 1) scale, F_X(x) resulting +in linear interpolation on the X scale. This is achieved with +\code{\link[stats:quantile]{stats::quantile()}} Type 7 (the default for that function).} + +\item{symmetrize}{Logical. The default \code{TRUE} calculates +symmetric prediction intervals. This argument only applies when +residual quantiles are used. It is not applicable with +\code{trainer = quantile_reg()}, for example.} + +\item{nonneg}{Logical. Force all predictive intervals be non-negative. +Because non-negativity is forced \emph{before} propagating forward, this +has slightly different behaviour than would occur if using +\code{\link[=layer_threshold_preds]{layer_threshold_preds()}}.} + +\item{quantile_by_key}{Character vector. Groups residuals by listed keys +before calculating residual quantiles. See the \code{by_key} argument to +\code{\link[=layer_residual_quantiles]{layer_residual_quantiles()}} for more information. The default, +\code{character(0)} performs no grouping. This argument only applies when +residual quantiles are used. It is not applicable with +\code{trainer = quantile_reg()}, for example.} + +\item{nafill_buffer}{At predict time, recent values of the training data +are used to create a forecast. However, these can be \code{NA} due to, e.g., +data latency issues. By default, any missing values will get filled with +less recent data. Setting this value to \code{NULL} will result in 1 extra +recent row (beyond those required for lag creation) to be used. Note that +we require at least \code{min(lags)} rows of recent data per \code{geo_value} to +create a prediction. For this reason, setting \code{nafill_buffer < min(lags)} +will be treated as \emph{additional} allowed recent data rather than the +total amount of recent data to examine.} +} +\value{ +A list containing updated parameter choices with class \code{cdc_flat_fcast}. +} +\description{ +Constructs a list of arguments for \code{\link[=cdc_baseline_forecaster]{cdc_baseline_forecaster()}}. +} +\examples{ +cdc_baseline_args_list() +cdc_baseline_args_list(symmetrize = FALSE) +cdc_baseline_args_list(levels = c(.1, .3, .7, .9), n_training = 120) +} diff --git a/man/cdc_baseline_forecaster.Rd b/man/cdc_baseline_forecaster.Rd new file mode 100644 index 000000000..8d0e1b3ec --- /dev/null +++ b/man/cdc_baseline_forecaster.Rd @@ -0,0 +1,73 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/cdc_baseline_forecaster.R +\name{cdc_baseline_forecaster} +\alias{cdc_baseline_forecaster} +\title{Predict the future with the most recent value} +\usage{ +cdc_baseline_forecaster( + epi_data, + outcome, + args_list = cdc_baseline_args_list() +) +} +\arguments{ +\item{epi_data}{An \link[epiprocess:epi_df]{epiprocess::epi_df}} + +\item{outcome}{A scalar character for the column name we wish to predict.} + +\item{args_list}{A list of additional arguments as created by the +\code{\link[=cdc_baseline_args_list]{cdc_baseline_args_list()}} constructor function.} +} +\value{ +A data frame of point and interval forecasts at for all +aheads (unique horizons) for each unique combination of \code{key_vars}. +} +\description{ +This is a simple forecasting model for +\link[epiprocess:epi_df]{epiprocess::epi_df} data. It uses the most recent observation as the +forecast for any future date, and produces intervals by shuffling the quantiles +of the residuals of such a "flatline" forecast and incrementing these +forward over all available training data. +} +\details{ +By default, the predictive intervals are computed separately for each +combination of \code{geo_value} in the \code{epi_data} argument. + +This forecaster is meant to produce exactly the CDC Baseline used for +\href{https://covid19forecasthub.org}{COVID19ForecastHub} +} +\examples{ +library(dplyr) +weekly_deaths <- case_death_rate_subset \%>\% + select(geo_value, time_value, death_rate) \%>\% + left_join(state_census \%>\% select(pop, abbr), by = c("geo_value" = "abbr")) \%>\% + mutate(deaths = pmax(death_rate / 1e5 * pop, 0)) \%>\% + select(-pop, -death_rate) \%>\% + group_by(geo_value) \%>\% + epi_slide(~ sum(.$deaths), before = 6, new_col_name = "deaths") \%>\% + ungroup() \%>\% + filter(weekdays(time_value) == "Saturday") + +cdc <- cdc_baseline_forecaster(deaths, "deaths") +preds <- pivot_quantiles(cdc$predictions, .pred_distn) + +if (require(ggplot2)) { +forecast_date <- unique(preds$forecast_date) +four_states <- c("ca", "pa", "wa", "ny") +preds \%>\% + filter(geo_value \%in\% four_states) \%>\% + ggplot(aes(target_date)) + + geom_ribbon(aes(ymin = `0.1`, ymax = `0.9`), fill = blues9[3]) + + geom_ribbon(aes(ymin = `0.25`, ymax = `0.75`), fill = blues9[6]) + + geom_line(aes(y = .pred), color = "orange") + + geom_line( + data = deaths \%>\% filter(geo_value \%in\% four_states), + aes(x = time_value, y = deaths) + ) + + scale_x_date(limits = c(forecast_date - 90, forecast_date + 30)) + + labs(x = "Date", y = "Weekly deaths") + + facet_wrap(~geo_value, scales = "free_y") + + theme_bw() + + geom_vline(xintercept = forecast_date) +} +} diff --git a/man/layer_cdc_flatline_quantiles.Rd b/man/layer_cdc_flatline_quantiles.Rd index 4f151e854..8eb42b423 100644 --- a/man/layer_cdc_flatline_quantiles.Rd +++ b/man/layer_cdc_flatline_quantiles.Rd @@ -8,7 +8,7 @@ layer_cdc_flatline_quantiles( frosting, ..., aheads = 1:4, - quantiles = c(0.01, 0.025, 1:19/20, 0.975, 0.99), + quantile_levels = c(0.01, 0.025, 1:19/20, 0.975, 0.99), nsims = 1000, by_key = "geo_value", symmetrize = FALSE, @@ -27,10 +27,6 @@ typically observed daily (possibly with missing values), but with weekly forecast targets, you would use \code{c(7, 14, 21, 28)}. But with weekly data, you would use \code{1:4}.} -\item{quantiles}{Numeric vector of probabilities with values in (0,1) -referring to the desired predictive intervals. The default is the standard -set for the COVID Forecast Hub.} - \item{nsims}{Positive integer. The number of draws from the empirical CDF. These samples are spaced evenly on the (0, 1) scale, F_X(x) resulting in linear interpolation on the X scale. This is achieved with @@ -47,6 +43,10 @@ has slightly different behaviour than would occur if using \code{\link[=layer_threshold_preds]{layer_threshold_preds()}}.} \item{id}{a random id string} + +\item{quantiles}{Numeric vector of probabilities with values in (0,1) +referring to the desired predictive intervals. The default is the standard +set for the COVID Forecast Hub.} } \value{ an updated \code{frosting} postprocessor. Calling \code{\link[=predict]{predict()}} will result @@ -105,7 +105,7 @@ preds <- preds \%>\% pivot_quantiles(.pred_distn) \%>\% mutate(target_date = forecast_date + ahead) -library(ggplot2) +if (require("ggplot2")) { four_states <- c("ca", "pa", "wa", "ny") preds \%>\% filter(geo_value \%in\% four_states) \%>\% @@ -122,5 +122,5 @@ preds \%>\% facet_wrap(~geo_value, scales = "free_y") + theme_bw() + geom_vline(xintercept = forecast_date) - +} } diff --git a/man/smooth_quantile_reg.Rd b/man/smooth_quantile_reg.Rd index 6cc2dfc82..b938541f1 100644 --- a/man/smooth_quantile_reg.Rd +++ b/man/smooth_quantile_reg.Rd @@ -74,7 +74,8 @@ abline(v = fd, lty = 2) lines(pl$x, pl$`0.2`, col = "blue") lines(pl$x, pl$`0.8`, col = "blue") lines(pl$x, pl$`0.5`, col = "red") -\dontrun{ + +if (require("ggplot2")) { ggplot(data.frame(x = x, y = y), aes(x)) + geom_ribbon(data = pl, aes(ymin = `0.2`, ymax = `0.8`), fill = "lightblue") + geom_point(aes(y = y), colour = "grey") + # observed data diff --git a/tests/testthat/test-parse_period.R b/tests/testthat/test-parse_period.R new file mode 100644 index 000000000..0adbcec3d --- /dev/null +++ b/tests/testthat/test-parse_period.R @@ -0,0 +1,12 @@ +test_that("parse_period works", { + expect_error(parse_period(c(1, 2))) + expect_error(parse_period(c(1.3))) + expect_error(parse_period("1 year")) + expect_error(parse_period("2 weeks later")) + expect_identical(parse_period(1), 1L) + expect_identical(parse_period("1 day"), 1L) + expect_identical(parse_period("1 days"), 1L) + expect_identical(parse_period("1 week"), 7L) + expect_identical(parse_period("1 weeks"), 7L) + expect_identical(parse_period("2 weeks"), 14L) +}) From d59a691d2eea0a7ae7606464120e18e0b8e22bc3 Mon Sep 17 00:00:00 2001 From: "Daniel J. McDonald" Date: Wed, 4 Oct 2023 12:32:16 -0700 Subject: [PATCH 11/22] add cdc baseline to pkgdown --- _pkgdown.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/_pkgdown.yml b/_pkgdown.yml index 2ad03c277..89fd733e1 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -34,6 +34,7 @@ reference: contents: - contains("flatline") - contains("arx") + - contains("cdc") - title: Parsnip engines desc: Prediction methods not available elsewhere contents: From 21b4c85340972e0005af1bc40fd1e0bb4763a9ed Mon Sep 17 00:00:00 2001 From: "Daniel J. McDonald" Date: Wed, 4 Oct 2023 13:27:02 -0700 Subject: [PATCH 12/22] local checks pass --- .Rbuildignore | 1 + NAMESPACE | 1 + R/cdc_baseline_forecaster.R | 8 ++++---- R/layer_cdc_flatline_quantiles.R | 4 ++-- R/make_smooth_quantile_reg.R | 2 +- man/cdc_baseline_args_list.Rd | 4 ++-- man/cdc_baseline_forecaster.Rd | 4 ++-- man/layer_cdc_flatline_quantiles.Rd | 10 +++++----- 8 files changed, 18 insertions(+), 16 deletions(-) diff --git a/.Rbuildignore b/.Rbuildignore index 5139bcabe..cb36bb9d2 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -12,3 +12,4 @@ ^musings$ ^data-raw$ ^vignettes/articles$ +^.git-blame-ignore-revs$ diff --git a/NAMESPACE b/NAMESPACE index d7e941b0f..21cd7b83f 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -186,6 +186,7 @@ importFrom(rlang,caller_env) importFrom(rlang,is_empty) importFrom(rlang,is_null) importFrom(rlang,quos) +importFrom(smoothqr,smooth_qr) importFrom(stats,as.formula) importFrom(stats,family) importFrom(stats,lm) diff --git a/R/cdc_baseline_forecaster.R b/R/cdc_baseline_forecaster.R index f5961431d..f1014d666 100644 --- a/R/cdc_baseline_forecaster.R +++ b/R/cdc_baseline_forecaster.R @@ -33,7 +33,7 @@ #' ungroup() %>% #' filter(weekdays(time_value) == "Saturday") #' -#' cdc <- cdc_baseline_forecaster(deaths, "deaths") +#' cdc <- cdc_baseline_forecaster(weekly_deaths, "deaths") #' preds <- pivot_quantiles(cdc$predictions, .pred_distn) #' #' if (require(ggplot2)) { @@ -46,7 +46,7 @@ #' geom_ribbon(aes(ymin = `0.25`, ymax = `0.75`), fill = blues9[6]) + #' geom_line(aes(y = .pred), color = "orange") + #' geom_line( -#' data = deaths %>% filter(geo_value %in% four_states), +#' data = weekly_deaths %>% filter(geo_value %in% four_states), #' aes(x = time_value, y = deaths) #' ) + #' scale_x_date(limits = c(forecast_date - 90, forecast_date + 30)) + @@ -150,7 +150,7 @@ cdc_baseline_forecaster <- function( #' @param nonneg Logical. Force all predictive intervals be non-negative. #' Because non-negativity is forced _before_ propagating forward, this #' has slightly different behaviour than would occur if using -#' [layer_threshold_preds()]. +#' [layer_threshold()]. #' #' @return A list containing updated parameter choices with class `cdc_flat_fcast`. #' @export @@ -158,7 +158,7 @@ cdc_baseline_forecaster <- function( #' @examples #' cdc_baseline_args_list() #' cdc_baseline_args_list(symmetrize = FALSE) -#' cdc_baseline_args_list(levels = c(.1, .3, .7, .9), n_training = 120) +#' cdc_baseline_args_list(quantile_levels = c(.1, .3, .7, .9), n_training = 120) cdc_baseline_args_list <- function( data_frequency = "1 week", aheads = 1:4, diff --git a/R/layer_cdc_flatline_quantiles.R b/R/layer_cdc_flatline_quantiles.R index aa953ad45..f1a159b9a 100644 --- a/R/layer_cdc_flatline_quantiles.R +++ b/R/layer_cdc_flatline_quantiles.R @@ -25,7 +25,7 @@ #' typically observed daily (possibly with missing values), but #' with weekly forecast targets, you would use `c(7, 14, 21, 28)`. But with #' weekly data, you would use `1:4`. -#' @param quantiles Numeric vector of probabilities with values in (0,1) +#' @param quantile_levels Numeric vector of probabilities with values in (0,1) #' referring to the desired predictive intervals. The default is the standard #' set for the COVID Forecast Hub. #' @param nsims Positive integer. The number of draws from the empirical CDF. @@ -35,7 +35,7 @@ #' @param nonneg Logical. Force all predictive intervals be non-negative. #' Because non-negativity is forced _before_ propagating forward, this #' has slightly different behaviour than would occur if using -#' [layer_threshold_preds()]. +#' [layer_threshold()]. #' #' @return an updated `frosting` postprocessor. Calling [predict()] will result #' in an additional `` named `.pred_distn_all` containing 2-column diff --git a/R/make_smooth_quantile_reg.R b/R/make_smooth_quantile_reg.R index cfb08a9c7..7d25a8c6b 100644 --- a/R/make_smooth_quantile_reg.R +++ b/R/make_smooth_quantile_reg.R @@ -21,7 +21,7 @@ #' #' @seealso [fit.model_spec()], [set_engine()] #' -#' @importFrom quantreg rq +#' @importFrom smoothqr smooth_qr #' @examples #' tib <- data.frame( #' y1 = rnorm(100), y2 = rnorm(100), y3 = rnorm(100), diff --git a/man/cdc_baseline_args_list.Rd b/man/cdc_baseline_args_list.Rd index 37c326e9c..2f6546f74 100644 --- a/man/cdc_baseline_args_list.Rd +++ b/man/cdc_baseline_args_list.Rd @@ -53,7 +53,7 @@ residual quantiles are used. It is not applicable with \item{nonneg}{Logical. Force all predictive intervals be non-negative. Because non-negativity is forced \emph{before} propagating forward, this has slightly different behaviour than would occur if using -\code{\link[=layer_threshold_preds]{layer_threshold_preds()}}.} +\code{\link[=layer_threshold]{layer_threshold()}}.} \item{quantile_by_key}{Character vector. Groups residuals by listed keys before calculating residual quantiles. See the \code{by_key} argument to @@ -81,5 +81,5 @@ Constructs a list of arguments for \code{\link[=cdc_baseline_forecaster]{cdc_bas \examples{ cdc_baseline_args_list() cdc_baseline_args_list(symmetrize = FALSE) -cdc_baseline_args_list(levels = c(.1, .3, .7, .9), n_training = 120) +cdc_baseline_args_list(quantile_levels = c(.1, .3, .7, .9), n_training = 120) } diff --git a/man/cdc_baseline_forecaster.Rd b/man/cdc_baseline_forecaster.Rd index 8d0e1b3ec..65de1c29e 100644 --- a/man/cdc_baseline_forecaster.Rd +++ b/man/cdc_baseline_forecaster.Rd @@ -48,7 +48,7 @@ weekly_deaths <- case_death_rate_subset \%>\% ungroup() \%>\% filter(weekdays(time_value) == "Saturday") -cdc <- cdc_baseline_forecaster(deaths, "deaths") +cdc <- cdc_baseline_forecaster(weekly_deaths, "deaths") preds <- pivot_quantiles(cdc$predictions, .pred_distn) if (require(ggplot2)) { @@ -61,7 +61,7 @@ preds \%>\% geom_ribbon(aes(ymin = `0.25`, ymax = `0.75`), fill = blues9[6]) + geom_line(aes(y = .pred), color = "orange") + geom_line( - data = deaths \%>\% filter(geo_value \%in\% four_states), + data = weekly_deaths \%>\% filter(geo_value \%in\% four_states), aes(x = time_value, y = deaths) ) + scale_x_date(limits = c(forecast_date - 90, forecast_date + 30)) + diff --git a/man/layer_cdc_flatline_quantiles.Rd b/man/layer_cdc_flatline_quantiles.Rd index 8eb42b423..71f414e25 100644 --- a/man/layer_cdc_flatline_quantiles.Rd +++ b/man/layer_cdc_flatline_quantiles.Rd @@ -27,6 +27,10 @@ typically observed daily (possibly with missing values), but with weekly forecast targets, you would use \code{c(7, 14, 21, 28)}. But with weekly data, you would use \code{1:4}.} +\item{quantile_levels}{Numeric vector of probabilities with values in (0,1) +referring to the desired predictive intervals. The default is the standard +set for the COVID Forecast Hub.} + \item{nsims}{Positive integer. The number of draws from the empirical CDF. These samples are spaced evenly on the (0, 1) scale, F_X(x) resulting in linear interpolation on the X scale. This is achieved with @@ -40,13 +44,9 @@ calculating quantiles. The default, \code{c()} performs no grouping.} \item{nonneg}{Logical. Force all predictive intervals be non-negative. Because non-negativity is forced \emph{before} propagating forward, this has slightly different behaviour than would occur if using -\code{\link[=layer_threshold_preds]{layer_threshold_preds()}}.} +\code{\link[=layer_threshold]{layer_threshold()}}.} \item{id}{a random id string} - -\item{quantiles}{Numeric vector of probabilities with values in (0,1) -referring to the desired predictive intervals. The default is the standard -set for the COVID Forecast Hub.} } \value{ an updated \code{frosting} postprocessor. Calling \code{\link[=predict]{predict()}} will result From 1f58e67de257b75704acc80a80ff59f5a0b7d6b3 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Wed, 4 Oct 2023 14:46:04 -0700 Subject: [PATCH 13/22] Fix incomplete `symmetrize` + document it --- R/layer_cdc_flatline_quantiles.R | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/R/layer_cdc_flatline_quantiles.R b/R/layer_cdc_flatline_quantiles.R index f1a159b9a..0b0db20b7 100644 --- a/R/layer_cdc_flatline_quantiles.R +++ b/R/layer_cdc_flatline_quantiles.R @@ -32,10 +32,20 @@ #' These samples are spaced evenly on the (0, 1) scale, F_X(x) resulting #' in linear interpolation on the X scale. This is achieved with #' [stats::quantile()] Type 7 (the default for that function). +#' @param symmetrize Logical. If `TRUE`, does two things: (i) forces the +#' "empirical" CDF of residuals to be symmetric by pretending that for every +#' actually-observed residual X we also observed another residual -X, and (ii) +#' at each ahead, forces the median simulated value to be equal to the point +#' prediction by adding or subtracting the same amount to every simulated +#' value. Adjustments in (ii) take place before propagating forward and +#' simulating the next ahead. This forces any 1-ahead predictive intervals to +#' be symmetric about the point prediction, and encourages larger aheads to be +#' more symmetric. #' @param nonneg Logical. Force all predictive intervals be non-negative. -#' Because non-negativity is forced _before_ propagating forward, this -#' has slightly different behaviour than would occur if using -#' [layer_threshold()]. +#' Because non-negativity is forced _before_ propagating forward, this has +#' slightly different behaviour than would occur if using [layer_threshold()]. +#' Thresholding at each ahead takes place after any shifting from +#' `symmetrize`. #' #' @return an updated `frosting` postprocessor. Calling [predict()] will result #' in an additional `` named `.pred_distn_all` containing 2-column @@ -232,6 +242,9 @@ slather.layer_cdc_flatline_quantiles <- propogate_samples <- function( r, p, quantile_levels, aheads, nsim, symmetrize, nonneg) { max_ahead <- max(aheads) + if (symmetrize) { + r <- c(r, -r) + } samp <- quantile(r, probs = c(0, seq_len(nsim - 1)) / (nsim - 1), na.rm = TRUE) res <- list() From fe31a790eeed2401ae659a9a070eece75847269d Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Wed, 4 Oct 2023 14:47:54 -0700 Subject: [PATCH 14/22] `document()` --- man/layer_cdc_flatline_quantiles.Rd | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/man/layer_cdc_flatline_quantiles.Rd b/man/layer_cdc_flatline_quantiles.Rd index 71f414e25..22219ba7f 100644 --- a/man/layer_cdc_flatline_quantiles.Rd +++ b/man/layer_cdc_flatline_quantiles.Rd @@ -39,12 +39,21 @@ in linear interpolation on the X scale. This is achieved with \item{by_key}{A character vector of keys to group the residuals by before calculating quantiles. The default, \code{c()} performs no grouping.} -\item{symmetrize}{logical. If \code{TRUE} then interval will be symmetric.} +\item{symmetrize}{Logical. If \code{TRUE}, does two things: (i) forces the +"empirical" CDF of residuals to be symmetric by pretending that for every +actually-observed residual X we also observed another residual -X, and (ii) +at each ahead, forces the median simulated value to be equal to the point +prediction by adding or subtracting the same amount to every simulated +value. Adjustments in (ii) take place before propagating forward and +simulating the next ahead. This forces any 1-ahead predictive intervals to +be symmetric about the point prediction, and encourages larger aheads to be +more symmetric.} \item{nonneg}{Logical. Force all predictive intervals be non-negative. -Because non-negativity is forced \emph{before} propagating forward, this -has slightly different behaviour than would occur if using -\code{\link[=layer_threshold]{layer_threshold()}}.} +Because non-negativity is forced \emph{before} propagating forward, this has +slightly different behaviour than would occur if using \code{\link[=layer_threshold]{layer_threshold()}}. +Thresholding at each ahead takes place after any shifting from +\code{symmetrize}.} \item{id}{a random id string} } From 93fac1aca9034ee61de4f993df315223145e4c73 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Wed, 4 Oct 2023 14:49:30 -0700 Subject: [PATCH 15/22] Fix death rate 7dav -> weekly sum conversion in example --- R/cdc_baseline_forecaster.R | 2 +- man/cdc_baseline_forecaster.Rd | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/R/cdc_baseline_forecaster.R b/R/cdc_baseline_forecaster.R index f1014d666..4a8c4b7a4 100644 --- a/R/cdc_baseline_forecaster.R +++ b/R/cdc_baseline_forecaster.R @@ -26,7 +26,7 @@ #' weekly_deaths <- case_death_rate_subset %>% #' select(geo_value, time_value, death_rate) %>% #' left_join(state_census %>% select(pop, abbr), by = c("geo_value" = "abbr")) %>% -#' mutate(deaths = pmax(death_rate / 1e5 * pop, 0)) %>% +#' mutate(deaths = pmax(death_rate / 1e5 * pop * 7, 0)) %>% #' select(-pop, -death_rate) %>% #' group_by(geo_value) %>% #' epi_slide(~ sum(.$deaths), before = 6, new_col_name = "deaths") %>% diff --git a/man/cdc_baseline_forecaster.Rd b/man/cdc_baseline_forecaster.Rd index 65de1c29e..31fe75f25 100644 --- a/man/cdc_baseline_forecaster.Rd +++ b/man/cdc_baseline_forecaster.Rd @@ -41,7 +41,7 @@ library(dplyr) weekly_deaths <- case_death_rate_subset \%>\% select(geo_value, time_value, death_rate) \%>\% left_join(state_census \%>\% select(pop, abbr), by = c("geo_value" = "abbr")) \%>\% - mutate(deaths = pmax(death_rate / 1e5 * pop, 0)) \%>\% + mutate(deaths = pmax(death_rate / 1e5 * pop * 7, 0)) \%>\% select(-pop, -death_rate) \%>\% group_by(geo_value) \%>\% epi_slide(~ sum(.$deaths), before = 6, new_col_name = "deaths") \%>\% From 0516836dfcbbcbd58ca19eed8cee3158ac50b32d Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Wed, 4 Oct 2023 14:51:08 -0700 Subject: [PATCH 16/22] Copyediting, roxygen link styling, formatting --- R/cdc_baseline_forecaster.R | 6 +++--- R/layer_cdc_flatline_quantiles.R | 14 +++++++------- man/cdc_baseline_forecaster.Rd | 6 +++--- man/layer_cdc_flatline_quantiles.Rd | 10 +++++----- ...ropogate_samples.R => test-propagate_samples.R} | 2 +- 5 files changed, 19 insertions(+), 19 deletions(-) rename tests/testthat/{test-propogate_samples.R => test-propagate_samples.R} (72%) diff --git a/R/cdc_baseline_forecaster.R b/R/cdc_baseline_forecaster.R index 4a8c4b7a4..e66563b65 100644 --- a/R/cdc_baseline_forecaster.R +++ b/R/cdc_baseline_forecaster.R @@ -12,13 +12,13 @@ #' This forecaster is meant to produce exactly the CDC Baseline used for #' [COVID19ForecastHub](https://covid19forecasthub.org) #' -#' @param epi_data An [epiprocess::epi_df] +#' @param epi_data An [`epiprocess::epi_df`] #' @param outcome A scalar character for the column name we wish to predict. #' @param args_list A list of additional arguments as created by the #' [cdc_baseline_args_list()] constructor function. #' -#' @return A data frame of point and interval forecasts at for all -#' aheads (unique horizons) for each unique combination of `key_vars`. +#' @return A data frame of point and interval forecasts for all aheads (unique +#' horizons) for each unique combination of `key_vars`. #' @export #' #' @examples diff --git a/R/layer_cdc_flatline_quantiles.R b/R/layer_cdc_flatline_quantiles.R index 0b0db20b7..16538f0e1 100644 --- a/R/layer_cdc_flatline_quantiles.R +++ b/R/layer_cdc_flatline_quantiles.R @@ -22,15 +22,15 @@ #' @inheritParams layer_residual_quantiles #' @param aheads Numeric vector of desired forecast horizons. These should be #' given in the "units of the training data". So, for example, for data -#' typically observed daily (possibly with missing values), but -#' with weekly forecast targets, you would use `c(7, 14, 21, 28)`. But with -#' weekly data, you would use `1:4`. +#' typically observed daily (possibly with missing values), but with weekly +#' forecast targets, you would use `c(7, 14, 21, 28)`. But with weekly data, +#' you would use `1:4`. #' @param quantile_levels Numeric vector of probabilities with values in (0,1) #' referring to the desired predictive intervals. The default is the standard #' set for the COVID Forecast Hub. #' @param nsims Positive integer. The number of draws from the empirical CDF. -#' These samples are spaced evenly on the (0, 1) scale, F_X(x) resulting -#' in linear interpolation on the X scale. This is achieved with +#' These samples are spaced evenly on the (0, 1) scale, F_X(x) resulting in +#' linear interpolation on the X scale. This is achieved with #' [stats::quantile()] Type 7 (the default for that function). #' @param symmetrize Logical. If `TRUE`, does two things: (i) forces the #' "empirical" CDF of residuals to be symmetric by pretending that for every @@ -223,7 +223,7 @@ slather.layer_cdc_flatline_quantiles <- res <- dplyr::left_join(p, r, by = avail_grps) %>% dplyr::rowwise() %>% dplyr::mutate( - .pred_distn_all = propogate_samples( + .pred_distn_all = propagate_samples( .resid, .pred, object$quantile_levels, object$aheads, object$nsim, object$symmetrize, object$nonneg ) @@ -239,7 +239,7 @@ slather.layer_cdc_flatline_quantiles <- components } -propogate_samples <- function( +propagate_samples <- function( r, p, quantile_levels, aheads, nsim, symmetrize, nonneg) { max_ahead <- max(aheads) if (symmetrize) { diff --git a/man/cdc_baseline_forecaster.Rd b/man/cdc_baseline_forecaster.Rd index 31fe75f25..3f5faa329 100644 --- a/man/cdc_baseline_forecaster.Rd +++ b/man/cdc_baseline_forecaster.Rd @@ -11,7 +11,7 @@ cdc_baseline_forecaster( ) } \arguments{ -\item{epi_data}{An \link[epiprocess:epi_df]{epiprocess::epi_df}} +\item{epi_data}{An \code{\link[epiprocess:epi_df]{epiprocess::epi_df}}} \item{outcome}{A scalar character for the column name we wish to predict.} @@ -19,8 +19,8 @@ cdc_baseline_forecaster( \code{\link[=cdc_baseline_args_list]{cdc_baseline_args_list()}} constructor function.} } \value{ -A data frame of point and interval forecasts at for all -aheads (unique horizons) for each unique combination of \code{key_vars}. +A data frame of point and interval forecasts for all aheads (unique +horizons) for each unique combination of \code{key_vars}. } \description{ This is a simple forecasting model for diff --git a/man/layer_cdc_flatline_quantiles.Rd b/man/layer_cdc_flatline_quantiles.Rd index 22219ba7f..5e72378b3 100644 --- a/man/layer_cdc_flatline_quantiles.Rd +++ b/man/layer_cdc_flatline_quantiles.Rd @@ -23,17 +23,17 @@ layer_cdc_flatline_quantiles( \item{aheads}{Numeric vector of desired forecast horizons. These should be given in the "units of the training data". So, for example, for data -typically observed daily (possibly with missing values), but -with weekly forecast targets, you would use \code{c(7, 14, 21, 28)}. But with -weekly data, you would use \code{1:4}.} +typically observed daily (possibly with missing values), but with weekly +forecast targets, you would use \code{c(7, 14, 21, 28)}. But with weekly data, +you would use \code{1:4}.} \item{quantile_levels}{Numeric vector of probabilities with values in (0,1) referring to the desired predictive intervals. The default is the standard set for the COVID Forecast Hub.} \item{nsims}{Positive integer. The number of draws from the empirical CDF. -These samples are spaced evenly on the (0, 1) scale, F_X(x) resulting -in linear interpolation on the X scale. This is achieved with +These samples are spaced evenly on the (0, 1) scale, F_X(x) resulting in +linear interpolation on the X scale. This is achieved with \code{\link[stats:quantile]{stats::quantile()}} Type 7 (the default for that function).} \item{by_key}{A character vector of keys to group the residuals by before diff --git a/tests/testthat/test-propogate_samples.R b/tests/testthat/test-propagate_samples.R similarity index 72% rename from tests/testthat/test-propogate_samples.R rename to tests/testthat/test-propagate_samples.R index b8a1ff82d..5278ab385 100644 --- a/tests/testthat/test-propogate_samples.R +++ b/tests/testthat/test-propagate_samples.R @@ -1,4 +1,4 @@ -test_that("propogate_samples", { +test_that("propagate_samples", { r <- -30:50 p <- 40 quantiles <- 1:9 / 10 From 0905ba47cb8050a80a548366e5eb971b3a0dadfb Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Wed, 4 Oct 2023 16:52:40 -0700 Subject: [PATCH 17/22] "Logical" -> "Scalar logical" as appropriate to match rest of docs --- R/layer_cdc_flatline_quantiles.R | 4 ++-- man/layer_cdc_flatline_quantiles.Rd | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/R/layer_cdc_flatline_quantiles.R b/R/layer_cdc_flatline_quantiles.R index 16538f0e1..d9d192563 100644 --- a/R/layer_cdc_flatline_quantiles.R +++ b/R/layer_cdc_flatline_quantiles.R @@ -32,7 +32,7 @@ #' These samples are spaced evenly on the (0, 1) scale, F_X(x) resulting in #' linear interpolation on the X scale. This is achieved with #' [stats::quantile()] Type 7 (the default for that function). -#' @param symmetrize Logical. If `TRUE`, does two things: (i) forces the +#' @param symmetrize Scalar logical. If `TRUE`, does two things: (i) forces the #' "empirical" CDF of residuals to be symmetric by pretending that for every #' actually-observed residual X we also observed another residual -X, and (ii) #' at each ahead, forces the median simulated value to be equal to the point @@ -41,7 +41,7 @@ #' simulating the next ahead. This forces any 1-ahead predictive intervals to #' be symmetric about the point prediction, and encourages larger aheads to be #' more symmetric. -#' @param nonneg Logical. Force all predictive intervals be non-negative. +#' @param nonneg Scalar logical. Force all predictive intervals be non-negative. #' Because non-negativity is forced _before_ propagating forward, this has #' slightly different behaviour than would occur if using [layer_threshold()]. #' Thresholding at each ahead takes place after any shifting from diff --git a/man/layer_cdc_flatline_quantiles.Rd b/man/layer_cdc_flatline_quantiles.Rd index 5e72378b3..55a1a378e 100644 --- a/man/layer_cdc_flatline_quantiles.Rd +++ b/man/layer_cdc_flatline_quantiles.Rd @@ -39,7 +39,7 @@ linear interpolation on the X scale. This is achieved with \item{by_key}{A character vector of keys to group the residuals by before calculating quantiles. The default, \code{c()} performs no grouping.} -\item{symmetrize}{Logical. If \code{TRUE}, does two things: (i) forces the +\item{symmetrize}{Scalar logical. If \code{TRUE}, does two things: (i) forces the "empirical" CDF of residuals to be symmetric by pretending that for every actually-observed residual X we also observed another residual -X, and (ii) at each ahead, forces the median simulated value to be equal to the point @@ -49,7 +49,7 @@ simulating the next ahead. This forces any 1-ahead predictive intervals to be symmetric about the point prediction, and encourages larger aheads to be more symmetric.} -\item{nonneg}{Logical. Force all predictive intervals be non-negative. +\item{nonneg}{Scalar logical. Force all predictive intervals be non-negative. Because non-negativity is forced \emph{before} propagating forward, this has slightly different behaviour than would occur if using \code{\link[=layer_threshold]{layer_threshold()}}. Thresholding at each ahead takes place after any shifting from From 6f14e6a1e950b23cf6fa4f51c2f6ee88527ac96b Mon Sep 17 00:00:00 2001 From: "Daniel J. McDonald" Date: Thu, 5 Oct 2023 06:17:08 -0700 Subject: [PATCH 18/22] add formatter to the correct branch --- R/flusight_hub_formatter.R | 95 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 95 insertions(+) create mode 100644 R/flusight_hub_formatter.R diff --git a/R/flusight_hub_formatter.R b/R/flusight_hub_formatter.R new file mode 100644 index 000000000..78bb5719c --- /dev/null +++ b/R/flusight_hub_formatter.R @@ -0,0 +1,95 @@ +abbr_to_fips <- function(abbr) { + fi <- dplyr::left_join( + tibble::tibble(abbr = tolower(abbr)), + state_census, by = "abbr") %>% + dplyr::mutate(fips = as.character(fips), fips = case_when( + fips == "0" ~ "US", + nchar(fips) < 2L ~ paste0("0", fips), + TRUE ~ fips + )) %>% + pull(.data$fips) + names(fi) <- NULL + fi +} + +#' Format predictions for submission to FluSight forecast Hub +#' +#' +#' +#' @param object a data.frame of predictions or an object of class +#' `canned_epipred` as created by, e.g., [arx_forecaster()] +#' @param ... <[`dynamic-dots`][rlang::dyn-dots]> Name = value pairs of constant +#' columns (or mutations) to perform to the results. +#' @param .fcast_period +#' +#' @return A [tibble::tibble]. +#' @export +#' +#' @examples +flusight_hub_formatter <- function( + object, ..., + .fcast_period = c("daily", "weekly")) { + UseMethod("flusight_hub_formatter") +} + +#' @export +flusight_hub_formatter.canned_epipred <- function( + object, ..., + .fcast_period = c("daily", "weekly")) { + flusight_hub_formatter(object$predictions, ..., .fcast_period = .fcast_period) +} + +#' @export +flusight_hub_formatter.data.frame <- function( + object, ..., + .fcast_period = c("daily", "weekly")) { + required_names <- c(".pred", ".pred_distn", "forecast_date", "geo_value") + optional_names <- c("ahead", "target_date") + hardhat::validate_column_names(object, required_names) + if (!any(optional_names %in% names(object))) { + cli::cli_abort("At least one of {.val {optional_names}} must be present.") + } + + dots <- enquos(..., .named = TRUE) + names <- names(dots) + + object <- object %>% + # combine the predictions and the distribution + dplyr::mutate(.pred_distn = nested_quantiles(.pred_distn)) %>% + dplyr::rowwise() %>% + dplyr::mutate( + .pred_distn = list(add_row(.pred_distn, q = .pred, tau = NA)), + .pred = NULL + ) %>% + tidyr::unnest(.pred_distn) %>% + # now we create the correct column names + dplyr::rename( + value = q, + output_type_id = tau, + reference_date = forecast_date + ) %>% + # convert to fips codes, and add any constant cols passed in ... + dplyr::mutate(location = abbr_to_fips(tolower(geo_value)), geo_value = NULL, !!!dots) + + # create target_end_date / horizon, depending on what is available + pp <- ifelse(match.arg(.fcast_period) == "daily", 1L, 7L) + has_ahead <- charmatch("ahead", names(object)) + if ("target_date" %in% names(object) && !is.na(has_ahead)) { + object <- object %>% + dplyr::rename( + target_end_date = target_date, + horizon = !!names(object)[has_ahead] + ) + } else if (!is.na(has_ahead)) { # ahead present, not target date + object <- object %>% + dplyr::rename(horizon = !!names(object)[has_ahead]) %>% + dplyr::mutate(target_end_date = horizon * pp + reference_date) + } else { # target_date present, not ahead + object <- object %>% + dplyr::rename(target_end_date = target_date) %>% + dplyr::mutate(horizon = as.integer((target_end_date - reference_date)) / pp) + } + object %>% dplyr::relocate( + reference_date, horizon, target_end_date, location, output_type_id, value + ) +} From b1c34cb862146b48344e0174db62916f62d4a2c0 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Thu, 5 Oct 2023 07:48:40 -0700 Subject: [PATCH 19/22] Speed up cdc baseline: `quantile(...., names = FALSE)` --- R/layer_cdc_flatline_quantiles.R | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/R/layer_cdc_flatline_quantiles.R b/R/layer_cdc_flatline_quantiles.R index d9d192563..312c3630e 100644 --- a/R/layer_cdc_flatline_quantiles.R +++ b/R/layer_cdc_flatline_quantiles.R @@ -245,7 +245,8 @@ propagate_samples <- function( if (symmetrize) { r <- c(r, -r) } - samp <- quantile(r, probs = c(0, seq_len(nsim - 1)) / (nsim - 1), na.rm = TRUE) + samp <- quantile(r, probs = c(0, seq_len(nsim - 1)) / (nsim - 1), + na.rm = TRUE, names = FALSE) res <- list() raw <- samp + p From cf9a44e3fb3cd2a08342272787882b3626e5ff91 Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Thu, 5 Oct 2023 11:57:19 -0700 Subject: [PATCH 20/22] CI: trying to change in a particular branch too --- .github/workflows/R-CMD-check.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml index c4bcd6b68..eff7367ec 100644 --- a/.github/workflows/R-CMD-check.yaml +++ b/.github/workflows/R-CMD-check.yaml @@ -4,9 +4,9 @@ # Created with usethis + edited to use API key. on: push: - branches: [main, master] + branches: [main, master, v0.0.6] pull_request: - branches: [main, master] + branches: [main, master, v0.0.6] name: R-CMD-check From 343add145aed2baea3a60ec204c41f03d153f5a7 Mon Sep 17 00:00:00 2001 From: "Daniel J. McDonald" Date: Thu, 5 Oct 2023 12:26:16 -0700 Subject: [PATCH 21/22] formatter works --- NAMESPACE | 3 +++ R/flusight_hub_formatter.R | 37 +++++++++++++++++++++---- man/flusight_hub_formatter.Rd | 51 +++++++++++++++++++++++++++++++++++ 3 files changed, 86 insertions(+), 5 deletions(-) create mode 100644 man/flusight_hub_formatter.Rd diff --git a/NAMESPACE b/NAMESPACE index 21cd7b83f..03b159764 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -34,6 +34,8 @@ S3method(extrapolate_quantiles,dist_default) S3method(extrapolate_quantiles,dist_quantiles) S3method(extrapolate_quantiles,distribution) S3method(fit,epi_workflow) +S3method(flusight_hub_formatter,canned_epipred) +S3method(flusight_hub_formatter,data.frame) S3method(format,dist_quantiles) S3method(is.na,dist_quantiles) S3method(is.na,distribution) @@ -126,6 +128,7 @@ export(fit) export(flatline) export(flatline_args_list) export(flatline_forecaster) +export(flusight_hub_formatter) export(frosting) export(get_test_data) export(grab_names) diff --git a/R/flusight_hub_formatter.R b/R/flusight_hub_formatter.R index 78bb5719c..1ddd471b3 100644 --- a/R/flusight_hub_formatter.R +++ b/R/flusight_hub_formatter.R @@ -14,18 +14,44 @@ abbr_to_fips <- function(abbr) { #' Format predictions for submission to FluSight forecast Hub #' -#' +#' This function converts predictions from any of the included forecasters into +#' a format (nearly) ready for submission to the 2023-24 +#' [FluSight-forecast-hub](https://github.com/cdcepi/FluSight-forecast-hub). +#' See there for documentation of the required columns. Currently, only +#' "quantile" forcasts are supported, but the intention is to support both +#' "quantile" and "pmf". For this reason, adding the `output_type` column should +#' be done via the `...` argument. See the examples below. The specific required +#' format for this forecast task is [here](https://github.com/cdcepi/FluSight-forecast-hub/blob/main/model-output/README.md). #' #' @param object a data.frame of predictions or an object of class #' `canned_epipred` as created by, e.g., [arx_forecaster()] #' @param ... <[`dynamic-dots`][rlang::dyn-dots]> Name = value pairs of constant -#' columns (or mutations) to perform to the results. +#' columns (or mutations) to perform to the results. See examples. #' @param .fcast_period #' -#' @return A [tibble::tibble]. +#' @return A [tibble::tibble]. If `...` is empty, the result will contain the +#' columns `reference_date`, `horizon`, `target_end_date`, `location`, +#' `output_type_id`, and `value`. The `...` can perform mutations on any of +#' these. #' @export #' #' @examples +#' library(dplyr) +#' weekly_deaths <- case_death_rate_subset %>% +#' select(geo_value, time_value, death_rate) %>% +#' left_join(state_census %>% select(pop, abbr), by = c("geo_value" = "abbr")) %>% +#' mutate(deaths = pmax(death_rate / 1e5 * pop * 7, 0)) %>% +#' select(-pop, -death_rate) %>% +#' group_by(geo_value) %>% +#' epi_slide(~ sum(.$deaths), before = 6, new_col_name = "deaths") %>% +#' ungroup() %>% +#' filter(weekdays(time_value) == "Saturday") +#' +#' cdc <- cdc_baseline_forecaster(weekly_deaths, "deaths") +#' flusight_hub_formatter(cdc) +#' flusight_hub_formatter(cdc, target = "wk inc covid deaths") +#' flusight_hub_formatter(cdc, target = paste(horizon, "wk inc covid deaths")) +#' flusight_hub_formatter(cdc, target = "wk inc covid deaths", output_type = "quantile") flusight_hub_formatter <- function( object, ..., .fcast_period = c("daily", "weekly")) { @@ -69,7 +95,7 @@ flusight_hub_formatter.data.frame <- function( reference_date = forecast_date ) %>% # convert to fips codes, and add any constant cols passed in ... - dplyr::mutate(location = abbr_to_fips(tolower(geo_value)), geo_value = NULL, !!!dots) + dplyr::mutate(location = abbr_to_fips(tolower(geo_value)), geo_value = NULL) # create target_end_date / horizon, depending on what is available pp <- ifelse(match.arg(.fcast_period) == "daily", 1L, 7L) @@ -91,5 +117,6 @@ flusight_hub_formatter.data.frame <- function( } object %>% dplyr::relocate( reference_date, horizon, target_end_date, location, output_type_id, value - ) + ) %>% + dplyr::mutate(!!!dots) } diff --git a/man/flusight_hub_formatter.Rd b/man/flusight_hub_formatter.Rd new file mode 100644 index 000000000..4a2661080 --- /dev/null +++ b/man/flusight_hub_formatter.Rd @@ -0,0 +1,51 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/flusight_hub_formatter.R +\name{flusight_hub_formatter} +\alias{flusight_hub_formatter} +\title{Format predictions for submission to FluSight forecast Hub} +\usage{ +flusight_hub_formatter(object, ..., .fcast_period = c("daily", "weekly")) +} +\arguments{ +\item{object}{a data.frame of predictions or an object of class +\code{canned_epipred} as created by, e.g., \code{\link[=arx_forecaster]{arx_forecaster()}}} + +\item{...}{<\code{\link[rlang:dyn-dots]{dynamic-dots}}> Name = value pairs of constant +columns (or mutations) to perform to the results. See examples.} + +\item{.fcast_period}{} +} +\value{ +A \link[tibble:tibble]{tibble::tibble}. If \code{...} is empty, the result will contain the +columns \code{reference_date}, \code{horizon}, \code{target_end_date}, \code{location}, +\code{output_type_id}, and \code{value}. The \code{...} can perform mutations on any of +these. +} +\description{ +This function converts predictions from any of the included forecasters into +a format (nearly) ready for submission to the 2023-24 +\href{https://github.com/cdcepi/FluSight-forecast-hub}{FluSight-forecast-hub}. +See there for documentation of the required columns. Currently, only +"quantile" forcasts are supported, but the intention is to support both +"quantile" and "pmf". For this reason, adding the \code{output_type} column should +be done via the \code{...} argument. See the examples below. The specific required +format for this forecast task is \href{https://github.com/cdcepi/FluSight-forecast-hub/blob/main/model-output/README.md}{here}. +} +\examples{ +library(dplyr) +weekly_deaths <- case_death_rate_subset \%>\% + select(geo_value, time_value, death_rate) \%>\% + left_join(state_census \%>\% select(pop, abbr), by = c("geo_value" = "abbr")) \%>\% + mutate(deaths = pmax(death_rate / 1e5 * pop * 7, 0)) \%>\% + select(-pop, -death_rate) \%>\% + group_by(geo_value) \%>\% + epi_slide(~ sum(.$deaths), before = 6, new_col_name = "deaths") \%>\% + ungroup() \%>\% + filter(weekdays(time_value) == "Saturday") + +cdc <- cdc_baseline_forecaster(weekly_deaths, "deaths") +flusight_hub_formatter(cdc) +flusight_hub_formatter(cdc, target = "wk inc covid deaths") +flusight_hub_formatter(cdc, target = paste(horizon, "wk inc covid deaths")) +flusight_hub_formatter(cdc, target = "wk inc covid deaths", output_type = "quantile") +} From b5fe62451d3f9ffc4677e12e7b0ccbc0f82f9d38 Mon Sep 17 00:00:00 2001 From: "Daniel J. McDonald" Date: Thu, 5 Oct 2023 12:37:29 -0700 Subject: [PATCH 22/22] local checks pass --- R/flusight_hub_formatter.R | 11 ++++++++++- man/flusight_hub_formatter.Rd | 11 ++++++++++- 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/R/flusight_hub_formatter.R b/R/flusight_hub_formatter.R index 1ddd471b3..d433ab2a7 100644 --- a/R/flusight_hub_formatter.R +++ b/R/flusight_hub_formatter.R @@ -27,7 +27,16 @@ abbr_to_fips <- function(abbr) { #' `canned_epipred` as created by, e.g., [arx_forecaster()] #' @param ... <[`dynamic-dots`][rlang::dyn-dots]> Name = value pairs of constant #' columns (or mutations) to perform to the results. See examples. -#' @param .fcast_period +#' @param .fcast_period Control whether the `horizon` should represent days or +#' weeks. Depending on whether the forecaster output has target dates +#' from [layer_add_target_date()] or not, we may need to compute the horizon +#' and/or the `target_end_date` from the other available columns in the predictions. +#' When both `ahead` and `target_date` are available, this is ignored. If only +#' `ahead` or `aheads` exists, then the target date may need to be multiplied +#' if the `ahead` represents weekly forecasts. Alternatively, if only, the +#' `target_date` is available, then the `horizon` will be in days, unless +#' this argument is `"weekly"`. Note that these can be adjusted later by the +#' `...` argument. #' #' @return A [tibble::tibble]. If `...` is empty, the result will contain the #' columns `reference_date`, `horizon`, `target_end_date`, `location`, diff --git a/man/flusight_hub_formatter.Rd b/man/flusight_hub_formatter.Rd index 4a2661080..d8a4571f4 100644 --- a/man/flusight_hub_formatter.Rd +++ b/man/flusight_hub_formatter.Rd @@ -13,7 +13,16 @@ flusight_hub_formatter(object, ..., .fcast_period = c("daily", "weekly")) \item{...}{<\code{\link[rlang:dyn-dots]{dynamic-dots}}> Name = value pairs of constant columns (or mutations) to perform to the results. See examples.} -\item{.fcast_period}{} +\item{.fcast_period}{Control whether the \code{horizon} should represent days or +weeks. Depending on whether the forecaster output has target dates +from \code{\link[=layer_add_target_date]{layer_add_target_date()}} or not, we may need to compute the horizon +and/or the \code{target_end_date} from the other available columns in the predictions. +When both \code{ahead} and \code{target_date} are available, this is ignored. If only +\code{ahead} or \code{aheads} exists, then the target date may need to be multiplied +if the \code{ahead} represents weekly forecasts. Alternatively, if only, the +\code{target_date} is available, then the \code{horizon} will be in days, unless +this argument is \code{"weekly"}. Note that these can be adjusted later by the +\code{...} argument.} } \value{ A \link[tibble:tibble]{tibble::tibble}. If \code{...} is empty, the result will contain the