|
#' Predict on smoothed data and the standard deviation
#' @description
#' This is a variant of `scaled_pop`, which predicts on a smoothed version of
#' the data. Even if the target is smoothed when used as a /predictor/, as a
#' /target/ it still uses the raw value (this captures some of the noise). It
#' also uses a rolling standard deviation as an auxiliary signal, with a
#' window of width `sd_width`, which by default is 28 days.
#' @param epi_data the actual data used
#' @param outcome the name of the target variable
#' @param extra_sources the name of any extra columns to use. This list could be
#'   empty
#' @param ahead (this is relative to the `as_of` field of the `epi_df`, which is
#'   likely *not* the same as the `ahead` used by epipredict, which is relative
#'   to the max time value of the `epi_df`. how to handle this is a modelling
#'   question left up to each forecaster; see latency_adjusting.R for the
#'   existing examples)
#' @param pop_scaling an example extra parameter unique to this forecaster
#' @param trainer an example extra parameter that is fairly common
#' @param smooth_width the number of days over which to do smoothing. If `NULL`,
#'   then no smoothing is applied.
#' @param smooth_cols the names of the columns to smooth. If `NULL` it smooths
#'   everything
#' @param sd_width the number of days over which to take a moving average of the
#'   standard deviation. If `NULL`, the sd isn't included.
#' @param sd_mean_width to calculate the sd, we need a window size for the mean
#'   used.
#' @param sd_cols the names of the columns to take the sd of. If `NULL` it
#'   includes the sd of everything
#' @param quantile_levels The quantile levels to predict. Defaults to those
#'   required by covidhub.
#' @param ... any additional arguments, passed through to [arx_args_list].
#' @return a tibble with columns
#'   `(geo_value, forecast_date, target_end_date, quantile, value)`; empty (but
#'   with the same columns) when there is insufficient data to forecast.
#' @seealso some utilities for making forecasters: [format_storage],
#'   [perform_sanity_checks]
#' @importFrom epipredict epi_recipe step_population_scaling frosting arx_args_list layer_population_scaling
#' @importFrom tibble tibble
#' @importFrom recipes all_numeric
#' @export
smoothed_scaled <- function(epi_data,
                            outcome,
                            extra_sources = "",
                            ahead = 1,
                            pop_scaling = TRUE,
                            trainer = parsnip::linear_reg(),
                            quantile_levels = covidhub_probs(),
                            smooth_width = 7,
                            smooth_cols = NULL,
                            sd_width = 28,
                            sd_mean_width = 14,
                            sd_cols = NULL,
                            ...) {
  # perform any preprocessing not supported by epipredict
  # this is a temp fix until a real fix gets put into epipredict
  epi_data <- clear_lastminute_nas(epi_data)
  # one that every forecaster will need to handle: how to manage max(time_value)
  # that's older than the `as_of` date
  epidataAhead <- extend_ahead(epi_data, ahead)
  # see latency_adjusting for other examples
  # this next part is basically unavoidable boilerplate you'll want to copy
  epi_data <- epidataAhead[[1]]
  effective_ahead <- epidataAhead[[2]]
  args_input <- list(...)
  # edge case where there is no data or less data than the lags; eventually epipredict will handle this
  if (!confirm_sufficient_data(epi_data, effective_ahead, args_input)) {
    null_result <- tibble(
      geo_value = character(),
      forecast_date = lubridate::Date(),
      target_end_date = lubridate::Date(),
      quantile = numeric(),
      value = numeric()
    )
    return(null_result)
  }
  args_input[["ahead"]] <- effective_ahead
  args_input[["quantile_levels"]] <- quantile_levels
  args_list <- do.call(arx_args_list, args_input)
  # if you want to ignore extra_sources, setting predictors is the way to do it
  predictors <- c(outcome, extra_sources)
  # TODO: Partial match quantile_level coming from here (on Dmitry's machine)
  argsPredictorsTrainer <- perform_sanity_checks(epi_data, outcome, predictors, trainer, args_list)
  args_list <- argsPredictorsTrainer[[1]]
  predictors <- argsPredictorsTrainer[[2]]
  trainer <- argsPredictorsTrainer[[3]]
  # end of the copypasta
  # finally, any other pre-processing (e.g. smoothing) that isn't performed by
  # epipredict

  # smoothing: `keep_mean` asks whether `rolling_sd` can reuse the rolling mean
  # it computes internally, so we don't compute it twice. That is only possible
  # when the sd is actually being computed (`sd_width` non-NULL) AND its mean
  # window matches `smooth_width`. Both widths are checked with `is.null` first:
  # `NULL == x` yields `logical(0)`, which crashes `&&`/`if`.
  keep_mean <-
    !is.null(smooth_width) && !is.null(sd_width) &&
      smooth_width == sd_mean_width
  if (!is.null(smooth_width) && !keep_mean) {
    epi_data %<>% rolling_mean(
      width = smooth_width,
      cols_to_mean = smooth_cols
    )
  }

  # measuring standard deviation
  if (!is.null(sd_width)) {
    epi_data %<>% rolling_sd(
      sd_width = sd_width,
      mean_width = sd_mean_width,
      cols_to_sd = sd_cols,
      keep_mean = keep_mean
    )
  }

  # preprocessing supported by epipredict
  preproc <- epi_recipe(epi_data)
  if (pop_scaling) {
    preproc %<>% step_population_scaling(
      all_numeric(),
      df = epipredict::state_census,
      df_pop_col = "pop",
      create_new = FALSE,
      rate_rescaling = 1e5,
      by = c("geo_value" = "abbr")
    )
  }
  preproc %<>% arx_preprocess(outcome, predictors, args_list)

  # postprocessing supported by epipredict
  postproc <- frosting()
  postproc %<>% arx_postprocess(trainer, args_list)
  if (pop_scaling) {
    postproc %<>% layer_population_scaling(
      .pred, .pred_distn,
      df = epipredict::state_census,
      df_pop_col = "pop",
      create_new = FALSE,
      rate_rescaling = 1e5,
      by = c("geo_value" = "abbr")
    )
  }
  # with all the setup done, we execute and format
  pred <- run_workflow_and_format(preproc, postproc, trainer, epi_data)
  # now pred has the columns
  # (geo_value, forecast_date, target_end_date, quantile, value)
  # finally, any postprocessing not supported by epipredict e.g. calibration
  return(pred)
}
0 commit comments