consistent name, only smooth non-smoothed, init forecaster

dsweber2 · dsweber2 · commit 2f6106923180 · 2023-12-11T15:27:36.000-08:00
diff --git a/NAMESPACE b/NAMESPACE
@@ -38,7 +38,7 @@ export(scaled_pop)
 export(sharpness)
 export(single_id)
 export(slide_forecaster)
-export(smooth_scaled)
+export(smoothed_scaled)
 export(underprediction)
 export(weighted_interval_score)
 importFrom(assertthat,assert_that)
diff --git a/R/data_transforms.R b/R/data_transforms.R
@@ -1,13 +1,15 @@
 # various reusable transforms to apply before handing to epipredict
 
-#' extract the non-key columns from epi_data
+#' extract the non-key, non-smoothed columns from epi_data
 #' @keywords internal
 #' @param epi_data the epi_data tibble
 #' @param cols vector of column names to use. If `NULL`, fill with all non-key columns
 get_trainable_names <- function(epi_data, cols) {
   if (is.null(cols)) {
     cols <- names(epi_data)
     cols <- cols[!(cols %in% c("geo_value", "time_value", attr(epi_data, "metadata")$other_keys))]
+    # drop rolling mean/sd columns (`_m<width>`, `_SD<width>`): a 1-2 letter tag then digits, at the end
+    cols <- cols[!grepl("_[A-Za-z]{1,2}\\d+$", cols)]
   }
   return(cols)
 }
@@ -27,7 +29,7 @@ rolling_mean <- function(epi_data, width = 7L, cols_to_mean = NULL) {
   cols_to_mean <- get_trainable_names(epi_data, cols_to_mean)
   epi_data %<>% group_by(geo_value)
   for (col in cols_to_mean) {
-    mean_name <- paste0(col, width)
+    mean_name <- paste0(col, "_m", width)
     epi_data %<>% mutate({{ mean_name }} := slider::slide_dbl(.data[[col]], mean, .before = width))
   }
   epi_data %<>% ungroup()
diff --git a/R/forecaster_smoothed_scaled.R b/R/forecaster_smoothed_scaled.R
@@ -0,0 +1,138 @@
+#' predict on smoothed data and the standard deviation
+#' @description
+#' This is a variant of `scaled_pop`, which predicts on a smoothed version of
+#'   the data. Even if the target is smoothed when used as a /predictor/, as a
+#'   /target/ it still uses the raw value (this captures some of the noise).  It
+#'   also uses a rolling standard deviation as an auxiliary signal, with a
+#'   window of width `sd_width`, which by default is 28 days.
+#' @param epi_data the actual data used
+#' @param outcome the name of the target variable
+#' @param extra_sources the name of any extra columns to use. This list could be
+#'   empty
+#' @param ahead (this is relative to the `as_of` field of the `epi_df`, which is
+#'   likely *not* the same as the `ahead` used by epipredict, which is relative
+#'   to the max time value of the `epi_df`. how to handle this is a modelling
+#'   question left up to each forecaster; see latency_adjusting.R for the
+#'   existing examples)
+#' @param pop_scaling whether to population-scale the numeric columns (rates per 100,000) before fitting
+#' @param trainer the model used to fit the forecast; by default `parsnip::linear_reg()`
+#' @param smooth_width the number of days over which to do smoothing. If `NULL`,
+#'   then no smoothing is applied.
+#' @param smooth_cols the names of the columns to smooth. If `NULL` it smooths
+#'   everything
+#' @param sd_width the number of days over which to take a moving average of the
+#'   standard deviation. If `NULL`, the standard deviation isn't included.
+#' @param sd_mean_width to calculate the sd, we need a window size for the mean
+#'   used.
+#' @param sd_cols the names of the columns to take the standard deviation of.
+#'   If `NULL`, it includes the sd of everything
+#' @param quantile_levels The quantile levels to predict. Defaults to those
+#'   required by covidhub.
+#' @seealso some utilities for making forecasters: [format_storage],
+#'   [perform_sanity_checks]
+#' @importFrom epipredict epi_recipe step_population_scaling frosting arx_args_list layer_population_scaling
+#' @importFrom tibble tibble
+#' @importFrom recipes all_numeric
+#' @export
+smoothed_scaled <- function(epi_data,
+                            outcome,
+                            extra_sources = "",
+                            ahead = 1,
+                            pop_scaling = TRUE,
+                            trainer = parsnip::linear_reg(),
+                            quantile_levels = covidhub_probs(),
+                            smooth_width = 7,
+                            smooth_cols = NULL,
+                            sd_width = 28,
+                            sd_mean_width = 14,
+                            sd_cols = NULL,
+                            ...) {
+  # perform any preprocessing not supported by epipredict
+  # this is a temp fix until a real fix gets put into epipredict
+  epi_data <- clear_lastminute_nas(epi_data)
+  # one that every forecaster will need to handle: how to manage max(time_value)
+  # that's older than the `as_of` date
+  epidataAhead <- extend_ahead(epi_data, ahead)
+  # see latency_adjusting for other examples
+  # this next part is basically unavoidable boilerplate you'll want to copy
+  epi_data <- epidataAhead[[1]]
+  effective_ahead <- epidataAhead[[2]]
+  args_input <- list(...)
+  # edge case where there is no data or less data than the lags; eventually epipredict will handle this
+  if (!confirm_sufficient_data(epi_data, effective_ahead, args_input)) {
+    null_result <- tibble(
+      geo_value = character(),
+      forecast_date = lubridate::Date(),
+      target_end_date = lubridate::Date(),
+      quantile = numeric(),
+      value = numeric()
+    )
+    return(null_result)
+  }
+  args_input[["ahead"]] <- effective_ahead
+  args_input[["quantile_levels"]] <- quantile_levels
+  args_list <- do.call(arx_args_list, args_input)
+  # if you want to ignore extra_sources, setting predictors is the way to do it
+  predictors <- c(outcome, extra_sources)
+  # TODO: Partial match quantile_level coming from here (on Dmitry's machine)
+  argsPredictorsTrainer <- perform_sanity_checks(epi_data, outcome, predictors, trainer, args_list)
+  args_list <- argsPredictorsTrainer[[1]]
+  predictors <- argsPredictorsTrainer[[2]]
+  trainer <- argsPredictorsTrainer[[3]]
+  # end of the copypasta
+  # finally, any other pre-processing (e.g. smoothing) that isn't performed by
+  # epipredict
+  # smoothing: reuse rolling_sd's mean only if the sd is computed and the widths match (NULL-safe)
+  keep_mean <- !is.null(sd_width) && isTRUE(smooth_width == sd_mean_width)
+  if (!is.null(smooth_width) && !keep_mean) {
+    epi_data %<>% rolling_mean(
+      width = smooth_width,
+      cols_to_mean = smooth_cols
+    )
+  }
+
+  # measuring standard deviation
+  if (!is.null(sd_width)) {
+    epi_data %<>% rolling_sd(
+      sd_width = sd_width,
+      mean_width = sd_mean_width,
+      cols_to_sd = sd_cols,
+      keep_mean = keep_mean
+    )
+  }
+  # NOTE(review): the new mean/sd columns are not added to `predictors` here; confirm they get used
+
+  # preprocessing supported by epipredict
+  preproc <- epi_recipe(epi_data)
+  if (pop_scaling) {
+    preproc %<>% step_population_scaling(
+      all_numeric(),
+      df = epipredict::state_census,
+      df_pop_col = "pop",
+      create_new = FALSE,
+      rate_rescaling = 1e5,
+      by = c("geo_value" = "abbr")
+    )
+  }
+  preproc %<>% arx_preprocess(outcome, predictors, args_list)
+
+  # postprocessing supported by epipredict
+  postproc <- frosting()
+  postproc %<>% arx_postprocess(trainer, args_list)
+  if (pop_scaling) {
+    postproc %<>% layer_population_scaling(
+      .pred, .pred_distn,
+      df = epipredict::state_census,
+      df_pop_col = "pop",
+      create_new = FALSE,
+      rate_rescaling = 1e5,
+      by = c("geo_value" = "abbr")
+    )
+  }
+  # with all the setup done, we execute and format
+  pred <- run_workflow_and_format(preproc, postproc, trainer, epi_data)
+  # now pred has the columns
+  # (geo_value, forecast_date, target_end_date, quantile, value)
+  # finally, any postprocessing not supported by epipredict e.g. calibration
+  return(pred)
+}
diff --git a/man/get_trainable_names.Rd b/man/get_trainable_names.Rd
diff --git a/man/smoothed_scaled.Rd b/man/smoothed_scaled.Rd
diff --git a/tests/testthat/test-forecasters-basics.R b/tests/testthat/test-forecasters-basics.R
@@ -2,7 +2,8 @@ library(dplyr)
 # TODO better way to do this than copypasta
 forecasters <- list(
   c("scaled_pop", scaled_pop),
-  c("flatline_fc", flatline_fc)
+  c("flatline_fc", flatline_fc),
+  c("smoothed_scaled", smoothed_scaled)
 )
 for (forecaster in forecasters) {
   test_that(paste(forecaster[[1]], "gets the date and columns right"), {
diff --git a/tests/testthat/test-transforms.R b/tests/testthat/test-transforms.R
@@ -2,36 +2,41 @@ n_days <- 40
 simple_dates <- seq(as.Date("2012-01-01"), by = "day", length.out = n_days)
 rand_vals <- rnorm(n_days)
 epi_data <- epiprocess::as_epi_df(rbind(tibble(
-geo_value = "al",
-time_value = simple_dates,
-a = 1:n_days,
-b = rand_vals
+  geo_value = "al",
+  time_value = simple_dates,
+  a = 1:n_days,
+  b = rand_vals
 ), tibble(
-geo_value = "ca",
-time_value = simple_dates,
-a = n_days:1,
-b = rand_vals + 10
+  geo_value = "ca",
+  time_value = simple_dates,
+  a = n_days:1,
+  b = rand_vals + 10
 )))
 test_that("rolling_mean generates correct mean", {
   rolled <- rolling_mean(epi_data)
-  expect_equal(names(rolled), c("geo_value", "time_value", "a", "b", "a7", "b7"))
+  expect_equal(names(rolled), c("geo_value", "time_value", "a", "b", "a_m7", "b_m7"))
   # hand specified rolling mean with a rear window of 7, noting that mean(1:7) = 4
-  linear_roll_mean <- c(seq(from=1, to = 4, by = .5), seq(from = 4.5, to = 36.5, by = 1))
-  expect_equal(rolled %>% filter(geo_value == "al") %>% pull("a7"), linear_roll_mean)
+  linear_roll_mean <- c(seq(from = 1, to = 4, by = .5), seq(from = 4.5, to = 36.5, by = 1))
+  expect_equal(rolled %>% filter(geo_value == "al") %>% pull("a_m7"), linear_roll_mean)
   # same, but "ca" is reversed, noting mean(40:(40-7)) =36.5
-  linear_reverse_roll_mean <- c(seq(from=40, to = 36.5, by = -0.5), seq(from = 35.5, to = 4.5, by = -1))
-  expect_equal(rolled %>% filter(geo_value == "ca") %>% pull("a7"), linear_reverse_roll_mean)
+  linear_reverse_roll_mean <- c(seq(from = 40, to = 36.5, by = -0.5), seq(from = 35.5, to = 4.5, by = -1))
+  expect_equal(rolled %>% filter(geo_value == "ca") %>% pull("a_m7"), linear_reverse_roll_mean)
 })
 
 test_that("rolling_sd generates correct standard deviation", {
   rolled <- rolling_sd(epi_data)
   expect_equal(names(rolled), c("geo_value", "time_value", "a", "b", "a_SD28", "b_SD28"))
   # hand specified rolling mean with a rear window of 14, noting that mean(1:14) = 7.5
-  linear_roll_mean <- c(seq(from=1, to = 7.5, by = .5), seq(from = 8, to = 33, by = 1))
+  linear_roll_mean <- c(seq(from = 1, to = 7.5, by = .5), seq(from = 8, to = 33, by = 1))
   # and the standard deviation is
   linear_roll_sd <- sqrt(slider::slide_dbl((1:40 - linear_roll_mean)^2, mean, .before = 28))
   expect_equal(rolled %>% filter(geo_value == "al") %>% pull("a_SD28"), linear_roll_sd)
   # even though ca is reversed, the changes are all the same, so the standard deviation is *exactly* the same values
   expect_equal(rolled %>% filter(geo_value == "ca") %>% pull("a_SD28"), linear_roll_sd)
-  })
+})
+test_that("get_trainable_names pulls out mean and sd columns", {
+  rolled <- rolling_sd(epi_data, keep_mean = TRUE)
+  expect_equal(names(rolled), c("geo_value", "time_value", "a", "b", "a_m14", "a_SD28", "b_m14", "b_SD28"))
+  expect_equal(get_trainable_names(rolled, NULL), c("a", "b"))
+})
 # TODO example with NA's, example with missing days, only one column, keep_mean