Apply various suggestions from Logan; rolling windows now use before = n_points - 1

dsweber2 · dsweber2 · commit d542be27e214 · 2023-12-22T12:08:35.000-08:00
diff --git a/NAMESPACE b/NAMESPACE
@@ -113,3 +113,4 @@ importFrom(tidyr,drop_na)
 importFrom(tidyr,expand_grid)
 importFrom(tidyr,pivot_wider)
 importFrom(tidyr,unnest)
+importFrom(zeallot,"%<-%")
diff --git a/R/data_transforms.R b/R/data_transforms.R
@@ -71,7 +71,7 @@ rolling_mean <- function(epi_data, width = 7L, cols_to_mean = NULL) {
   epi_data %<>% group_by(geo_value)
   for (col in cols_to_mean) {
     mean_name <- paste0(col, "_m", width)
-    epi_data %<>% epi_slide(~ mean(.x[[col]]), before = width-1L, new_col_name = mean_name)
+    epi_data %<>% epi_slide(~ mean(.x[[col]], rm.na = TRUE), before = width-1L, new_col_name = mean_name)
   }
   epi_data %<>% ungroup()
   return(epi_data)
@@ -102,8 +102,8 @@ rolling_sd <- function(epi_data, sd_width = 28L, mean_width = NULL, cols_to_sd =
     result %<>% group_by(geo_value)
     mean_name <- paste0(col, "_m", mean_width)
     sd_name <- paste0(col, "_sd", sd_width)
-    result %<>% epi_slide(~ mean(.x[[col]]), before = mean_width-1L, new_col_name = mean_name)
-    result %<>% epi_slide(~ sqrt(mean((.x[[mean_name]] - .x[[col]])^2)), before = sd_width-1, new_col_name = sd_name)
+    result %<>% epi_slide(~ mean(.x[[col]], na.rm = TRUE), before = mean_width-1L, new_col_name = mean_name)
+    result %<>% epi_slide(~ sqrt(mean((.x[[mean_name]] - .x[[col]])^2, na.rm = TRUE)), before = sd_width-1, new_col_name = sd_name)
     if (!keep_mean) {
       # TODO make sure the extra info sticks around
       result %<>% select(-{{ mean_name }})
diff --git a/R/forecaster_scaled_pop.R b/R/forecaster_scaled_pop.R
@@ -73,7 +73,7 @@ scaled_pop <- function(epi_data,
   args_input[["ahead"]] <- effective_ahead
   args_input[["quantile_levels"]] <- quantile_levels
   args_list <- do.call(arx_args_list, args_input)
-  # if you want to ignore extra_sources, setting predictors is the way to do it
+  # if you want to hardcode particular predictors in a particular forecaster
   predictors <- c(outcome, extra_sources)
   # TODO: Partial match quantile_level coming from here (on Dmitry's machine)
   argsPredictorsTrainer <- perform_sanity_checks(epi_data, outcome, predictors, trainer, args_list)
diff --git a/R/forecaster_smoothed_scaled.R b/R/forecaster_smoothed_scaled.R
@@ -14,8 +14,17 @@
 #'   to the max time value of the `epi_df`. how to handle this is a modelling
 #'   question left up to each forecaster; see latency_adjusting.R for the
 #'   existing examples)
-#' @param pop_scaling an example extra parameter unique to this forecaster
-#' @param trainer an example extra parameter that is fairly common
+#' @param pop_scaling bool; if `TRUE`, assume all numeric columns are on the
+#'   count scale and translate them to a rate scale for model fitting.
+#'   Predictions will be translated back to count scale. Any
+#'   `layer_residual_quantiles` (for non-`"quantile_reg"` `trainer`s) will be
+#'   done on the rate scale. When specifying predictor lags, note that rate
+#'   variables will use the same names as and overwrite the count variables.
+#'   Rates here will be counts per 100k population, based on
+#'   `epipredict::state_census`.
+#' @param trainer optional; parsnip model specification to use for the core
+#'   fitting & prediction (the `spec` of the internal
+#'   [`epipredict::epi_workflow`]).  Default is `parsnip::linear_reg()`.
 #' @param smooth_width the number of days over which to do smoothing. If `NULL`,
 #'   then no smoothing is applied.
 #' @param smooth_cols the names of the columns to smooth. If `NULL` it smooths
@@ -34,57 +43,52 @@
 #' @importFrom epipredict epi_recipe step_population_scaling frosting arx_args_list layer_population_scaling
 #' @importFrom tibble tibble
 #' @importFrom recipes all_numeric
+#' @importFrom zeallot %<-%
 #' @export
 smoothed_scaled <- function(epi_data,
-                          outcome,
-                          extra_sources = "",
-                          ahead = 1,
-                          pop_scaling = TRUE,
-                          trainer = parsnip::linear_reg(),
-                          quantile_levels = covidhub_probs(),
-                          smooth_width = 7,
-                          smooth_cols = NULL,
-                          sd_width = 28,
-                          sd_mean_width = 14,
-                          sd_cols = NULL,
-                          ...) {
+                            outcome,
+                            extra_sources = "",
+                            ahead = 1,
+                            pop_scaling = TRUE,
+                            trainer = parsnip::linear_reg(),
+                            quantile_levels = covidhub_probs(),
+                            smooth_width = 7,
+                            smooth_cols = NULL,
+                            sd_width = 28,
+                            sd_mean_width = 14,
+                            sd_cols = NULL,
+                            ...) {
   # perform any preprocessing not supported by epipredict
   # this is a temp fix until a real fix gets put into epipredict
   epi_data <- clear_lastminute_nas(epi_data)
   # one that every forecaster will need to handle: how to manage max(time_value)
   # that's older than the `as_of` date
-  epidataAhead <- extend_ahead(epi_data, ahead)
+  c(epi_data, effective_ahead) %<-% extend_ahead(epi_data, ahead)
   # see latency_adjusting for other examples
-  # this next part is basically unavoidable boilerplate you'll want to copy
-  epi_data <- epidataAhead[[1]]
-  effective_ahead <- epidataAhead[[2]]
   args_input <- list(...)
   # edge case where there is no data or less data than the lags; eventually epipredict will handle this
   if (!confirm_sufficient_data(epi_data, effective_ahead, args_input)) {
-    null_result <- tibble(
-      geo_value = character(),
-      forecast_date = lubridate::Date(),
-      target_end_date = lubridate::Date(),
-      quantile = numeric(),
-      value = numeric()
-    )
+    null_result <- epi_data[0L, c("geo_value", attr(epi_data, "metadata", exact = TRUE)[["other_keys"]])] %>%
+      mutate(
+        forecast_date = epi_data$time_value[0],
+        target_end_date = epi_data$time_value[0],
+        quantile = numeric(),
+        value = numeric()
+      )
     return(null_result)
   }
   args_input[["ahead"]] <- effective_ahead
   args_input[["quantile_levels"]] <- quantile_levels
   args_list <- do.call(arx_args_list, args_input)
-  # if you want to ignore extra_sources, setting predictors is the way to do it
+  # `extra_sources` sets which variables beyond the outcome are lagged and used as predictors
+  # any which are modified by `rolling_mean` or `rolling_sd` have their original values dropped later
   predictors <- c(outcome, extra_sources)
-  # TODO: Partial match quantile_level coming from here (on Dmitry's machine)
-  argsPredictorsTrainer <- perform_sanity_checks(epi_data, outcome, predictors, trainer, args_list)
-  args_list <- argsPredictorsTrainer[[1]]
-  predictors <- argsPredictorsTrainer[[2]]
-  trainer <- argsPredictorsTrainer[[3]]
   # end of the copypasta
   # finally, any other pre-processing (e.g. smoothing) that isn't performed by
   # epipredict
   # smoothing
-  keep_mean <- (smooth_width == sd_mean_width) # do we need to do the mean separately?
+  keep_mean <- !is.null(smooth_width) && !is.null(sd_mean_width) &&
+    smooth_width == sd_mean_width # do we (not) need to do the mean separately?
   if (!is.null(smooth_width) && !keep_mean) {
     epi_data %<>% rolling_mean(
       width = smooth_width,
@@ -101,8 +105,10 @@ smoothed_scaled <- function(epi_data,
       keep_mean = keep_mean
     )
   }
-  # and need to make sure we exclude the original varialbes as predictors
+  # and need to make sure we exclude the original variables as predictors
   predictors <- update_predictors(epi_data, c(smooth_cols, sd_cols), predictors)
+  # TODO: Partial match quantile_level coming from here (on Dmitry's machine)
+  c(args_list, predictors, trainer) %<-% perform_sanity_checks(epi_data, outcome, predictors, trainer, args_list)
   # preprocessing supported by epipredict
   preproc <- epi_recipe(epi_data)
   if (pop_scaling) {
diff --git a/R/targets_utils.R b/R/targets_utils.R
@@ -133,7 +133,7 @@ make_shared_grids <- function() {
       forecaster = "scaled_pop",
       trainer = c("linreg", "quantreg"),
       ahead = c(1:7, 14, 21, 28),
-      lags = list(c(0, 3, 5, 7, 14), c(0, 7, 14)),
+      lags = list(c(0, 3, 5, 7, 14), c(0, 7, 14), c(0,7,14,24)),
       pop_scaling = c(FALSE)
     ),
     tidyr::expand_grid(
@@ -144,7 +144,7 @@ make_shared_grids <- function() {
       forecaster = "smoothed_scaled",
       trainer = c("quantreg"),
       ahead = c(1:7, 14, 21, 28),
-      lags = list(list(c(0, 3, 5, 7, 14), c(0),), c(0, 7, 14)),
+      lags = list(list(c(0, 3, 5, 7, 14), c(0),c(0, 3, 5, 7, 14), c(0),), c(0, 7, 14), c(0,2,4,7,14,21,28)),
       pop_scaling = c(FALSE)
     )
   )
diff --git a/man/get_trainable_names.Rd b/man/get_trainable_names.Rd
diff --git a/man/smoothed_scaled.Rd b/man/smoothed_scaled.Rd
diff --git a/man/update_predictors.Rd b/man/update_predictors.Rd
diff --git a/tests/testthat/test-forecasters-basics.R b/tests/testthat/test-forecasters-basics.R
@@ -5,7 +5,6 @@ forecasters <- list(
   c("flatline_fc", flatline_fc),
   c("smoothed_scaled", smoothed_scaled)
 )
-forecaster <- forecasters[[3]]
 for (forecaster in forecasters) {
   test_that(paste(forecaster[[1]], "gets the date and columns right"), {
     jhu <- epipredict::case_death_rate_subset %>%
diff --git a/tests/testthat/test-transforms.R b/tests/testthat/test-transforms.R
@@ -52,14 +52,19 @@ test_that("rolling_sd generates correct standard deviation", {
   rolled <- rolling_sd(epi_data, keep_mean = TRUE)
   expect_equal(names(rolled), c("geo_value", "time_value", "a", "b", "a_m14", "a_sd28", "b_m14", "b_sd28"))
   # hand specified rolling mean with a rear window of 7, noting that mean(1:14) = 7.5
-  linear_roll_mean <- c(seq(from = 1, to = 7.5, by = .5), seq(from = 8.5, to = 16.5, by = 1), seq(from = 17, to = 32, by = 1))
-  linear_roll_mean
+  linear_roll_mean <- c(seq(from = 1, to = 7, by = .5), seq(from = 8, to = 16, by = 1), seq(from = 16.5, to = 32.5, by = 1))
+##   linear_roll_mean <- c(seq(from = 1, by = .5, length.out = 14), seq(from = 8.5, to = 32.5, by = 1))
+##   gap_starts <- epi_data %>% filter(geo_value == "al" & time_value == as.Date("2012-01-11")) %>% pull(a)
+##   unusual_days <- map_vec(seq(from = 0, to = 5), \(d) mean(((gap_starts + d) - 0):max((gap_starts + d) - 14, 1)))
+##   map(seq(from = 0, to = 5), \(d) mean(((gap_starts + d) - 0):max((gap_starts + d) - 13, 1)))
+##   linear_roll_mean
+## rolled %>% filter(geo_value == "al") %>% pull("a_m14")
   expect_equal(rolled %>% filter(geo_value == "al") %>% pull("a_m14"), linear_roll_mean)
   # and the standard deviation is
   linear_roll_mean <- append(linear_roll_mean, NA, after = removed_date - 1)
   linear_values <- 1:39
   linear_values <- append(linear_values, NA, after = removed_date - 1)
-  linear_roll_sd <- sqrt(slider::slide_dbl((linear_values - linear_roll_mean)^2, \(x) mean(x, na.rm = TRUE), .before = 28))
+  linear_roll_sd <- sqrt(slider::slide_dbl((linear_values - linear_roll_mean)^2, \(x) mean(x, na.rm = TRUE), .before = 28 - 1))
   # drop the extra date caused by the inclusion of the NAs
   linear_roll_sd <- linear_roll_sd[-(removed_date)]
   expect_equal(rolled %>% filter(geo_value == "al") %>% pull("a_sd28"), linear_roll_sd)

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
|  |  | `@@ -5,7 +5,6 @@ forecasters <- list(` |
| `5` | `5` | `c("flatline_fc", flatline_fc),` |
| `6` | `6` | `c("smoothed_scaled", smoothed_scaled)` |
| `7` | `7` | `)` |
| `8` |  | `-forecaster <- forecasters[[3]]` |
| `9` | `8` | `for (forecaster in forecasters) {` |
| `10` | `9` | `test_that(paste(forecaster[[1]], "gets the date and columns right"), {` |
| `11` | `10` | `jhu <- epipredict::case_death_rate_subset %>%` |