feat+test: make a functional flatline forecaster

dsweber2 · dsweber2 · commit 9d270646ea85 · 2023-10-18T10:20:45.000-07:00
diff --git a/.Rbuildignore b/.Rbuildignore
@@ -1,3 +1,6 @@
 ^renv$
 ^renv\.lock$
 ^LICENSE\.md$
+^.lintr$
+^.renvignore$
+^.github$
diff --git a/NAMESPACE b/NAMESPACE
@@ -9,6 +9,7 @@ export(confirm_insufficient_data)
 export(covidhub_probs)
 export(evaluate_predictions)
 export(extend_ahead)
+export(flatline_fc)
 export(forecaster_pred)
 export(format_storage)
 export(interval_coverage)
@@ -39,6 +40,7 @@ importFrom(purrr,map)
 importFrom(purrr,transpose)
 importFrom(rlang,.data)
 importFrom(rlang,quo)
+importFrom(rlang,sym)
 importFrom(rlang,syms)
 importFrom(tibble,tibble)
 importFrom(tidyr,pivot_wider)
diff --git a/R/forecaster.R b/R/forecaster.R
@@ -123,8 +123,7 @@ arx_postprocess <- function(postproc,
     postproc %<>% layer_threshold(dplyr::starts_with(".pred"))
   }
 
-  postproc %<>% layer_naomit(dplyr::starts_with(".pred"))
-  postproc %<>% layer_add_forecast_date(forecast_date = forecast_date) %>%
+  postproc %<>% layer_naomit(dplyr::starts_with(".pred")) %>%
     layer_add_target_date(target_date = target_date)
   return(postproc)
 }
diff --git a/R/forecaster_flatline.R b/R/forecaster_flatline.R
@@ -0,0 +1,54 @@
+#' flatline forecaster (aka baseline)
+#' @description
+#' a minimal forecaster whose median is just the last value
+#' does not support `lags` as a parameter, but otherwise has the same parameters as `arx_forecaster`
+#' @inheritParams scaled_pop
+#' @importFrom rlang sym
+#' @export
+flatline_fc <- function(epi_data,
+                        outcome,
+                        extra_sources = "",
+                        ahead = 1,
+                        trainer = parsnip::linear_reg(),
+                        levels = covidhub_probs(),
+                        ...) {
+  # perform any preprocessing not supported by epipredict
+  # one that every forecaster will need to handle: how to manage max(time_value)
+  # that's older than the `as_of` date
+  epidataAhead <- extend_ahead(epi_data, ahead)
+  # see latency_adjusting for other examples
+  # this next part is basically unavoidable boilerplate you'll want to copy
+  epi_data <- epidataAhead[[1]]
+  effective_ahead <- epidataAhead[[2]]
+  args_input <- list(...)
+  # edge case where there is no data or less data than the lags; eventually epipredict will handle this
+  if (confirm_insufficient_data(epi_data, effective_ahead, args_input)) {
+    null_result <- tibble(
+      geo_value = character(),
+      forecast_date = Date(),
+      target_end_date = Date(),
+      quantile = numeric(),
+      value = numeric()
+    )
+    return(null_result)
+  }
+  args_input[["ahead"]] <- effective_ahead
+  args_input[["levels"]] <- levels
+  args_list <- do.call(flatline_args_list, args_input)
+  # if you want to ignore extra_sources, setting predictors is the way to do it
+  predictors <- c(outcome, extra_sources)
+  argsPredictorsTrainer <- perform_sanity_checks(epi_data, outcome, predictors, NULL, args_list)
+  args_list <- argsPredictorsTrainer[[1]]
+  predictors <- argsPredictorsTrainer[[2]]
+  # end of the copypasta
+  # finally, any other pre-processing (e.g. smoothing) that isn't performed by
+  # epipredict
+
+  # since this is just the flatline, we don't need much of anything
+  res <- flatline_forecaster(epi_data, outcome = outcome, args_list = args_list)
+  true_forecast_date <- attributes(epi_data)$metadata$as_of
+  pred <- format_storage(res$predictions, true_forecast_date)
+  # (geo_value, forecast_date, target_end_date, quantile, value)
+  # finally, any postprocessing not supported by epipredict e.g. calibration
+  return(pred)
+}
diff --git a/R/forecaster_scaled_pop.R b/R/forecaster_scaled_pop.R
@@ -45,7 +45,7 @@
 scaled_pop <- function(epi_data,
                        outcome,
                        extra_sources = "",
-                       ahead=1,
+                       ahead = 1,
                        pop_scaling = TRUE,
                        trainer = parsnip::linear_reg(),
                        levels = covidhub_probs(),
diff --git a/R/formatters.R b/R/formatters.R
@@ -18,7 +18,7 @@ format_storage <- function(pred, true_forecast_date, target_end_date) {
       .dstn = nested_quantiles(.pred_distn)
     ) %>%
     unnest(.dstn) %>%
-    select(-.pred_distn, -.pred, -time_value) %>%
+    select(-any_of(c(".pred_distn", ".pred", "time_value"))) %>%
     rename(quantile = tau, value = q, target_end_date = target_date) %>%
     relocate(geo_value, forecast_date, target_end_date, quantile, value)
 }
diff --git a/man/flatline_fc.Rd b/man/flatline_fc.Rd
diff --git a/tests/testthat/test-example_spec.R b/tests/testthat/test-example_spec.R
@@ -1,45 +1,55 @@
-test_that("scaled_pop", {
-  library(epipredict)
-  jhu <- case_death_rate_subset %>%
-    dplyr::filter(time_value >= as.Date("2021-12-01"))
-  # the as_of for this is wildly far in the future
-  attributes(jhu)$metadata$as_of <- max(jhu$time_value) + 3
-  expect_warning(res <- scaled_pop(jhu, "case_rate", c("death_rate"), -2L))
-  expect_equal(
-    names(res),
-    c("geo_value", "forecast_date", "target_end_date", "quantile", "value")
-  )
-  expect_true(all(
-    res$target_end_date ==
-      as.Date("2022-01-01")
-  ))
-  # confirm scaling produces different results
-  expect_warning(res_unscaled <- scaled_pop(jhu,
-    "case_rate",
-    c("death_rate"),
-    -2L,
-    pop_scaling = FALSE
-  ))
-  expect_false(res_unscaled %>%
-    full_join(res,
-      by = join_by(geo_value, forecast_date, target_end_date, quantile),
-      suffix = c(".unscaled", ".scaled")
-    ) %>%
-    mutate(equal = value.unscaled == value.scaled) %>%
-    summarize(all(equal)) %>% pull(`all(equal)`))
-  # confirming that it produces exactly the same result as arx_forecaster
-  # test case where extra_sources is "empty"
-  expect_warning(scaled_pop(
-    jhu,
-    "case_rate",
-    c(""),
-    1L
-  ))
-  # test case where the epi_df is empty
-  null_jhu <- jhu %>% filter(time_value < as.Date("0009-01-01"))
-  expect_no_error(null_res <- scaled_pop(null_jhu, "case_rate", c("death_rate")))
-  null_res <- scaled_pop(null_jhu, "case_rate", c("death_rate"))
-  expect_identical(names(null_res), names(res))
-  expect_equal(nrow(null_res), 0)
-  expect_identical(null_res, tibble(geo_value = character(), forecast_date = Date(), target_end_date = Date(), quantile = numeric(), value = numeric()))
-})
+# TODO better way to do this than copypasta
+forecasters <- list(
+  c("scaled_pop", scaled_pop),
+  c("flatline_fc", flatline_fc)
+)
+forecaster <- c("flatline", flatline_fc)
+for (forecaster in forecasters) {
+  test_that(forecaster[[1]], {
+    jhu <- case_death_rate_subset %>%
+      dplyr::filter(time_value >= as.Date("2021-12-01"))
+    # the as_of for this is wildly far in the future
+    attributes(jhu)$metadata$as_of <- max(jhu$time_value) + 3
+    res <- forecaster[[2]](jhu, "case_rate", c("death_rate"), -2L)
+    expect_equal(
+      names(res),
+      c("geo_value", "forecast_date", "target_end_date", "quantile", "value")
+    )
+    expect_true(all(
+      res$target_end_date ==
+        as.Date("2022-01-01")
+    ))
+    # any forecaster specific tests
+    if (forecaster[[1]] == "scaled_pop") {
+      # confirm scaling produces different results
+      res_unscaled <- forecaster[[2]](jhu,
+        "case_rate",
+        c("death_rate"),
+        -2L,
+        pop_scaling = FALSE
+      )
+      expect_false(res_unscaled %>%
+        full_join(res,
+          by = join_by(geo_value, forecast_date, target_end_date, quantile),
+          suffix = c(".unscaled", ".scaled")
+        ) %>%
+        mutate(equal = value.unscaled == value.scaled) %>%
+        summarize(all(equal)) %>% pull(`all(equal)`))
+    }
+    # TODO confirming that it produces exactly the same result as arx_forecaster
+    # test case where extra_sources is "empty"
+    forecaster[[2]](
+      jhu,
+      "case_rate",
+      c(""),
+      1L
+    )
+    # test case where the epi_df is empty
+    null_jhu <- jhu %>% filter(time_value < as.Date("0009-01-01"))
+    expect_no_error(null_res <- forecaster[[2]](null_jhu, "case_rate", c("death_rate")))
+    null_res <- forecaster[[2]](null_jhu, "case_rate", c("death_rate"))
+    expect_identical(names(null_res), names(res))
+    expect_equal(nrow(null_res), 0)
+    expect_identical(null_res, tibble(geo_value = character(), forecast_date = Date(), target_end_date = Date(), quantile = numeric(), value = numeric()))
+  })
+}
diff --git a/tests/testthat/test-forecasters.R b/tests/testthat/test-forecasters.R
@@ -3,6 +3,7 @@ forecasters <- tribble(
   ~forecaster, ~extra_params, ~extra_params_names,
   scaled_pop, list(1, TRUE), list("ahead", "pop_scaling"),
   scaled_pop, list(1, FALSE), list("ahead", "pop_scaling"),
+  flatline_fc, list(1), list("ahead")
 )
 synth_mean <- 25
 synth_sd <- 2
@@ -16,6 +17,7 @@ constant <- as_epi_archive(tibble(
   version = simple_dates,
   a = synth_mean + approx_zero
 ))
+ii <- 3
 # wrap a call that is made quite frequently
 # n_training_pad is set to avoid warnings from the trainer
 get_pred <- function(dataset,
@@ -42,6 +44,7 @@ test_that("constant", {
       a = 4 * synth_mean + approx_zero
     )
   ))
+  different_constants
   for (ii in 1:nrow(forecasters)) {
     res <- get_pred(different_constants, ii)
 
@@ -77,7 +80,9 @@ test_that("white noise", {
     values <- res %>%
       filter(quantile == .5) %>%
       pull(value)
-    expect_true(sd(values) < synth_sd)
+
+    # shouldn't expect the sample sd to actually match the true sd exactly, so giving it some leeway
+    expect_true(sd(values) < 2*synth_sd)
     # how much is each quantile off from the expected value?
     # should be fairly generous here, we just want the right order of magnitude
     quantile_deviation <- res %>%
@@ -106,6 +111,7 @@ test_that("delayed state", {
       a = synth_mean + approx_zero
     )
   ))
+  missing_state$DT %>% filter(geo_value == "ca")
   for (ii in seq_len(nrow(forecasters))) {
     expect_no_error(res <- get_pred(missing_state, ii))
     expect_equal(length(unique(res$geo_value)), 2)
@@ -119,7 +125,15 @@ test_that("delayed state", {
     counts_al <- counts %>%
       filter(geo_value == "al") %>%
       pull(n)
-    expect_true(counts_al > counts_ca)
+    counts_al
+    counts_ca
+    res %>% filter(geo_value == "ca" & quantile == .5)
+    # flatline is more aggressive about forecasting
+    if (identical(forecasters$forecaster[[ii]], flatline_fc)) {
+      expect_true(counts_al == counts_ca)
+    } else {
+      expect_true(counts_al > counts_ca)
+    }
     expect_true(sum(state_delay == 0) > counts_ca)
     expect_true(counts_ca > 0)
   }
@@ -139,12 +153,15 @@ test_that("linear", {
     )
   )
   for (ii in seq_len(nrow(forecasters))) {
+    #flatline will definitely fail this, so it's exempt
+    if (!identical(forecasters$forecaster[[ii]], flatline_fc)) {
     res <- get_pred(linear, ii)
     # make sure that the median is on the sloped line
     median_err <- res %>%
       filter(quantile == .5) %>%
       mutate(err = value - as.integer(target_end_date - start_date + 1), .keep = "none") %>%
       mutate(is_right = near(err,0, tol=tiny_sd ^ 0.5), .keep = "none")
     expect_true(all(median_err))
+    }
   }
 })

Original file line number	Diff line number	Diff line change
`@@ -123,8 +123,7 @@ arx_postprocess <- function(postproc,`
`123`	`123`	`postproc %<>% layer_threshold(dplyr::starts_with(".pred"))`
`124`	`124`	`}`
`125`	`125`
`126`		`- postproc %<>% layer_naomit(dplyr::starts_with(".pred"))`
`127`		`- postproc %<>% layer_add_forecast_date(forecast_date = forecast_date) %>%`
	`126`	`+ postproc %<>% layer_naomit(dplyr::starts_with(".pred")) %>%`
`128`	`127`	`layer_add_target_date(target_date = target_date)`
`129`	`128`	`return(postproc)`
`130`	`129`	`}`
Original file line number	Diff line number	Diff line change
`@@ -18,7 +18,7 @@ format_storage <- function(pred, true_forecast_date, target_end_date) {`
`18`	`18`	`.dstn = nested_quantiles(.pred_distn)`
`19`	`19`	`) %>%`
`20`	`20`	`unnest(.dstn) %>%`
`21`		`- select(-.pred_distn, -.pred, -time_value) %>%`
	`21`	`+ select(-any_of(c(".pred_distn", ".pred", "time_value"))) %>%`
`22`	`22`	`rename(quantile = tau, value = q, target_end_date = target_date) %>%`
`23`	`23`	`relocate(geo_value, forecast_date, target_end_date, quantile, value)`
`24`	`24`	`}`