Commit 9fe4f85

dsweber2 authored and dshemetov committed
test+fix: drop_na in data counting, default as_of
1 parent 275b7af commit 9fe4f85

File tree

3 files changed: +61 −9 lines changed

R/forecaster.R

Lines changed: 6 additions & 5 deletions
@@ -62,13 +62,10 @@ confirm_sufficient_data <- function(epi_data, ahead, args_input, buffer = 15) {
   } else {
     lag_max <- 14 # default value of 2 weeks
   }
-
   return(
     !is.infinite(ahead) &&
       epi_data %>%
-      # TODO: This isn't generalizable to other signals.
-      filter(!is.na(hhs) & !is.na(chng)) %>%
-      # TODO: Quitting forecasting because of one geo_value is bad.
+      drop_na() %>%
       group_by(geo_value) %>%
       summarise(has_enough_data = n_distinct(time_value) >= lag_max + ahead + buffer) %>%
       pull(has_enough_data) %>%
@@ -159,7 +156,11 @@ run_workflow_and_format <- function(preproc, postproc, trainer, epi_data) {
   latest <- get_test_data(recipe = preproc, x = epi_data)
   pred <- predict(workflow, latest)
   # the forecast_date may currently be the max time_value
-  true_forecast_date <- attributes(epi_data)$metadata$as_of
+  as_of <- attributes(epi_data)$metadata$as_of
+  if (is.null(as_of)) {
+    as_of <- max(epi_data$time_value)
+  }
+  true_forecast_date <- as_of
   return(format_storage(pred, true_forecast_date))
 }
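
A minimal sketch (not part of the commit) of the two behavior changes above, run on a hypothetical two-signal tibble; the hhs and chng column names only mirror the old hard-coded filter, and everything here is illustrative:

library(dplyr)
library(tidyr)

# Hypothetical two-signal snippet; real callers pass an epiprocess::epi_df.
epi_data <- tibble(
  geo_value  = c("ca", "ca", "tx"),
  time_value = as.Date("2022-01-01") + c(0, 1, 0),
  hhs        = c(1.2, NA, 3.4),
  chng       = c(0.5, 0.7, NA)
)

# Old, signal-specific row filter: breaks when other signal columns are used.
epi_data %>% filter(!is.na(hhs) & !is.na(chng))

# New, signal-agnostic version: drop any row with an NA in any column.
epi_data %>% drop_na()
# Both keep only the ("ca", 2022-01-01) row for this toy input.

# Default as_of: when the metadata carries no as_of, fall back to the latest
# observed time_value as the forecast date.
as_of <- attributes(epi_data)$metadata$as_of # NULL for a plain tibble
if (is.null(as_of)) {
  as_of <- max(epi_data$time_value)
}
as_of # "2022-01-02"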

R/latency_adjusting.R

Lines changed: 6 additions & 3 deletions
@@ -11,10 +11,13 @@
 extend_ahead <- function(epi_data, ahead) {
   time_values <- epi_data$time_value
   if (length(time_values) > 0) {
+    as_of <- attributes(epi_data)$metadata$as_of
+    max_time <- max(time_values)
+    if (is.null(as_of)) {
+      as_of <- max_time
+    }
     effective_ahead <- as.integer(
-      as.Date(attributes(epi_data)$metadata$as_of) -
-        max(time_values) +
-        ahead
+      as.Date(as_of) - max_time + ahead
     )
   } else {
     effective_ahead <- Inf
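
A worked example (illustrative only) of the effective_ahead arithmetic with the new fallback, assuming data through 2022-01-01 and a forecast made as of 2022-01-04:

time_values <- as.Date("2021-12-28") + 0:4 # last observation is 2022-01-01
as_of <- as.Date("2022-01-04")
ahead <- 2L

max_time <- max(time_values)
as.integer(as.Date(as_of) - max_time + ahead) # 5: the data lags the as_of date by 3 days

# With no as_of metadata the fallback sets as_of <- max_time,
# so the effective ahead reduces to the nominal ahead:
as.integer(as.Date(max_time) - max_time + ahead) # 2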

tests/testthat/test-forecasters-basics.R

Lines changed: 49 additions & 1 deletion
@@ -5,7 +5,7 @@ forecasters <- list(
   c("flatline_fc", flatline_fc)
 )
 for (forecaster in forecasters) {
-  test_that(forecaster[[1]], {
+  test_that(paste(forecaster[[1]], "gets the date and columns right"), {
     jhu <- epipredict::case_death_rate_subset %>%
       dplyr::filter(time_value >= as.Date("2021-12-01"))
     # the as_of for this is wildly far in the future
@@ -19,8 +19,50 @@
       res$target_end_date ==
         as.Date("2022-01-01")
     ))
+  })
+
+  test_that(paste(forecaster[[1]], "deals with no as_of"), {
+    jhu <- epipredict::case_death_rate_subset %>%
+      dplyr::filter(time_value >= as.Date("2021-12-01"))
+    # what if we have no as_of date? assume they mean the last available data
+    attributes(jhu)$metadata$as_of <- NULL
+    expect_no_error(res <- forecaster[[2]](jhu, "case_rate", c("death_rate"), 2L))
+    expect_equal(res$target_end_date %>% unique(), max(jhu$time_value) + 2)
+  })
+
+  test_that(paste(forecaster[[1]], "handles last second NA's"), {
+    # if the last entries are NA, we should still predict
+    # TODO: currently this checks that we DON'T predict
+    jhu <- epipredict::case_death_rate_subset %>%
+      dplyr::filter(time_value >= as.Date("2021-12-01"))
+    geo_values <- jhu$geo_value %>% unique()
+    one_day_nas <- tibble(
+      geo_value = geo_values,
+      time_value = as.Date("2022-01-01"),
+      case_rate = NA,
+      death_rate = runif(length(geo_values))
+    )
+    second_day_nas <- one_day_nas %>%
+      mutate(time_value = as.Date("2022-01-02"))
+    jhu_nad <- jhu %>%
+      as_tibble() %>%
+      bind_rows(one_day_nas, second_day_nas) %>%
+      as_epi_df()
+    attributes(jhu_nad)$metadata$as_of <- max(jhu_nad$time_value) + 3
+    expect_no_error(nas_forecast <- forecaster[[2]](jhu_nad, "case_rate", c("death_rate")))
+    # TODO: this shouldn't actually be null, it should be a bit further delayed
+    expect_equal(nrow(nas_forecast), 0)
+  })
+
+  #################################
   # any forecaster specific tests
   if (forecaster[[1]] == "scaled_pop") {
+    test_that(paste(forecaster[[1]], "scaled and unscaled don't make the same predictions"), {
+      jhu <- epipredict::case_death_rate_subset %>%
+        dplyr::filter(time_value >= as.Date("2021-12-01"))
+      # the as_of for this is wildly far in the future
+      attributes(jhu)$metadata$as_of <- max(jhu$time_value) + 3
+      res <- forecaster[[2]](jhu, "case_rate", c("death_rate"), -2L)
     # confirm scaling produces different results
     res_unscaled <- forecaster[[2]](jhu,
       "case_rate",
@@ -35,10 +77,16 @@
     ) %>%
       mutate(equal = value.unscaled == value.scaled) %>%
       summarize(all(equal)) %>% pull(`all(equal)`))
+    })
   }
   # TODO confirming that it produces exactly the same result as arx_forecaster
   # test case where extra_sources is "empty"
   # test case where the epi_df is empty
+  test_that(paste(forecaster[[1]], "scaled and unscaled don't make the same predictions"), {
+    jhu <- epipredict::case_death_rate_subset %>%
+      dplyr::filter(time_value >= as.Date("2021-12-01"))
+    # the as_of for this is wildly far in the future
+    attributes(jhu)$metadata$as_of <- max(jhu$time_value) + 3
   null_jhu <- jhu %>% filter(time_value < as.Date("0009-01-01"))
   expect_no_error(null_res <- forecaster[[2]](null_jhu, "case_rate", c("death_rate")))
   expect_identical(names(null_res), names(res))
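
To run just this test file locally (assuming the repository follows the standard testthat layout; the filter string below is only an illustration):

# from the package root
testthat::test_file("tests/testthat/test-forecasters-basics.R")

# or, via devtools, filtering on the test file name
devtools::test(filter = "forecasters-basics")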
