cmu-delphi · dajmcdon · Sep 15, 2022 · Sep 14, 2022 · Sep 14, 2022 · Sep 14, 2022
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -21,13 +21,13 @@ URL: https://github.com/cmu-delphi/epipredict/,
     https://cmu-delphi.github.io/epipredict
 BugReports: https://github.com/cmu-delphi/epipredict/issues/
 Depends: 
-    R (>= 3.5.0)
+    R (>= 3.5.0),
+    epiprocess
 Imports: 
     assertthat,
     cli,
     distributional,
     dplyr,
-    epiprocess,
     fs,
     generics,
     glue,

diff --git a/NAMESPACE b/NAMESPACE
@@ -137,6 +137,5 @@ importFrom(stats,predict)
 importFrom(stats,qnorm)
 importFrom(stats,quantile)
 importFrom(stats,residuals)
-importFrom(stats,setNames)
 importFrom(tibble,is_tibble)
 importFrom(tibble,tibble)
diff --git a/R/arx_forecaster.R b/R/arx_forecaster.R
@@ -27,10 +27,10 @@
 #' out <- arx_forecaster(jhu, "death_rate",
 #'   c("case_rate", "death_rate"))
 arx_forecaster <- function(epi_data,
-                               outcome,
-                               predictors,
-                               trainer = parsnip::linear_reg(),
-                               args_list = arx_args_list()) {
+                           outcome,
+                           predictors,
+                           trainer = parsnip::linear_reg(),
+                           args_list = arx_args_list()) {
 
   validate_forecaster_inputs(epi_data, outcome, predictors)
   if (!is.list(trainer) || trainer$mode != "regression")
@@ -60,7 +60,7 @@ arx_forecaster <- function(epi_data,
     layer_add_target_date(target_date = target_date)
   if (args_list$nonneg) f <- layer_threshold(f, dplyr::starts_with(".pred"))
 
-  latest <- get_test_data(r, epi_data)
+  latest <- get_test_data(r, epi_data, TRUE)
 
   wf <- epi_workflow(r, trainer, f) %>% generics::fit(epi_data)
   list(

diff --git a/R/get_test_data.R b/R/get_test_data.R
@@ -6,12 +6,25 @@
 #' and other variables in the original dataset,
 #' which will be used to create test data.
 #'
+#' It also optionally fills missing values
+#' using the last-observation-carried-forward (LOCF) method. If this
+#' is not possible (say because there would be only `NA`'s in some location),
+#' it will produce an error suggesting alternative options to handle missing
+#' values with more advanced techniques.
+#'
 #' @param recipe A recipe object. The step will be added to the
 #'  sequence of operations for this recipe.
 #' @param x A data frame, tibble, or epi_df data set.
+#' @param fill_locf Logical. Should we use `locf` to fill in missing data?
+#' @param n_recent Integer or NULL. If filling missing data with `locf=TRUE`,
+#'   how far back are we willing to tolerate missing data? Larger values allow
+#'   more filling. The default `NULL` will determine this from the maximum
+#'   lags used in the `recipe`. For example, suppose n_recent = 3, then if the
+#'   3 most recent observations in some region are all `NA`’s, we won’t be able
+#'   to fill anything, and an error message will be thrown.
 #'
-#' @return A tibble with columns `geo_value`, `time_value`
-#' and other variables in the original dataset.
+#' @return A tibble with columns `geo_value`, `time_value`, any additional
+#'   keys, as well other variables in the original dataset.
 #' @examples
 #' # create recipe
 #'  rec <- epi_recipe(case_death_rate_subset) %>%
@@ -21,11 +34,17 @@
 #'  get_test_data(recipe = rec, x = case_death_rate_subset)
 #' @export
 
-get_test_data <- function(recipe, x) {
+get_test_data <- function(recipe, x, fill_locf = FALSE, n_recent = NULL) {
   stopifnot(is.data.frame(x))
-  if (! all(colnames(x) %in% colnames(recipe$template)))
+  arg_is_lgl(fill_locf)
+  arg_is_scalar(fill_locf)
+  arg_is_pos_int(n_recent, allow_null = TRUE)
+  arg_is_scalar(n_recent, allow_null = TRUE)
+
+  if (!all(colnames(x) %in% colnames(recipe$template)))
     cli_stop("some variables used for training are not available in `x`.")
   max_lags <- max(map_dbl(recipe$steps, ~ max(.x$lag %||% 0)), 0)
+  if (is.null(n_recent)) n_recent <- max_lags + 1
 
   # CHECK: Return NA if insufficient training data
   if (dplyr::n_distinct(x$time_value) < max_lags) {
@@ -36,15 +55,29 @@ get_test_data <- function(recipe, x) {
   groups <- epi_keys(recipe)
   groups <- groups[groups != "time_value"]
 
-  x %>%
-    dplyr::filter(
-      dplyr::if_any(
-        .cols = recipe$term_info$variable[which(recipe$var_info$role == 'raw')],
-        .fns = ~ !is.na(.x)
-      )
-    ) %>%
+  x <- x %>%
     epiprocess::group_by(dplyr::across(dplyr::all_of(groups))) %>%
+    dplyr::slice_tail(n = max(n_recent, max_lags + 1))
+
+  if (fill_locf) {
+    cannot_be_used <- x %>%
+      dplyr::slice_tail(n = n_recent) %>%
+      dplyr::summarize(dplyr::across(
+        !time_value, ~ !is.na(.x[1])), .groups = "drop") %>%
+      dplyr::summarise(dplyr::across(-dplyr::all_of(groups), ~ any(!.x))) %>%
+      unlist()
+    if (any(cannot_be_used)) {
+      bad_vars <- names(cannot_be_used)[cannot_be_used]
+      cli_stop("The variables {bad_vars} have ",
+               "too many recent missing values to be filled automatically. ",
+               "You should either choose `n_recent` larger than its current ",
+               "value {n_recent}, or perform NA imputation manually, perhaps with ",
+               "{.code recipes::step_impute_*()} or with {.code tidyr::fill()}.")
+    }
+    x <- x %>% tidyr::fill(!time_value)
+  }
+
+  x %>%
     dplyr::slice_tail(n = max_lags + 1) %>%
     epiprocess::ungroup()
-
 }
diff --git a/R/layer_residual_quantiles.R b/R/layer_residual_quantiles.R
@@ -5,7 +5,7 @@
 #' @param probs numeric vector of probabilities with values in (0,1)
 #'   referring to the desired quantile.
 #' @param symmetrize logical. If `TRUE` then interval will be symmetric.
-#' @param by_key A character vector of keys to group the residuls by before
+#' @param by_key A character vector of keys to group the residuals by before
 #'   calculating quantiles. The default, `c()` performs no grouping.
 #' @param name character. The name for the output column.
 #' @param .flag a logical to determine if the layer is added. Passed on to
@@ -45,7 +45,7 @@
 #'
 #' p2 <- predict(wf2, latest)
 layer_residual_quantiles <- function(frosting, ...,
-                                     probs = c(0.0275, 0.975),
+                                     probs = c(0.05, 0.95),
                                      symmetrize = TRUE,
                                      by_key = character(0L),
                                      name = ".pred_distn",

diff --git a/R/utils_arg.R b/R/utils_arg.R
@@ -59,7 +59,7 @@ arg_is_pos_int = function(..., allow_null = FALSE) {
     ...,
     tests = function(name, value) {
       if (!((is.numeric(value) && all(value > 0) && all(value%%1 == 0)) |
-            (is.null(value) & !allow_null)))
+            (is.null(value) & allow_null)))
         cli_stop("All {.val {name}} must be whole positive number(s).")
     }
   )

diff --git a/man/bake.Rd b/man/bake.Rd
diff --git a/man/epi_juice.Rd b/man/epi_juice.Rd
diff --git a/man/epi_workflow.Rd b/man/epi_workflow.Rd
diff --git a/man/flatline.Rd b/man/flatline.Rd
diff --git a/man/get_test_data.Rd b/man/get_test_data.Rd
diff --git a/man/layer_residual_quantiles.Rd b/man/layer_residual_quantiles.Rd
diff --git a/tests/testthat/test-get_test_data.R b/tests/testthat/test-get_test_data.R
@@ -10,7 +10,7 @@ test_that("return expected number of rows and returned dataset is ungrouped", {
   test <- get_test_data(recipe = r, x = case_death_rate_subset)
 
   expect_equal(nrow(test),
-               dplyr::n_distinct(case_death_rate_subset$geo_value)* 29)
+               dplyr::n_distinct(case_death_rate_subset$geo_value) * 29)
 
   expect_false(dplyr::is.grouped_df(test))
 })
@@ -39,3 +39,41 @@ test_that("expect error that geo_value or time_value does not exist", {
   expect_error(get_test_data(recipe = r, x = wrong_epi_df))
 })
 
+
+test_that("NA fill behaves as desired", {
+ df <- tibble::tibble(
+   geo_value = rep(c("ca", "ny"), each = 10),
+   time_value = rep(1:10, times = 2),
+   x1 = rnorm(20),
+   x2 = rnorm(20)) %>%
+   epiprocess::as_epi_df()
+
+ r <- epi_recipe(df) %>%
+   step_epi_ahead(x1, ahead = 3) %>%
+   step_epi_lag(x1, x2, lag = c(1,3)) %>%
+   step_epi_naomit()
+
+ expect_silent(tt <- get_test_data(r, df))
+ expect_s3_class(tt, "epi_df")
+
+ expect_error(get_test_data(r, df, "A"))
+ expect_error(get_test_data(r, df, TRUE, -3))
+
+ df2 <- df
+ df2$x1[df2$geo_value == "ca"] <- NA
+
+ td <- get_test_data(r, df2)
+ expect_true(any(is.na(td)))
+ expect_error(get_test_data(r, df2, TRUE))
+
+ df1 <- df2
+ df1$x1[1:4] <- 1:4
+ td1 <- get_test_data(r, df1, TRUE, n_recent = 7)
+ expect_true(!any(is.na(td1)))
+
+ df2$x1[7:8] <- 1:2
+ td2 <- get_test_data(r, df2, TRUE)
+ expect_true(!any(is.na(td2)))
+
+
+})