fix drop na subtleties, unify prep/bake usage

dsweber2 · dsweber2 · commit c5b25681b322 · 2025-03-25T11:45:16.000-05:00
Several simultaneous problems made this tricky:
1. drop_na can completely remove states
2. checking each column individually misses cases where combinations of
   the states cause the signal to be left out.
3. checking all columns simultaneously doesn't let the user know which
   columns to check.
diff --git a/R/check_enough_data.R b/R/check_enough_data.R
@@ -94,22 +94,8 @@ prep.check_enough_data <- function(x, training, info = NULL, ...) {
     x$n <- length(col_names)
   }
 
-  if (x$drop_na) {
-    training <- tidyr::drop_na(training, any_of(unname(col_names)))
-  }
-  cols_not_enough_data <- training %>%
-    group_by(across(all_of(.env$x$epi_keys))) %>%
-    summarise(across(all_of(.env$col_names), ~ dplyr::n() < .env$x$n), .groups = "drop") %>%
-    summarise(across(all_of(.env$col_names), any), .groups = "drop") %>%
-    unlist() %>%
-    names(.)[.]
+  check_enough_data_core(training, x, col_names, "train")
 
-  if (length(cols_not_enough_data) > 0) {
-    cli_abort(
-      "The following columns don't have enough data to predict: {cols_not_enough_data}.",
-      class = "epipredict__not_enough_data"
-    )
-  }
 
   check_enough_data_new(
     n = x$n,
@@ -127,24 +113,7 @@ prep.check_enough_data <- function(x, training, info = NULL, ...) {
 #' @export
 bake.check_enough_data <- function(object, new_data, ...) {
   col_names <- object$columns
-  if (object$drop_na) {
-    non_na_data <- tidyr::drop_na(new_data, any_of(unname(col_names)))
-  } else {
-    non_na_data <- new_data
-  }
-  cols_not_enough_data <- non_na_data %>%
-    group_by(across(all_of(.env$object$epi_keys))) %>%
-    summarise(across(all_of(.env$col_names), ~ dplyr::n() < .env$object$n), .groups = "drop") %>%
-    summarise(across(all_of(.env$col_names), any), .groups = "drop") %>%
-    unlist() %>%
-    names(.)[.]
-
-  if (length(cols_not_enough_data) > 0) {
-    cli_abort(
-      "The following columns don't have enough data to predict: {cols_not_enough_data}.",
-      class = "epipredict__not_enough_data"
-    )
-  }
+  check_enough_data_core(new_data, object, col_names, "predict")
   new_data
 }
 
@@ -168,3 +137,59 @@ tidy.check_enough_data <- function(x, ...) {
   res$drop_na <- x$drop_na
   res
 }
+
+# Abort with class "epipredict__not_enough_data" when any `epi_keys` group of
+# `epi_df` has fewer than `step_obj$n` usable rows for `col_names`;
+# `train_or_predict` is only interpolated into the error message.
+check_enough_data_core <- function(epi_df, step_obj, col_names, train_or_predict) {
+  epi_df <- epi_df %>%
+    group_by(across(all_of(.env$step_obj$epi_keys)))
+  if (step_obj$drop_na) {
+    any_missing_data <- epi_df %>%
+      mutate(any_are_na = rowSums(across(any_of(.env$col_names), ~ is.na(.x))) > 0) %>%
+      # count the number of rows where they're all not na
+      summarise(sum(any_are_na == 0) < .env$step_obj$n, .groups = "drop")
+    any_missing_data <- any_missing_data %>%
+      summarize(across(all_of(setdiff(names(any_missing_data), step_obj$epi_keys)), any)) %>%
+      any()
+
+    # figuring out which individual columns (if any) are to blame for this
+    # dearth of data
+    cols_not_enough_data <- epi_df %>%
+      summarise(
+        across(all_of(.env$col_names), ~ sum(!is.na(.x)) < .env$step_obj$n),
+        .groups = "drop"
+      ) %>%
+      summarise(across(all_of(.env$col_names), any), .groups = "drop") %>%
+      unlist() %>%
+      names(.)[.]
+
+    if (length(cols_not_enough_data) == 0) {
+      cols_not_enough_data <-
+        glue::glue("no single column, but the combination of {paste0(col_names, collapse = ', ')}")
+    }
+  } else {
+    # if we're not dropping na values, just count the rows in each group
+    cols_not_enough_data <- epi_df %>%
+      summarise(
+        across(all_of(.env$col_names), ~ dplyr::n() < .env$step_obj$n),
+        # ungroup so key columns don't leak into the reductions below
+        .groups = "drop"
+      )
+    # one group short on rows is enough to fail, matching the drop_na branch
+    any_missing_data <- cols_not_enough_data %>%
+      summarize(across(all_of(.env$col_names), any)) %>%
+      any()
+    cols_not_enough_data <- cols_not_enough_data %>%
+      summarise(across(all_of(.env$col_names), any), .groups = "drop") %>%
+      unlist() %>%
+      names(.)[.]
+  }
+
+  if (any_missing_data) {
+    cli_abort(
+      "The following columns don't have enough data to {train_or_predict}: {cols_not_enough_data}.",
+      class = "epipredict__not_enough_data"
+    )
+  }
+}
diff --git a/tests/testthat/_snaps/check_enough_data.md b/tests/testthat/_snaps/check_enough_data.md
@@ -2,44 +2,53 @@
 
     Code
       epi_recipe(toy_epi_df) %>% check_enough_data(x, y, n = 2 * n + 1, drop_na = FALSE) %>%
-        prep(toy_epi_df) %>% bake(new_data = NULL)
+        prep(toy_epi_df)
     Condition
-      Error in `prep()`:
-      ! The following columns don't have enough data to predict: x and y.
+      Error in `check_enough_data_core()`:
+      ! The following columns don't have enough data to train: x and y.
 
 ---
 
     Code
       epi_recipe(toy_epi_df) %>% check_enough_data(x, y, n = 2 * n - 1, drop_na = TRUE) %>%
-        prep(toy_epi_df) %>% bake(new_data = NULL)
+        prep(toy_epi_df)
     Condition
-      Error in `prep()`:
-      ! The following columns don't have enough data to predict: x and y.
+      Error in `check_enough_data_core()`:
+      ! The following columns don't have enough data to train: x.
 
 # check_enough_data works on unpooled data
 
     Code
       epi_recipe(toy_epi_df) %>% check_enough_data(x, y, n = n + 1, epi_keys = "geo_value",
-      drop_na = FALSE) %>% prep(toy_epi_df) %>% bake(new_data = NULL)
+      drop_na = FALSE) %>% prep(toy_epi_df)
     Condition
-      Error in `prep()`:
-      ! The following columns don't have enough data to predict: x and y.
+      Error in `check_enough_data_core()`:
+      ! The following columns don't have enough data to train: x and y.
 
 ---
 
     Code
       epi_recipe(toy_epi_df) %>% check_enough_data(x, y, n = 2 * n - 3, epi_keys = "geo_value",
-      drop_na = TRUE) %>% prep(toy_epi_df) %>% bake(new_data = NULL)
+      drop_na = TRUE) %>% prep(toy_epi_df)
     Condition
-      Error in `prep()`:
-      ! The following columns don't have enough data to predict: x and y.
+      Error in `check_enough_data_core()`:
+      ! The following columns don't have enough data to train: x and y.
+
+# check_enough_data only checks train data when skip = FALSE
+
+    Code
+      forecaster %>% predict(new_data = toy_test_data %>% filter(time_value >
+        "2020-01-08"))
+    Condition
+      Error in `check_enough_data_core()`:
+      ! The following columns don't have enough data to predict: x.
 
 # check_enough_data works with all_predictors() downstream of constructed terms
 
     Code
       epi_recipe(toy_epi_df) %>% step_epi_lag(x, lag = c(1, 2)) %>% check_enough_data(
-        all_predictors(), y, n = 2 * n - 4) %>% prep(toy_epi_df) %>% bake(new_data = NULL)
+        all_predictors(), y, n = 2 * n - 4) %>% prep(toy_epi_df)
     Condition
-      Error in `prep()`:
-      ! The following columns don't have enough data to predict: lag_1_x, lag_2_x, and y.
+      Error in `check_enough_data_core()`:
+      ! The following columns don't have enough data to train: no single column, but the combination of lag_1_x, lag_2_x, y.
 
diff --git a/tests/testthat/test-check_enough_data.R b/tests/testthat/test-check_enough_data.R
@@ -27,16 +27,14 @@ test_that("check_enough_data works on pooled data", {
     error = TRUE,
     epi_recipe(toy_epi_df) %>%
       check_enough_data(x, y, n = 2 * n + 1, drop_na = FALSE) %>%
-      prep(toy_epi_df) %>%
-      bake(new_data = NULL)
+      prep(toy_epi_df)
   )
   # Check drop_na works
   expect_snapshot(
     error = TRUE,
     epi_recipe(toy_epi_df) %>%
       check_enough_data(x, y, n = 2 * n - 1, drop_na = TRUE) %>%
-      prep(toy_epi_df) %>%
-      bake(new_data = NULL)
+      prep(toy_epi_df)
   )
 })
 
@@ -53,16 +51,14 @@ test_that("check_enough_data works on unpooled data", {
     error = TRUE,
     epi_recipe(toy_epi_df) %>%
       check_enough_data(x, y, n = n + 1, epi_keys = "geo_value", drop_na = FALSE) %>%
-      prep(toy_epi_df) %>%
-      bake(new_data = NULL)
+      prep(toy_epi_df)
   )
   # Check drop_na works
   expect_snapshot(
     error = TRUE,
     epi_recipe(toy_epi_df) %>%
       check_enough_data(x, y, n = 2 * n - 3, epi_keys = "geo_value", drop_na = TRUE) %>%
-      prep(toy_epi_df) %>%
-      bake(new_data = NULL)
+      prep(toy_epi_df)
   )
 })
 
@@ -85,7 +81,7 @@ test_that("check_enough_data outputs the correct recipe values", {
   expect_equal(p$geo_value, rep(c("ca", "hi"), each = n))
 })
 
-test_that("check_enough_train_data only checks train data", {
+test_that("check_enough_data only checks train data when skip = FALSE", {
   # Check that the train data has enough data, the test data does not, but
   # the check passes anyway (because it should be applied to training data)
   toy_test_data <- toy_epi_df %>%
@@ -94,16 +90,32 @@ test_that("check_enough_train_data only checks train data", {
     epiprocess::as_epi_df()
   expect_no_error(
     epi_recipe(toy_epi_df) %>%
-      check_enough_train_data(x, y, n = n - 2, epi_keys = "geo_value", skip = TRUE) %>%
+      check_enough_data(x, y, n = n - 2, epi_keys = "geo_value") %>%
       prep(toy_epi_df) %>%
       bake(new_data = toy_test_data)
   )
-  # Same thing, but skip = FALSE
+  # Making sure `skip = TRUE` is working correctly in `predict`
   expect_no_error(
     epi_recipe(toy_epi_df) %>%
-      check_enough_train_data(y, n = n - 2, epi_keys = "geo_value") %>%
-      prep(toy_epi_df) %>%
-      bake(new_data = toy_test_data)
+      add_role(y, new_role = "outcome") %>%
+      check_enough_data(x, n = n - 2, epi_keys = "geo_value") %>%
+      epi_workflow(linear_reg()) %>%
+      fit(toy_epi_df) %>%
+      predict(new_data = toy_test_data %>% filter(time_value > "2020-01-08"))
+  )
+  # making sure it works for skip = FALSE, where there's enough data to train
+  # but not enough to predict
+  expect_no_error(
+    forecaster <- epi_recipe(toy_epi_df) %>%
+      add_role(y, new_role = "outcome") %>%
+      check_enough_data(x, n = 1, epi_keys = "geo_value", skip = FALSE) %>%
+      epi_workflow(linear_reg()) %>%
+      fit(toy_epi_df)
+  )
+  expect_snapshot(
+    error = TRUE,
+    forecaster %>%
+      predict(new_data = toy_test_data %>% filter(time_value > "2020-01-08"))
   )
 })
 
@@ -122,7 +134,6 @@ test_that("check_enough_data works with all_predictors() downstream of construct
     epi_recipe(toy_epi_df) %>%
       step_epi_lag(x, lag = c(1, 2)) %>%
       check_enough_data(all_predictors(), y, n = 2 * n - 4) %>%
-      prep(toy_epi_df) %>%
-      bake(new_data = NULL)
+      prep(toy_epi_df)
   )
 })