From f421a2d29be5774970b438c5a6c8ad0de35bbca2 Mon Sep 17 00:00:00 2001
From: dsweber2 <david.weber2@pm.me>
Date: Fri, 21 Mar 2025 17:56:25 -0500
Subject: [PATCH 01/12] using check_enough_train_data in practice

---
 R/arx_forecaster.R                   |  4 +++-
 R/check_enough_train_data.R          | 24 +++++++++++++++++++++---
 R/epi_workflow.R                     |  2 +-
 tests/testthat/test-arx_forecaster.R | 19 +++++++++++++++++++
 4 files changed, 44 insertions(+), 5 deletions(-)

diff --git a/R/arx_forecaster.R b/R/arx_forecaster.R
index fe9128c00..fd8303658 100644
--- a/R/arx_forecaster.R
+++ b/R/arx_forecaster.R
@@ -171,7 +171,9 @@ arx_fcast_epi_workflow <- function(
     step_epi_ahead(!!outcome, ahead = args_list$ahead)
   r <- r %>%
     step_epi_naomit() %>%
-    step_training_window(n_recent = args_list$n_training)
+    step_training_window(n_recent = args_list$n_training) %>%
+    check_enough_train_data(all_predictors(), skip = FALSE)
+
   if (!is.null(args_list$check_enough_data_n)) {
     r <- r %>% check_enough_train_data(
       all_predictors(),
diff --git a/R/check_enough_train_data.R b/R/check_enough_train_data.R
index 1279a3712..d71daecc3 100644
--- a/R/check_enough_train_data.R
+++ b/R/check_enough_train_data.R
@@ -47,7 +47,7 @@ check_enough_train_data <-
            role = NA,
            trained = FALSE,
            columns = NULL,
-           skip = TRUE,
+           skip = FALSE,
            id = rand_id("enough_train_data")) {
     recipes::add_check(
       recipe,
@@ -90,7 +90,7 @@ prep.check_enough_train_data <- function(x, training, info = NULL, ...) {
   }
 
   if (x$drop_na) {
-    training <- tidyr::drop_na(training)
+    training <- tidyr::drop_na(training, any_of(unname(col_names)))
   }
   cols_not_enough_data <- training %>%
     group_by(across(all_of(.env$x$epi_keys))) %>%
@@ -101,7 +101,8 @@ prep.check_enough_train_data <- function(x, training, info = NULL, ...) {
 
   if (length(cols_not_enough_data) > 0) {
     cli_abort(
-      "The following columns don't have enough data to predict: {cols_not_enough_data}."
+      "The following columns don't have enough data to predict: {cols_not_enough_data}.",
+      class = "epipredict__not_enough_train_data"
     )
   }
 
@@ -120,6 +121,23 @@ prep.check_enough_train_data <- function(x, training, info = NULL, ...) {
 
 #' @export
 bake.check_enough_train_data <- function(object, new_data, ...) {
+  col_names <- object$columns
+  if (object$drop_na) {
+    newish_data <- tidyr::drop_na(new_data, any_of(unname(col_names)))
+  }
+  cols_not_enough_data <- newish_data %>%
+    group_by(across(all_of(.env$object$epi_keys))) %>%
+    summarise(across(all_of(.env$col_names), ~ dplyr::n() < .env$object$n), .groups = "drop") %>%
+    summarise(across(all_of(.env$col_names), any), .groups = "drop") %>%
+    unlist() %>%
+    names(.)[.]
+
+  if (length(cols_not_enough_data) > 0) {
+    cli_abort(
+      "The following columns don't have enough data to predict: {cols_not_enough_data}.",
+      class = "epipredict__not_enough_train_data"
+    )
+  }
   new_data
 }
 
diff --git a/R/epi_workflow.R b/R/epi_workflow.R
index 81b443e7b..826ae7f1b 100644
--- a/R/epi_workflow.R
+++ b/R/epi_workflow.R
@@ -270,6 +270,6 @@ forecast.epi_workflow <- function(object, ..., n_recent = NULL, forecast_date =
     hardhat::extract_preprocessor(object),
     object$original_data
   )
-
+  test_data
   predict(object, new_data = test_data)
 }
diff --git a/tests/testthat/test-arx_forecaster.R b/tests/testthat/test-arx_forecaster.R
index 0f2b9bd16..cdc21769d 100644
--- a/tests/testthat/test-arx_forecaster.R
+++ b/tests/testthat/test-arx_forecaster.R
@@ -24,3 +24,22 @@ test_that("arx_forecaster errors if forecast date, target date, and ahead are in
     class = "epipredict__arx_args__inconsistent_target_ahead_forecaste_date"
   )
 })
+
+test_that("warns if there's not enough data to predict", {
+  edf <- tibble(
+    geo_value = "ct",
+    time_value = seq(as.Date("2020-10-01"), as.Date("2023-05-31"), by = "day"),
+  ) %>%
+    mutate(value = seq_len(nrow(.)) + rnorm(nrow(.))) %>%
+    # Oct to May (flu season, ish) only:
+    filter(!between(as.POSIXlt(time_value)$mon + 1L, 6L, 9L)) %>%
+    # and actually, pretend we're around mid-October 2022:
+    filter(time_value <= as.Date("2022-10-12")) %>%
+    as_epi_df(as_of = as.Date("2022-10-12"))
+  edf %>% filter(time_value > "2022-08-01")
+
+  expect_error(
+    edf %>% arx_forecaster("value"),
+    class = "epipredict__not_enough_train_data"
+  )
+})

From b06e7e68488c2153e868aa74259228a0072abc15 Mon Sep 17 00:00:00 2001
From: dsweber2 <david.weber2@pm.me>
Date: Fri, 21 Mar 2025 17:59:58 -0500
Subject: [PATCH 02/12] news & description

---
 DESCRIPTION | 2 +-
 NEWS.md     | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index d6f37ef63..36a84f425 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: epipredict
 Title: Basic epidemiology forecasting methods
-Version: 0.1.13
+Version: 0.1.14
 Authors@R: c(
     person("Daniel J.", "McDonald", , "daniel@stat.ubc.ca", role = c("aut", "cre")),
     person("Ryan", "Tibshirani", , "ryantibs@cmu.edu", role = "aut"),
diff --git a/NEWS.md b/NEWS.md
index e117f3f52..79daf4fdf 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -33,6 +33,7 @@ Pre-1.0.0 numbering scheme: 0.x will indicate releases, while 0.0.x will indicat
 - Add `climatological_forecaster()` to automatically create climate baselines
 - Replace `dist_quantiles()` with `hardhat::quantile_pred()`
 - Allow `quantile()` to threshold to an interval if desired (#434)
+- `arx_forecaster()` detects if there's enough data to predict
 
 ## Bug fixes
 

From 2940f9576cbc8c4ba0784d03c05f2908b2b5dc54 Mon Sep 17 00:00:00 2001
From: dsweber2 <david.weber2@pm.me>
Date: Mon, 24 Mar 2025 10:39:08 -0500
Subject: [PATCH 03/12] actually passing all the tests

---
 R/arx_forecaster.R                               |  2 +-
 R/check_enough_train_data.R                      |  6 ++++--
 tests/testthat/_snaps/check_enough_train_data.md |  2 +-
 tests/testthat/test-check_enough_train_data.R    | 11 ++++++-----
 tests/testthat/test-layer_residual_quantiles.R   |  2 +-
 5 files changed, 13 insertions(+), 10 deletions(-)

diff --git a/R/arx_forecaster.R b/R/arx_forecaster.R
index fd8303658..97bd81f08 100644
--- a/R/arx_forecaster.R
+++ b/R/arx_forecaster.R
@@ -172,7 +172,7 @@ arx_fcast_epi_workflow <- function(
   r <- r %>%
     step_epi_naomit() %>%
     step_training_window(n_recent = args_list$n_training) %>%
-    check_enough_train_data(all_predictors(), skip = FALSE)
+    check_enough_train_data(all_predictors(), n = args_list$check_enough_data_n, skip = FALSE)
 
   if (!is.null(args_list$check_enough_data_n)) {
     r <- r %>% check_enough_train_data(
diff --git a/R/check_enough_train_data.R b/R/check_enough_train_data.R
index d71daecc3..3b2af6298 100644
--- a/R/check_enough_train_data.R
+++ b/R/check_enough_train_data.R
@@ -123,9 +123,11 @@ prep.check_enough_train_data <- function(x, training, info = NULL, ...) {
 bake.check_enough_train_data <- function(object, new_data, ...) {
   col_names <- object$columns
   if (object$drop_na) {
-    newish_data <- tidyr::drop_na(new_data, any_of(unname(col_names)))
+    non_na_data <- tidyr::drop_na(new_data, any_of(unname(col_names)))
+  } else {
+    non_na_data <- new_data
   }
-  cols_not_enough_data <- newish_data %>%
+  cols_not_enough_data <- non_na_data %>%
     group_by(across(all_of(.env$object$epi_keys))) %>%
     summarise(across(all_of(.env$col_names), ~ dplyr::n() < .env$object$n), .groups = "drop") %>%
     summarise(across(all_of(.env$col_names), any), .groups = "drop") %>%
diff --git a/tests/testthat/_snaps/check_enough_train_data.md b/tests/testthat/_snaps/check_enough_train_data.md
index 8f2389acb..8f852092a 100644
--- a/tests/testthat/_snaps/check_enough_train_data.md
+++ b/tests/testthat/_snaps/check_enough_train_data.md
@@ -38,7 +38,7 @@
 
     Code
       epi_recipe(toy_epi_df) %>% step_epi_lag(x, lag = c(1, 2)) %>%
-        check_enough_train_data(all_predictors(), y, n = 2 * n - 5) %>% prep(
+        check_enough_train_data(all_predictors(), y, n = 2 * n - 4) %>% prep(
         toy_epi_df) %>% bake(new_data = NULL)
     Condition
       Error in `prep()`:
diff --git a/tests/testthat/test-check_enough_train_data.R b/tests/testthat/test-check_enough_train_data.R
index 9b2ef5f34..11e86a915 100644
--- a/tests/testthat/test-check_enough_train_data.R
+++ b/tests/testthat/test-check_enough_train_data.R
@@ -94,25 +94,26 @@ test_that("check_enough_train_data only checks train data", {
     epiprocess::as_epi_df()
   expect_no_error(
     epi_recipe(toy_epi_df) %>%
-      check_enough_train_data(x, y, n = n - 2, epi_keys = "geo_value") %>%
+      check_enough_train_data(x, y, n = n - 2, epi_keys = "geo_value", skip = TRUE) %>%
       prep(toy_epi_df) %>%
       bake(new_data = toy_test_data)
   )
   # Same thing, but skip = FALSE
   expect_no_error(
     epi_recipe(toy_epi_df) %>%
-      check_enough_train_data(y, n = n - 2, epi_keys = "geo_value", skip = FALSE) %>%
+      check_enough_train_data(y, n = n - 2, epi_keys = "geo_value") %>%
       prep(toy_epi_df) %>%
       bake(new_data = toy_test_data)
   )
 })
 
 test_that("check_enough_train_data works with all_predictors() downstream of constructed terms", {
-  # With a lag of 2, we will get 2 * n - 6 non-NA rows
+  # With a lag of 2, we will get 2 * n - 5 non-NA rows (NA's in x but not in the
+  # lags don't count)
   expect_no_error(
     epi_recipe(toy_epi_df) %>%
       step_epi_lag(x, lag = c(1, 2)) %>%
-      check_enough_train_data(all_predictors(), y, n = 2 * n - 6) %>%
+      check_enough_train_data(all_predictors(), y, n = 2 * n - 5) %>%
       prep(toy_epi_df) %>%
       bake(new_data = NULL)
   )
@@ -120,7 +121,7 @@ test_that("check_enough_train_data works with all_predictors() downstream of con
     error = TRUE,
     epi_recipe(toy_epi_df) %>%
       step_epi_lag(x, lag = c(1, 2)) %>%
-      check_enough_train_data(all_predictors(), y, n = 2 * n - 5) %>%
+      check_enough_train_data(all_predictors(), y, n = 2 * n - 4) %>%
       prep(toy_epi_df) %>%
       bake(new_data = NULL)
   )
diff --git a/tests/testthat/test-layer_residual_quantiles.R b/tests/testthat/test-layer_residual_quantiles.R
index 3d5883c72..2421b8a1c 100644
--- a/tests/testthat/test-layer_residual_quantiles.R
+++ b/tests/testthat/test-layer_residual_quantiles.R
@@ -103,7 +103,7 @@ test_that("Canned forecasters work with / without", {
   )
 
   expect_silent(
-    arx_forecaster(jhu, "death_rate", c("case_rate", "death_rate"))
+    arx_forecaster(jhu, "death_rate", c("case_rate", "death_rate"), args_list = arx_args_list(check_enough_data_n = 1))
   )
   expect_silent(
     flatline_forecaster(

From 5ef9823708c2a4abf84099da4d7e4fe720f71a4f Mon Sep 17 00:00:00 2001
From: dsweber2 <david.weber2@pm.me>
Date: Mon, 24 Mar 2025 19:25:21 -0500
Subject: [PATCH 04/12] rename, old default skip, news, all_outcomes() for
 test, docs

---
 NAMESPACE                                     | 10 ++--
 NEWS.md                                       |  3 ++
 R/arx_classifier.R                            |  2 +-
 R/arx_forecaster.R                            |  6 +--
 ...nough_train_data.R => check_enough_data.R} | 53 ++++++++++---------
 _pkgdown.yml                                  |  2 +-
 ...ugh_train_data.Rd => check_enough_data.Rd} | 37 +++++++------
 man/step_adjust_latency.Rd                    | 10 ++--
 ...ugh_train_data.md => check_enough_data.md} | 25 +++++----
 tests/testthat/test-arx_forecaster.R          |  2 +-
 ..._train_data.R => test-check_enough_data.R} | 26 ++++-----
 11 files changed, 94 insertions(+), 82 deletions(-)
 rename R/{check_enough_train_data.R => check_enough_data.R} (71%)
 rename man/{check_enough_train_data.Rd => check_enough_data.Rd} (54%)
 rename tests/testthat/_snaps/{check_enough_train_data.md => check_enough_data.md} (51%)
 rename tests/testthat/{test-check_enough_train_data.R => test-check_enough_data.R} (74%)

diff --git a/NAMESPACE b/NAMESPACE
index 939a1f01a..f39d9bfbc 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -15,7 +15,7 @@ S3method(apply_frosting,epi_workflow)
 S3method(augment,epi_workflow)
 S3method(autoplot,canned_epipred)
 S3method(autoplot,epi_workflow)
-S3method(bake,check_enough_train_data)
+S3method(bake,check_enough_data)
 S3method(bake,epi_recipe)
 S3method(bake,step_adjust_latency)
 S3method(bake,step_climate)
@@ -49,7 +49,7 @@ S3method(key_colnames,recipe)
 S3method(mean,quantile_pred)
 S3method(predict,epi_workflow)
 S3method(predict,flatline)
-S3method(prep,check_enough_train_data)
+S3method(prep,check_enough_data)
 S3method(prep,epi_recipe)
 S3method(prep,step_adjust_latency)
 S3method(prep,step_climate)
@@ -65,7 +65,7 @@ S3method(print,arx_class)
 S3method(print,arx_fcast)
 S3method(print,canned_epipred)
 S3method(print,cdc_baseline_fcast)
-S3method(print,check_enough_train_data)
+S3method(print,check_enough_data)
 S3method(print,climate_fcast)
 S3method(print,epi_recipe)
 S3method(print,epi_workflow)
@@ -109,7 +109,7 @@ S3method(slather,layer_threshold)
 S3method(slather,layer_unnest)
 S3method(snap,default)
 S3method(snap,quantile_pred)
-S3method(tidy,check_enough_train_data)
+S3method(tidy,check_enough_data)
 S3method(tidy,frosting)
 S3method(tidy,layer)
 S3method(update,layer)
@@ -142,7 +142,7 @@ export(autoplot)
 export(bake)
 export(cdc_baseline_args_list)
 export(cdc_baseline_forecaster)
-export(check_enough_train_data)
+export(check_enough_data)
 export(clean_f_name)
 export(climate_args_list)
 export(climatological_forecaster)
diff --git a/NEWS.md b/NEWS.md
index 79daf4fdf..24c8e3b73 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -20,6 +20,9 @@ Pre-1.0.0 numbering scheme: 0.x will indicate releases, while 0.0.x will indicat
 - Removes dependence on the `distributional` package, replacing the quantiles 
   with `hardhat::quantile_pred()`. Some associated functions are deprecated with
   `lifecycle` messages.
+- Rename `check_enough_train_data()` to `check_enough_data()`, and generalize it
+  enough to use as a check on either training or testing.
+- Add check for enough data to predict in `arx_forecaster()`
 
 ## Improvements
 
diff --git a/R/arx_classifier.R b/R/arx_classifier.R
index d1aa292dd..bc8783610 100644
--- a/R/arx_classifier.R
+++ b/R/arx_classifier.R
@@ -222,7 +222,7 @@ arx_class_epi_workflow <- function(
     step_training_window(n_recent = args_list$n_training)
 
   if (!is.null(args_list$check_enough_data_n)) {
-    r <- check_enough_train_data(
+    r <- check_enough_data(
       r,
       recipes::all_predictors(),
       recipes::all_outcomes(),
diff --git a/R/arx_forecaster.R b/R/arx_forecaster.R
index 97bd81f08..3a87719eb 100644
--- a/R/arx_forecaster.R
+++ b/R/arx_forecaster.R
@@ -172,12 +172,12 @@ arx_fcast_epi_workflow <- function(
   r <- r %>%
     step_epi_naomit() %>%
     step_training_window(n_recent = args_list$n_training) %>%
-    check_enough_train_data(all_predictors(), n = args_list$check_enough_data_n, skip = FALSE)
+    check_enough_data(all_predictors(), n = 1, skip = FALSE)
 
   if (!is.null(args_list$check_enough_data_n)) {
-    r <- r %>% check_enough_train_data(
+    r <- r %>% check_enough_data(
       all_predictors(),
-      !!outcome,
+      all_outcomes(),
       n = args_list$check_enough_data_n,
       epi_keys = args_list$check_enough_data_epi_keys,
       drop_na = FALSE
diff --git a/R/check_enough_train_data.R b/R/check_enough_data.R
similarity index 71%
rename from R/check_enough_train_data.R
rename to R/check_enough_data.R
index 3b2af6298..e87984e49 100644
--- a/R/check_enough_train_data.R
+++ b/R/check_enough_data.R
@@ -1,13 +1,13 @@
 #' Check the dataset contains enough data points.
 #'
-#' `check_enough_train_data` creates a *specification* of a recipe
+#' `check_enough_data` creates a *specification* of a recipe
 #'  operation that will check if variables contain enough data.
 #'
 #' @param recipe A recipe object. The check will be added to the
 #'  sequence of operations for this recipe.
 #' @param ... One or more selector functions to choose variables for this check.
 #'  See [selections()] for more details. You will usually want to use
-#'  [recipes::all_predictors()] here.
+#'  [recipes::all_predictors()] and/or [recipes::all_outcomes()] here.
 #' @param n The minimum number of data points required for training. If this is
 #'   NULL, the total number of predictors will be used.
 #' @param epi_keys A character vector of column names on which to group the data
@@ -21,24 +21,29 @@
 #' @param columns An internal argument that tracks which columns are evaluated
 #'   for this check. Should not be used by the user.
 #' @param id A character string that is unique to this check to identify it.
-#' @param skip A logical. Should the check be skipped when the
-#'  recipe is baked by [bake()]? While all operations are baked
-#'  when [prep()] is run, some operations may not be able to be
-#'  conducted on new data (e.g. processing the outcome variable(s)).
-#'  Care should be taken when using `skip = TRUE` as it may affect
-#'  the computations for subsequent operations.
+#' @param skip A logical. If `TRUE`, only training data is checked, while if
+#'   `FALSE`, both training and predicting data is checked. Technically, this
+#'   answers the question "should the check be skipped when the recipe is baked
+#'   by [bake()]?" While all operations are baked when [prep()] is run, some
+#'   operations may not be able to be conducted on new data (e.g. processing the
+#'   outcome variable(s)).  Care should be taken when using `skip = TRUE` as it
+#'   may affect the computations for subsequent operations.
 #' @family checks
 #' @export
-#' @details This check will break the `bake` function if any of the checked
-#'  columns have not enough non-NA values. If the check passes, nothing is
-#'  changed to the data.
+#' @details This check will break the `prep` and/or bake function if any of the
+#'   checked columns have not enough non-NA values. If the check passes, nothing
+#'   is changed in the data. It is best used after every other step.
+#'
+#'   For checking training data, it is best to set `...` to be
+#'   `all_predictors(), all_outcomes()`, while for checking prediction data, it
+#'   is best to set `...` to be `all_predictors()` only, with `n = 1`.
 #'
 #'  # tidy() results
 #'
 #'  When you [`tidy()`][tidy.recipe()] this check, a tibble with column
 #'  `terms` (the selectors or variables selected) is returned.
 #'
-check_enough_train_data <-
+check_enough_data <-
   function(recipe,
            ...,
            n = NULL,
@@ -47,11 +52,11 @@ check_enough_train_data <-
            role = NA,
            trained = FALSE,
            columns = NULL,
-           skip = FALSE,
-           id = rand_id("enough_train_data")) {
+           skip = TRUE,
+           id = rand_id("enough_data")) {
     recipes::add_check(
       recipe,
-      check_enough_train_data_new(
+      check_enough_data_new(
         n = n,
         epi_keys = epi_keys,
         drop_na = drop_na,
@@ -65,10 +70,10 @@ check_enough_train_data <-
     )
   }
 
-check_enough_train_data_new <-
+check_enough_data_new <-
   function(n, epi_keys, drop_na, terms, role, trained, columns, skip, id) {
     recipes::check(
-      subclass = "enough_train_data",
+      subclass = "enough_data",
       prefix = "check_",
       n = n,
       epi_keys = epi_keys,
@@ -83,7 +88,7 @@ check_enough_train_data_new <-
   }
 
 #' @export
-prep.check_enough_train_data <- function(x, training, info = NULL, ...) {
+prep.check_enough_data <- function(x, training, info = NULL, ...) {
   col_names <- recipes::recipes_eval_select(x$terms, training, info)
   if (is.null(x$n)) {
     x$n <- length(col_names)
@@ -102,11 +107,11 @@ prep.check_enough_train_data <- function(x, training, info = NULL, ...) {
   if (length(cols_not_enough_data) > 0) {
     cli_abort(
       "The following columns don't have enough data to predict: {cols_not_enough_data}.",
-      class = "epipredict__not_enough_train_data"
+      class = "epipredict__not_enough_data"
     )
   }
 
-  check_enough_train_data_new(
+  check_enough_data_new(
     n = x$n,
     epi_keys = x$epi_keys,
     drop_na = x$drop_na,
@@ -120,7 +125,7 @@ prep.check_enough_train_data <- function(x, training, info = NULL, ...) {
 }
 
 #' @export
-bake.check_enough_train_data <- function(object, new_data, ...) {
+bake.check_enough_data <- function(object, new_data, ...) {
   col_names <- object$columns
   if (object$drop_na) {
     non_na_data <- tidyr::drop_na(new_data, any_of(unname(col_names)))
@@ -137,21 +142,21 @@ bake.check_enough_train_data <- function(object, new_data, ...) {
   if (length(cols_not_enough_data) > 0) {
     cli_abort(
       "The following columns don't have enough data to predict: {cols_not_enough_data}.",
-      class = "epipredict__not_enough_train_data"
+      class = "epipredict__not_enough_data"
     )
   }
   new_data
 }
 
 #' @export
-print.check_enough_train_data <- function(x, width = max(20, options()$width - 30), ...) {
+print.check_enough_data <- function(x, width = max(20, options()$width - 30), ...) {
   title <- paste0("Check enough data (n = ", x$n, ") for ")
   recipes::print_step(x$columns, x$terms, x$trained, title, width)
   invisible(x)
 }
 
 #' @export
-tidy.check_enough_train_data <- function(x, ...) {
+tidy.check_enough_data <- function(x, ...) {
   if (recipes::is_trained(x)) {
     res <- tibble(terms = unname(x$columns))
   } else {
diff --git a/_pkgdown.yml b/_pkgdown.yml
index fe34c3b82..814bf6aa4 100644
--- a/_pkgdown.yml
+++ b/_pkgdown.yml
@@ -77,7 +77,7 @@ reference:
 
   - title: Epi recipe verification checks
     contents:
-      - check_enough_train_data
+      - check_enough_data
 
   - title: Forecast postprocessing
     desc: Create a series of postprocessing operations
diff --git a/man/check_enough_train_data.Rd b/man/check_enough_data.Rd
similarity index 54%
rename from man/check_enough_train_data.Rd
rename to man/check_enough_data.Rd
index 57a4a9d78..57c321fd1 100644
--- a/man/check_enough_train_data.Rd
+++ b/man/check_enough_data.Rd
@@ -1,10 +1,10 @@
 % Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/check_enough_train_data.R
-\name{check_enough_train_data}
-\alias{check_enough_train_data}
+% Please edit documentation in R/check_enough_data.R
+\name{check_enough_data}
+\alias{check_enough_data}
 \title{Check the dataset contains enough data points.}
 \usage{
-check_enough_train_data(
+check_enough_data(
   recipe,
   ...,
   n = NULL,
@@ -14,7 +14,7 @@ check_enough_train_data(
   trained = FALSE,
   columns = NULL,
   skip = TRUE,
-  id = rand_id("enough_train_data")
+  id = rand_id("enough_data")
 )
 }
 \arguments{
@@ -23,7 +23,7 @@ sequence of operations for this recipe.}
 
 \item{...}{One or more selector functions to choose variables for this check.
 See \code{\link[=selections]{selections()}} for more details. You will usually want to use
-\code{\link[recipes:has_role]{recipes::all_predictors()}} here.}
+\code{\link[recipes:has_role]{recipes::all_predictors()}} and/or \code{\link[recipes:has_role]{recipes::all_outcomes()}} here.}
 
 \item{n}{The minimum number of data points required for training. If this is
 NULL, the total number of predictors will be used.}
@@ -43,23 +43,28 @@ have been resolved by \code{\link[=prep]{prep()}}.}
 \item{columns}{An internal argument that tracks which columns are evaluated
 for this check. Should not be used by the user.}
 
-\item{skip}{A logical. Should the check be skipped when the
-recipe is baked by \code{\link[=bake]{bake()}}? While all operations are baked
-when \code{\link[=prep]{prep()}} is run, some operations may not be able to be
-conducted on new data (e.g. processing the outcome variable(s)).
-Care should be taken when using \code{skip = TRUE} as it may affect
-the computations for subsequent operations.}
+\item{skip}{A logical. If \code{TRUE}, only training data is checked, while if
+\code{FALSE}, both training and predicting data is checked. Technically, this
+answers the question "should the check be skipped when the recipe is baked
+by \code{\link[=bake]{bake()}}?" While all operations are baked when \code{\link[=prep]{prep()}} is run, some
+operations may not be able to be conducted on new data (e.g. processing the
+outcome variable(s)).  Care should be taken when using \code{skip = TRUE} as it
+may affect the computations for subsequent operations.}
 
 \item{id}{A character string that is unique to this check to identify it.}
 }
 \description{
-\code{check_enough_train_data} creates a \emph{specification} of a recipe
+\code{check_enough_data} creates a \emph{specification} of a recipe
 operation that will check if variables contain enough data.
 }
 \details{
-This check will break the \code{bake} function if any of the checked
-columns have not enough non-NA values. If the check passes, nothing is
-changed to the data.
+This check will break the \code{prep} and/or bake function if any of the
+checked columns have not enough non-NA values. If the check passes, nothing
+is changed in the data. It is best used after every other step.
+
+For checking training data, it is best to set \code{...} to be
+\verb{all_predictors(), all_outcomes()}, while for checking prediction data, it
+is best to set \code{...} to be \code{all_predictors()} only, with \code{n = 1}.
 }
 \section{tidy() results}{
 When you \code{\link[=tidy.recipe]{tidy()}} this check, a tibble with column
diff --git a/man/step_adjust_latency.Rd b/man/step_adjust_latency.Rd
index 75d674472..9e1bafbd5 100644
--- a/man/step_adjust_latency.Rd
+++ b/man/step_adjust_latency.Rd
@@ -143,7 +143,7 @@ toy_recipe \%>\%
 #> 
 #> # A tibble: 8 x 4
 #>   geo_value time_value     a     b
-#> * <chr>     <date>     <dbl> <dbl>
+#>   <chr>     <date>     <dbl> <dbl>
 #> 1 ca        2015-01-11   100     5
 #> 2 ca        2015-01-12   103    10
 #> 3 ca        2015-01-13   103    10
@@ -179,7 +179,7 @@ toy_recipe \%>\%
 #> 
 #> # A tibble: 21 x 7
 #>    geo_value time_value     a     b lag_3_a lag_4_b ahead_1_a
-#>  * <chr>     <date>     <dbl> <dbl>   <dbl>   <dbl>     <dbl>
+#>    <chr>     <date>     <dbl> <dbl>   <dbl>   <dbl>     <dbl>
 #>  1 ca        2015-01-10    NA    NA      NA      NA       100
 #>  2 ca        2015-01-11   100     5      NA      NA       103
 #>  3 ca        2015-01-12   103    10      NA      NA        NA
@@ -227,7 +227,7 @@ toy_recipe \%>\%
 #> 
 #> # A tibble: 10 x 6
 #>    geo_value time_value     a     b lag_0_a ahead_3_a
-#>  * <chr>     <date>     <dbl> <dbl>   <dbl>     <dbl>
+#>    <chr>     <date>     <dbl> <dbl>   <dbl>     <dbl>
 #>  1 ca        2015-01-08    NA    NA      NA       100
 #>  2 ca        2015-01-09    NA    NA      NA       103
 #>  3 ca        2015-01-11   100     5     100        NA
@@ -267,8 +267,8 @@ while this will not:
 \if{html}{\out{<div class="sourceCode r">}}\preformatted{toy_recipe <- epi_recipe(toy_df) \%>\%
    step_epi_lag(a, lag=0) \%>\%
    step_adjust_latency(a, method = "extend_lags")
-#> Warning: If `method` is "extend_lags" or "locf", then the previous `step_epi_lag`s won't
-#> work with modified data.
+#> Warning: If `method` is "extend_lags" or "locf", then the previous `step_epi_lag`s won't work with
+#> modified data.
 }\if{html}{\out{</div>}}
 
 If you create columns that you then apply lags to (such as
diff --git a/tests/testthat/_snaps/check_enough_train_data.md b/tests/testthat/_snaps/check_enough_data.md
similarity index 51%
rename from tests/testthat/_snaps/check_enough_train_data.md
rename to tests/testthat/_snaps/check_enough_data.md
index 8f852092a..37c13b3c3 100644
--- a/tests/testthat/_snaps/check_enough_train_data.md
+++ b/tests/testthat/_snaps/check_enough_data.md
@@ -1,8 +1,8 @@
-# check_enough_train_data works on pooled data
+# check_enough_data works on pooled data
 
     Code
-      epi_recipe(toy_epi_df) %>% check_enough_train_data(x, y, n = 2 * n + 1,
-      drop_na = FALSE) %>% prep(toy_epi_df) %>% bake(new_data = NULL)
+      epi_recipe(toy_epi_df) %>% check_enough_data(x, y, n = 2 * n + 1, drop_na = FALSE) %>%
+        prep(toy_epi_df) %>% bake(new_data = NULL)
     Condition
       Error in `prep()`:
       ! The following columns don't have enough data to predict: x and y.
@@ -10,16 +10,16 @@
 ---
 
     Code
-      epi_recipe(toy_epi_df) %>% check_enough_train_data(x, y, n = 2 * n - 1,
-      drop_na = TRUE) %>% prep(toy_epi_df) %>% bake(new_data = NULL)
+      epi_recipe(toy_epi_df) %>% check_enough_data(x, y, n = 2 * n - 1, drop_na = TRUE) %>%
+        prep(toy_epi_df) %>% bake(new_data = NULL)
     Condition
       Error in `prep()`:
       ! The following columns don't have enough data to predict: x and y.
 
-# check_enough_train_data works on unpooled data
+# check_enough_data works on unpooled data
 
     Code
-      epi_recipe(toy_epi_df) %>% check_enough_train_data(x, y, n = n + 1, epi_keys = "geo_value",
+      epi_recipe(toy_epi_df) %>% check_enough_data(x, y, n = n + 1, epi_keys = "geo_value",
       drop_na = FALSE) %>% prep(toy_epi_df) %>% bake(new_data = NULL)
     Condition
       Error in `prep()`:
@@ -28,18 +28,17 @@
 ---
 
     Code
-      epi_recipe(toy_epi_df) %>% check_enough_train_data(x, y, n = 2 * n - 3,
-      epi_keys = "geo_value", drop_na = TRUE) %>% prep(toy_epi_df) %>% bake(new_data = NULL)
+      epi_recipe(toy_epi_df) %>% check_enough_data(x, y, n = 2 * n - 3, epi_keys = "geo_value",
+      drop_na = TRUE) %>% prep(toy_epi_df) %>% bake(new_data = NULL)
     Condition
       Error in `prep()`:
       ! The following columns don't have enough data to predict: x and y.
 
-# check_enough_train_data works with all_predictors() downstream of constructed terms
+# check_enough_data works with all_predictors() downstream of constructed terms
 
     Code
-      epi_recipe(toy_epi_df) %>% step_epi_lag(x, lag = c(1, 2)) %>%
-        check_enough_train_data(all_predictors(), y, n = 2 * n - 4) %>% prep(
-        toy_epi_df) %>% bake(new_data = NULL)
+      epi_recipe(toy_epi_df) %>% step_epi_lag(x, lag = c(1, 2)) %>% check_enough_data(
+        all_predictors(), y, n = 2 * n - 4) %>% prep(toy_epi_df) %>% bake(new_data = NULL)
     Condition
       Error in `prep()`:
       ! The following columns don't have enough data to predict: lag_1_x, lag_2_x, and y.
diff --git a/tests/testthat/test-arx_forecaster.R b/tests/testthat/test-arx_forecaster.R
index cdc21769d..a72222ddb 100644
--- a/tests/testthat/test-arx_forecaster.R
+++ b/tests/testthat/test-arx_forecaster.R
@@ -40,6 +40,6 @@ test_that("warns if there's not enough data to predict", {
 
   expect_error(
     edf %>% arx_forecaster("value"),
-    class = "epipredict__not_enough_train_data"
+    class = "epipredict__not_enough_data"
   )
 })
diff --git a/tests/testthat/test-check_enough_train_data.R b/tests/testthat/test-check_enough_data.R
similarity index 74%
rename from tests/testthat/test-check_enough_train_data.R
rename to tests/testthat/test-check_enough_data.R
index 11e86a915..06526ace7 100644
--- a/tests/testthat/test-check_enough_train_data.R
+++ b/tests/testthat/test-check_enough_data.R
@@ -14,11 +14,11 @@ toy_epi_df <- tibble::tibble(
   y = 1:(2 * n)
 ) %>% epiprocess::as_epi_df()
 
-test_that("check_enough_train_data works on pooled data", {
+test_that("check_enough_data works on pooled data", {
   # Check both columns have enough data
   expect_no_error(
     epi_recipe(toy_epi_df) %>%
-      check_enough_train_data(x, y, n = 2 * n, drop_na = FALSE) %>%
+      check_enough_data(x, y, n = 2 * n, drop_na = FALSE) %>%
       prep(toy_epi_df) %>%
       bake(new_data = NULL)
   )
@@ -26,7 +26,7 @@ test_that("check_enough_train_data works on pooled data", {
   expect_snapshot(
     error = TRUE,
     epi_recipe(toy_epi_df) %>%
-      check_enough_train_data(x, y, n = 2 * n + 1, drop_na = FALSE) %>%
+      check_enough_data(x, y, n = 2 * n + 1, drop_na = FALSE) %>%
       prep(toy_epi_df) %>%
       bake(new_data = NULL)
   )
@@ -34,17 +34,17 @@ test_that("check_enough_train_data works on pooled data", {
   expect_snapshot(
     error = TRUE,
     epi_recipe(toy_epi_df) %>%
-      check_enough_train_data(x, y, n = 2 * n - 1, drop_na = TRUE) %>%
+      check_enough_data(x, y, n = 2 * n - 1, drop_na = TRUE) %>%
       prep(toy_epi_df) %>%
       bake(new_data = NULL)
   )
 })
 
-test_that("check_enough_train_data works on unpooled data", {
+test_that("check_enough_data works on unpooled data", {
   # Check both columns have enough data
   expect_no_error(
     epi_recipe(toy_epi_df) %>%
-      check_enough_train_data(x, y, n = n, epi_keys = "geo_value", drop_na = FALSE) %>%
+      check_enough_data(x, y, n = n, epi_keys = "geo_value", drop_na = FALSE) %>%
       prep(toy_epi_df) %>%
       bake(new_data = NULL)
   )
@@ -52,7 +52,7 @@ test_that("check_enough_train_data works on unpooled data", {
   expect_snapshot(
     error = TRUE,
     epi_recipe(toy_epi_df) %>%
-      check_enough_train_data(x, y, n = n + 1, epi_keys = "geo_value", drop_na = FALSE) %>%
+      check_enough_data(x, y, n = n + 1, epi_keys = "geo_value", drop_na = FALSE) %>%
       prep(toy_epi_df) %>%
       bake(new_data = NULL)
   )
@@ -60,16 +60,16 @@ test_that("check_enough_train_data works on unpooled data", {
   expect_snapshot(
     error = TRUE,
     epi_recipe(toy_epi_df) %>%
-      check_enough_train_data(x, y, n = 2 * n - 3, epi_keys = "geo_value", drop_na = TRUE) %>%
+      check_enough_data(x, y, n = 2 * n - 3, epi_keys = "geo_value", drop_na = TRUE) %>%
       prep(toy_epi_df) %>%
       bake(new_data = NULL)
   )
 })
 
-test_that("check_enough_train_data outputs the correct recipe values", {
+test_that("check_enough_data outputs the correct recipe values", {
   expect_no_error(
     p <- epi_recipe(toy_epi_df) %>%
-      check_enough_train_data(x, y, n = 2 * n - 2) %>%
+      check_enough_data(x, y, n = 2 * n - 2) %>%
       prep(toy_epi_df) %>%
       bake(new_data = NULL)
   )
@@ -107,13 +107,13 @@ test_that("check_enough_train_data only checks train data", {
   )
 })
 
-test_that("check_enough_train_data works with all_predictors() downstream of constructed terms", {
+test_that("check_enough_data works with all_predictors() downstream of constructed terms", {
   # With a lag of 2, we will get 2 * n - 5 non-NA rows (NA's in x but not in the
   # lags don't count)
   expect_no_error(
     epi_recipe(toy_epi_df) %>%
       step_epi_lag(x, lag = c(1, 2)) %>%
-      check_enough_train_data(all_predictors(), y, n = 2 * n - 5) %>%
+      check_enough_data(all_predictors(), y, n = 2 * n - 5) %>%
       prep(toy_epi_df) %>%
       bake(new_data = NULL)
   )
@@ -121,7 +121,7 @@ test_that("check_enough_train_data works with all_predictors() downstream of con
     error = TRUE,
     epi_recipe(toy_epi_df) %>%
       step_epi_lag(x, lag = c(1, 2)) %>%
-      check_enough_train_data(all_predictors(), y, n = 2 * n - 4) %>%
+      check_enough_data(all_predictors(), y, n = 2 * n - 4) %>%
       prep(toy_epi_df) %>%
       bake(new_data = NULL)
   )

From c5b25681b3221253aa11c18c75b4aa7dc164d25a Mon Sep 17 00:00:00 2001
From: dsweber2 <david.weber2@pm.me>
Date: Tue, 25 Mar 2025 11:45:16 -0500
Subject: [PATCH 05/12] fix drop na subtleties, unify prep/bake usage

A couple of simultaneous problems that were making this tricky:
1. drop_na can completely remove states
2. checking each column individually misses cases where combinations of
   the states cause the signal to be left out.
3. checking all columns simultaneously doesn't let the user know which
   columns to check.
---
 R/check_enough_data.R                      | 91 ++++++++++++++--------
 tests/testthat/_snaps/check_enough_data.md | 39 ++++++----
 tests/testthat/test-check_enough_data.R    | 43 ++++++----
 3 files changed, 109 insertions(+), 64 deletions(-)

diff --git a/R/check_enough_data.R b/R/check_enough_data.R
index e87984e49..a43a93c7a 100644
--- a/R/check_enough_data.R
+++ b/R/check_enough_data.R
@@ -94,22 +94,8 @@ prep.check_enough_data <- function(x, training, info = NULL, ...) {
     x$n <- length(col_names)
   }
 
-  if (x$drop_na) {
-    training <- tidyr::drop_na(training, any_of(unname(col_names)))
-  }
-  cols_not_enough_data <- training %>%
-    group_by(across(all_of(.env$x$epi_keys))) %>%
-    summarise(across(all_of(.env$col_names), ~ dplyr::n() < .env$x$n), .groups = "drop") %>%
-    summarise(across(all_of(.env$col_names), any), .groups = "drop") %>%
-    unlist() %>%
-    names(.)[.]
+  check_enough_data_core(training, x, col_names, "train")
 
-  if (length(cols_not_enough_data) > 0) {
-    cli_abort(
-      "The following columns don't have enough data to predict: {cols_not_enough_data}.",
-      class = "epipredict__not_enough_data"
-    )
-  }
 
   check_enough_data_new(
     n = x$n,
@@ -127,24 +113,7 @@ prep.check_enough_data <- function(x, training, info = NULL, ...) {
 #' @export
 bake.check_enough_data <- function(object, new_data, ...) {
   col_names <- object$columns
-  if (object$drop_na) {
-    non_na_data <- tidyr::drop_na(new_data, any_of(unname(col_names)))
-  } else {
-    non_na_data <- new_data
-  }
-  cols_not_enough_data <- non_na_data %>%
-    group_by(across(all_of(.env$object$epi_keys))) %>%
-    summarise(across(all_of(.env$col_names), ~ dplyr::n() < .env$object$n), .groups = "drop") %>%
-    summarise(across(all_of(.env$col_names), any), .groups = "drop") %>%
-    unlist() %>%
-    names(.)[.]
-
-  if (length(cols_not_enough_data) > 0) {
-    cli_abort(
-      "The following columns don't have enough data to predict: {cols_not_enough_data}.",
-      class = "epipredict__not_enough_data"
-    )
-  }
+  check_enough_data_core(new_data, object, col_names, "predict")
   new_data
 }
 
@@ -168,3 +137,59 @@ tidy.check_enough_data <- function(x, ...) {
   res$drop_na <- x$drop_na
   res
 }
+
+check_enough_data_core <- function(epi_df, step_obj, col_names, train_or_predict) {
+  epi_df <- epi_df %>%
+    group_by(across(all_of(.env$step_obj$epi_keys)))
+  if (step_obj$drop_na) {
+    any_missing_data <- epi_df %>%
+      mutate(any_are_na = rowSums(across(any_of(.env$col_names), ~ is.na(.x))) > 0) %>%
+      # count the number of rows where they're all not na
+      summarise(sum(any_are_na == 0) < .env$step_obj$n, .groups = "drop")
+    any_missing_data <- any_missing_data %>%
+      summarize(across(all_of(setdiff(names(any_missing_data), step_obj$epi_keys)), any)) %>%
+      any()
+
+    # figuring out which individual columns (if any) are to blame for this darth
+    # of data
+    cols_not_enough_data <- epi_df %>%
+      summarise(
+        across(
+          all_of(.env$col_names),
+          ~ sum(!is.na(.x)) < .env$step_obj$n
+        ),
+        .groups = "drop"
+      ) %>%
+      summarise(across(all_of(.env$col_names), any), .groups = "drop") %>%
+      unlist() %>%
+      names(.)[.]
+
+    if (length(cols_not_enough_data) == 0) {
+      cols_not_enough_data <-
+        glue::glue("no single column, but the combination of {paste0(col_names, collapse = ', ')}")
+    }
+  } else {
+    # if we're not dropping na values, just count
+    cols_not_enough_data <- epi_df %>%
+      summarise(
+        across(
+          all_of(.env$col_names),
+          ~ dplyr::n() < .env$step_obj$n
+        )
+      )
+    any_missing_data <- cols_not_enough_data %>%
+      summarize(across(all_of(.env$col_names), all)) %>%
+      all()
+    cols_not_enough_data <- cols_not_enough_data %>%
+      summarise(across(all_of(.env$col_names), any), .groups = "drop") %>%
+      unlist() %>%
+      names(.)[.]
+  }
+
+  if (any_missing_data) {
+    cli_abort(
+      "The following columns don't have enough data to {train_or_predict}: {cols_not_enough_data}.",
+      class = "epipredict__not_enough_data"
+    )
+  }
+}
diff --git a/tests/testthat/_snaps/check_enough_data.md b/tests/testthat/_snaps/check_enough_data.md
index 37c13b3c3..72ce8af99 100644
--- a/tests/testthat/_snaps/check_enough_data.md
+++ b/tests/testthat/_snaps/check_enough_data.md
@@ -2,44 +2,53 @@
 
     Code
       epi_recipe(toy_epi_df) %>% check_enough_data(x, y, n = 2 * n + 1, drop_na = FALSE) %>%
-        prep(toy_epi_df) %>% bake(new_data = NULL)
+        prep(toy_epi_df)
     Condition
-      Error in `prep()`:
-      ! The following columns don't have enough data to predict: x and y.
+      Error in `check_enough_data_core()`:
+      ! The following columns don't have enough data to train: x and y.
 
 ---
 
     Code
       epi_recipe(toy_epi_df) %>% check_enough_data(x, y, n = 2 * n - 1, drop_na = TRUE) %>%
-        prep(toy_epi_df) %>% bake(new_data = NULL)
+        prep(toy_epi_df)
     Condition
-      Error in `prep()`:
-      ! The following columns don't have enough data to predict: x and y.
+      Error in `check_enough_data_core()`:
+      ! The following columns don't have enough data to train: x.
 
 # check_enough_data works on unpooled data
 
     Code
       epi_recipe(toy_epi_df) %>% check_enough_data(x, y, n = n + 1, epi_keys = "geo_value",
-      drop_na = FALSE) %>% prep(toy_epi_df) %>% bake(new_data = NULL)
+      drop_na = FALSE) %>% prep(toy_epi_df)
     Condition
-      Error in `prep()`:
-      ! The following columns don't have enough data to predict: x and y.
+      Error in `check_enough_data_core()`:
+      ! The following columns don't have enough data to train: x and y.
 
 ---
 
     Code
       epi_recipe(toy_epi_df) %>% check_enough_data(x, y, n = 2 * n - 3, epi_keys = "geo_value",
-      drop_na = TRUE) %>% prep(toy_epi_df) %>% bake(new_data = NULL)
+      drop_na = TRUE) %>% prep(toy_epi_df)
     Condition
-      Error in `prep()`:
-      ! The following columns don't have enough data to predict: x and y.
+      Error in `check_enough_data_core()`:
+      ! The following columns don't have enough data to train: x and y.
+
+# check_enough_data only checks train data when skip = FALSE
+
+    Code
+      forecaster %>% predict(new_data = toy_test_data %>% filter(time_value >
+        "2020-01-08"))
+    Condition
+      Error in `check_enough_data_core()`:
+      ! The following columns don't have enough data to predict: x.
 
 # check_enough_data works with all_predictors() downstream of constructed terms
 
     Code
       epi_recipe(toy_epi_df) %>% step_epi_lag(x, lag = c(1, 2)) %>% check_enough_data(
-        all_predictors(), y, n = 2 * n - 4) %>% prep(toy_epi_df) %>% bake(new_data = NULL)
+        all_predictors(), y, n = 2 * n - 4) %>% prep(toy_epi_df)
     Condition
-      Error in `prep()`:
-      ! The following columns don't have enough data to predict: lag_1_x, lag_2_x, and y.
+      Error in `check_enough_data_core()`:
+      ! The following columns don't have enough data to train: no single column, but the combination of lag_1_x, lag_2_x, y.
 
diff --git a/tests/testthat/test-check_enough_data.R b/tests/testthat/test-check_enough_data.R
index 06526ace7..f31bc47ba 100644
--- a/tests/testthat/test-check_enough_data.R
+++ b/tests/testthat/test-check_enough_data.R
@@ -27,16 +27,14 @@ test_that("check_enough_data works on pooled data", {
     error = TRUE,
     epi_recipe(toy_epi_df) %>%
       check_enough_data(x, y, n = 2 * n + 1, drop_na = FALSE) %>%
-      prep(toy_epi_df) %>%
-      bake(new_data = NULL)
+      prep(toy_epi_df)
   )
   # Check drop_na works
   expect_snapshot(
     error = TRUE,
     epi_recipe(toy_epi_df) %>%
       check_enough_data(x, y, n = 2 * n - 1, drop_na = TRUE) %>%
-      prep(toy_epi_df) %>%
-      bake(new_data = NULL)
+      prep(toy_epi_df)
   )
 })
 
@@ -53,16 +51,14 @@ test_that("check_enough_data works on unpooled data", {
     error = TRUE,
     epi_recipe(toy_epi_df) %>%
       check_enough_data(x, y, n = n + 1, epi_keys = "geo_value", drop_na = FALSE) %>%
-      prep(toy_epi_df) %>%
-      bake(new_data = NULL)
+      prep(toy_epi_df)
   )
   # Check drop_na works
   expect_snapshot(
     error = TRUE,
     epi_recipe(toy_epi_df) %>%
       check_enough_data(x, y, n = 2 * n - 3, epi_keys = "geo_value", drop_na = TRUE) %>%
-      prep(toy_epi_df) %>%
-      bake(new_data = NULL)
+      prep(toy_epi_df)
   )
 })
 
@@ -85,7 +81,7 @@ test_that("check_enough_data outputs the correct recipe values", {
   expect_equal(p$geo_value, rep(c("ca", "hi"), each = n))
 })
 
-test_that("check_enough_train_data only checks train data", {
+test_that("check_enough_data only checks train data when skip = FALSE", {
   # Check that the train data has enough data, the test data does not, but
   # the check passes anyway (because it should be applied to training data)
   toy_test_data <- toy_epi_df %>%
@@ -94,16 +90,32 @@ test_that("check_enough_train_data only checks train data", {
     epiprocess::as_epi_df()
   expect_no_error(
     epi_recipe(toy_epi_df) %>%
-      check_enough_train_data(x, y, n = n - 2, epi_keys = "geo_value", skip = TRUE) %>%
+      check_enough_data(x, y, n = n - 2, epi_keys = "geo_value") %>%
       prep(toy_epi_df) %>%
       bake(new_data = toy_test_data)
   )
-  # Same thing, but skip = FALSE
+  # Making sure `skip = TRUE` is working correctly in `predict`
   expect_no_error(
     epi_recipe(toy_epi_df) %>%
-      check_enough_train_data(y, n = n - 2, epi_keys = "geo_value") %>%
-      prep(toy_epi_df) %>%
-      bake(new_data = toy_test_data)
+      add_role(y, new_role = "outcome") %>%
+      check_enough_data(x, n = n - 2, epi_keys = "geo_value") %>%
+      epi_workflow(linear_reg()) %>%
+      fit(toy_epi_df) %>%
+      predict(new_data = toy_test_data %>% filter(time_value > "2020-01-08"))
+  )
+  # making sure it works for skip = FALSE, where there's enough data to train
+  # but not enough to predict
+  expect_no_error(
+    forecaster <- epi_recipe(toy_epi_df) %>%
+      add_role(y, new_role = "outcome") %>%
+      check_enough_data(x, n = 1, epi_keys = "geo_value", skip = FALSE) %>%
+      epi_workflow(linear_reg()) %>%
+      fit(toy_epi_df)
+  )
+  expect_snapshot(
+    error = TRUE,
+    forecaster %>%
+      predict(new_data = toy_test_data %>% filter(time_value > "2020-01-08"))
   )
 })
 
@@ -122,7 +134,6 @@ test_that("check_enough_data works with all_predictors() downstream of construct
     epi_recipe(toy_epi_df) %>%
       step_epi_lag(x, lag = c(1, 2)) %>%
       check_enough_data(all_predictors(), y, n = 2 * n - 4) %>%
-      prep(toy_epi_df) %>%
-      bake(new_data = NULL)
+      prep(toy_epi_df)
   )
 })

From bba711e5ac26a7e506faf786016d043ae02e20d3 Mon Sep 17 00:00:00 2001
From: dsweber2 <david.weber2@pm.me>
Date: Tue, 25 Mar 2025 13:09:00 -0500
Subject: [PATCH 06/12] note about weird branch

---
 R/check_enough_data.R | 1 +
 1 file changed, 1 insertion(+)

diff --git a/R/check_enough_data.R b/R/check_enough_data.R
index a43a93c7a..823485ba6 100644
--- a/R/check_enough_data.R
+++ b/R/check_enough_data.R
@@ -164,6 +164,7 @@ check_enough_data_core <- function(epi_df, step_obj, col_names, train_or_predict
       unlist() %>%
       names(.)[.]
 
+    # if none of the single columns have enough data, that means its the combination of all of them
     if (length(cols_not_enough_data) == 0) {
       cols_not_enough_data <-
         glue::glue("no single column, but the combination of {paste0(col_names, collapse = ', ')}")

From a899188a1d51debe4b08d6371514277a99ec84c7 Mon Sep 17 00:00:00 2001
From: Dmitry Shemetov <dshemetov@ucdavis.edu>
Date: Tue, 25 Mar 2025 11:25:57 -0700
Subject: [PATCH 07/12] doc: add comments

---
 R/check_enough_data.R | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/R/check_enough_data.R b/R/check_enough_data.R
index 823485ba6..f0d594ec2 100644
--- a/R/check_enough_data.R
+++ b/R/check_enough_data.R
@@ -160,11 +160,14 @@ check_enough_data_core <- function(epi_df, step_obj, col_names, train_or_predict
         ),
         .groups = "drop"
       ) %>%
+      # Aggregate across keys (if present)
       summarise(across(all_of(.env$col_names), any), .groups = "drop") %>%
       unlist() %>%
       names(.)[.]
 
-    # if none of the single columns have enough data, that means its the combination of all of them
+    # Either all columns have enough data, in which case this message won't be
+    # sent later or none of the single columns have enough data, that means its
+    # the combination of all of them.
     if (length(cols_not_enough_data) == 0) {
       cols_not_enough_data <-
         glue::glue("no single column, but the combination of {paste0(col_names, collapse = ', ')}")

From 4e37fe632f69e15c23a2429268b4ef714b133b03 Mon Sep 17 00:00:00 2001
From: Dmitry Shemetov <dshemetov@ucdavis.edu>
Date: Tue, 25 Mar 2025 11:35:57 -0700
Subject: [PATCH 08/12] doc: another comment for a weird R idiom

---
 R/check_enough_data.R | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/R/check_enough_data.R b/R/check_enough_data.R
index f0d594ec2..ace7e4e25 100644
--- a/R/check_enough_data.R
+++ b/R/check_enough_data.R
@@ -163,6 +163,7 @@ check_enough_data_core <- function(epi_df, step_obj, col_names, train_or_predict
       # Aggregate across keys (if present)
       summarise(across(all_of(.env$col_names), any), .groups = "drop") %>%
       unlist() %>%
+      # Select the names of the columns that are TRUE
       names(.)[.]
 
     # Either all columns have enough data, in which case this message won't be
@@ -187,6 +188,7 @@ check_enough_data_core <- function(epi_df, step_obj, col_names, train_or_predict
     cols_not_enough_data <- cols_not_enough_data %>%
       summarise(across(all_of(.env$col_names), any), .groups = "drop") %>%
       unlist() %>%
+      # Select the names of the columns that are TRUE
       names(.)[.]
   }
 

From 51a1c6c713f57c51823b03469ad0f3e20b8c84d5 Mon Sep 17 00:00:00 2001
From: dsweber2 <david.weber2@pm.me>
Date: Wed, 26 Mar 2025 11:15:49 -0500
Subject: [PATCH 09/12] extraneous test_data

---
 R/epi_workflow.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R/epi_workflow.R b/R/epi_workflow.R
index 826ae7f1b..81b443e7b 100644
--- a/R/epi_workflow.R
+++ b/R/epi_workflow.R
@@ -270,6 +270,6 @@ forecast.epi_workflow <- function(object, ..., n_recent = NULL, forecast_date =
     hardhat::extract_preprocessor(object),
     object$original_data
   )
-  test_data
+
   predict(object, new_data = test_data)
 }

From 1b05e99f96d9d4510b3c67c956e1f9ba82813031 Mon Sep 17 00:00:00 2001
From: dsweber2 <david.weber2@pm.me>
Date: Thu, 27 Mar 2025 12:48:53 -0500
Subject: [PATCH 10/12] suggestions from Dan

---
 R/arx_forecaster.R                         |  4 +--
 R/check_enough_data.R                      | 42 +++++++++-------------
 tests/testthat/_snaps/check_enough_data.md | 18 +++++-----
 tests/testthat/test-check_enough_data.R    | 24 ++++++-------
 4 files changed, 40 insertions(+), 48 deletions(-)

diff --git a/R/arx_forecaster.R b/R/arx_forecaster.R
index 3a87719eb..3cbcdeb4a 100644
--- a/R/arx_forecaster.R
+++ b/R/arx_forecaster.R
@@ -172,13 +172,13 @@ arx_fcast_epi_workflow <- function(
   r <- r %>%
     step_epi_naomit() %>%
     step_training_window(n_recent = args_list$n_training) %>%
-    check_enough_data(all_predictors(), n = 1, skip = FALSE)
+    check_enough_data(all_predictors(), min_data_points = 1, skip = FALSE)
 
   if (!is.null(args_list$check_enough_data_n)) {
     r <- r %>% check_enough_data(
       all_predictors(),
       all_outcomes(),
-      n = args_list$check_enough_data_n,
+      min_data_points = args_list$check_enough_data_n,
       epi_keys = args_list$check_enough_data_epi_keys,
       drop_na = FALSE
     )
diff --git a/R/check_enough_data.R b/R/check_enough_data.R
index ace7e4e25..2a434f6a8 100644
--- a/R/check_enough_data.R
+++ b/R/check_enough_data.R
@@ -8,8 +8,8 @@
 #' @param ... One or more selector functions to choose variables for this check.
 #'  See [selections()] for more details. You will usually want to use
 #'  [recipes::all_predictors()] and/or [recipes::all_outcomes()] here.
-#' @param n The minimum number of data points required for training. If this is
-#'   NULL, the total number of predictors will be used.
+#' @param min_data_points The minimum number of data points required for
+#'   training. If this is NULL, the total number of predictors will be used.
 #' @param epi_keys A character vector of column names on which to group the data
 #'   and check threshold within each group. Useful if your forecaster trains
 #'   per group (for example, per geo_value).
@@ -18,8 +18,6 @@
 #'  created.
 #' @param trained A logical for whether the selectors in `...`
 #' have been resolved by [prep()].
-#' @param columns An internal argument that tracks which columns are evaluated
-#'   for this check. Should not be used by the user.
 #' @param id A character string that is unique to this check to identify it.
 #' @param skip A logical. If `TRUE`, only training data is checked, while if
 #'   `FALSE`, both training and predicting data is checked. Technically, this
@@ -46,24 +44,23 @@
 check_enough_data <-
   function(recipe,
            ...,
-           n = NULL,
+           min_data_points = NULL,
            epi_keys = NULL,
            drop_na = TRUE,
            role = NA,
            trained = FALSE,
-           columns = NULL,
            skip = TRUE,
            id = rand_id("enough_data")) {
     recipes::add_check(
       recipe,
       check_enough_data_new(
-        n = n,
+        min_data_points = min_data_points,
         epi_keys = epi_keys,
         drop_na = drop_na,
         terms = enquos(...),
         role = role,
         trained = trained,
-        columns = columns,
+        columns = NULL,
         skip = skip,
         id = id
       )
@@ -71,11 +68,12 @@ check_enough_data <-
   }
 
 check_enough_data_new <-
-  function(n, epi_keys, drop_na, terms, role, trained, columns, skip, id) {
+  function(min_data_points, epi_keys, drop_na, terms,
+           role, trained, columns, skip, id) {
     recipes::check(
       subclass = "enough_data",
       prefix = "check_",
-      n = n,
+      min_data_points = min_data_points,
       epi_keys = epi_keys,
       drop_na = drop_na,
       terms = terms,
@@ -90,15 +88,14 @@ check_enough_data_new <-
 #' @export
 prep.check_enough_data <- function(x, training, info = NULL, ...) {
   col_names <- recipes::recipes_eval_select(x$terms, training, info)
-  if (is.null(x$n)) {
-    x$n <- length(col_names)
+  if (is.null(x$min_data_points)) {
+    x$min_data_points <- length(col_names)
   }
 
   check_enough_data_core(training, x, col_names, "train")
 
-
   check_enough_data_new(
-    n = x$n,
+    min_data_points = x$min_data_points,
     epi_keys = x$epi_keys,
     drop_na = x$drop_na,
     terms = x$terms,
@@ -119,7 +116,7 @@ bake.check_enough_data <- function(object, new_data, ...) {
 
 #' @export
 print.check_enough_data <- function(x, width = max(20, options()$width - 30), ...) {
-  title <- paste0("Check enough data (n = ", x$n, ") for ")
+  title <- paste0("Check enough data (n = ", x$min_data_points, ") for ")
   recipes::print_step(x$columns, x$terms, x$trained, title, width)
   invisible(x)
 }
@@ -132,7 +129,7 @@ tidy.check_enough_data <- function(x, ...) {
     res <- tibble(terms = recipes::sel2char(x$terms))
   }
   res$id <- x$id
-  res$n <- x$n
+  res$min_data_points <- x$min_data_points
   res$epi_keys <- x$epi_keys
   res$drop_na <- x$drop_na
   res
@@ -145,18 +142,18 @@ check_enough_data_core <- function(epi_df, step_obj, col_names, train_or_predict
     any_missing_data <- epi_df %>%
       mutate(any_are_na = rowSums(across(any_of(.env$col_names), ~ is.na(.x))) > 0) %>%
       # count the number of rows where they're all not na
-      summarise(sum(any_are_na == 0) < .env$step_obj$n, .groups = "drop")
+      summarise(sum(any_are_na == 0) < .env$step_obj$min_data_points, .groups = "drop")
     any_missing_data <- any_missing_data %>%
       summarize(across(all_of(setdiff(names(any_missing_data), step_obj$epi_keys)), any)) %>%
       any()
 
-    # figuring out which individual columns (if any) are to blame for this darth
+    # figuring out which individual columns (if any) are to blame for this dearth
     # of data
     cols_not_enough_data <- epi_df %>%
       summarise(
         across(
           all_of(.env$col_names),
-          ~ sum(!is.na(.x)) < .env$step_obj$n
+          ~ sum(!is.na(.x)) < .env$step_obj$min_data_points
         ),
         .groups = "drop"
       ) %>%
@@ -176,12 +173,7 @@ check_enough_data_core <- function(epi_df, step_obj, col_names, train_or_predict
   } else {
     # if we're not dropping na values, just count
     cols_not_enough_data <- epi_df %>%
-      summarise(
-        across(
-          all_of(.env$col_names),
-          ~ dplyr::n() < .env$step_obj$n
-        )
-      )
+      summarise(across(all_of(.env$col_names), ~ dplyr::n() < .env$step_obj$min_data_points))
     any_missing_data <- cols_not_enough_data %>%
       summarize(across(all_of(.env$col_names), all)) %>%
       all()
diff --git a/tests/testthat/_snaps/check_enough_data.md b/tests/testthat/_snaps/check_enough_data.md
index 72ce8af99..f98293354 100644
--- a/tests/testthat/_snaps/check_enough_data.md
+++ b/tests/testthat/_snaps/check_enough_data.md
@@ -1,8 +1,8 @@
 # check_enough_data works on pooled data
 
     Code
-      epi_recipe(toy_epi_df) %>% check_enough_data(x, y, n = 2 * n + 1, drop_na = FALSE) %>%
-        prep(toy_epi_df)
+      epi_recipe(toy_epi_df) %>% check_enough_data(x, y, min_data_points = 2 * n + 1,
+      drop_na = FALSE) %>% prep(toy_epi_df)
     Condition
       Error in `check_enough_data_core()`:
       ! The following columns don't have enough data to train: x and y.
@@ -10,8 +10,8 @@
 ---
 
     Code
-      epi_recipe(toy_epi_df) %>% check_enough_data(x, y, n = 2 * n - 1, drop_na = TRUE) %>%
-        prep(toy_epi_df)
+      epi_recipe(toy_epi_df) %>% check_enough_data(x, y, min_data_points = 2 * n - 1,
+      drop_na = TRUE) %>% prep(toy_epi_df)
     Condition
       Error in `check_enough_data_core()`:
       ! The following columns don't have enough data to train: x.
@@ -19,8 +19,8 @@
 # check_enough_data works on unpooled data
 
     Code
-      epi_recipe(toy_epi_df) %>% check_enough_data(x, y, n = n + 1, epi_keys = "geo_value",
-      drop_na = FALSE) %>% prep(toy_epi_df)
+      epi_recipe(toy_epi_df) %>% check_enough_data(x, y, min_data_points = n + 1,
+      epi_keys = "geo_value", drop_na = FALSE) %>% prep(toy_epi_df)
     Condition
       Error in `check_enough_data_core()`:
       ! The following columns don't have enough data to train: x and y.
@@ -28,8 +28,8 @@
 ---
 
     Code
-      epi_recipe(toy_epi_df) %>% check_enough_data(x, y, n = 2 * n - 3, epi_keys = "geo_value",
-      drop_na = TRUE) %>% prep(toy_epi_df)
+      epi_recipe(toy_epi_df) %>% check_enough_data(x, y, min_data_points = 2 * n - 3,
+      epi_keys = "geo_value", drop_na = TRUE) %>% prep(toy_epi_df)
     Condition
       Error in `check_enough_data_core()`:
       ! The following columns don't have enough data to train: x and y.
@@ -47,7 +47,7 @@
 
     Code
       epi_recipe(toy_epi_df) %>% step_epi_lag(x, lag = c(1, 2)) %>% check_enough_data(
-        all_predictors(), y, n = 2 * n - 4) %>% prep(toy_epi_df)
+        all_predictors(), y, min_data_points = 2 * n - 4) %>% prep(toy_epi_df)
     Condition
       Error in `check_enough_data_core()`:
       ! The following columns don't have enough data to train: no single column, but the combination of lag_1_x, lag_2_x, y.
diff --git a/tests/testthat/test-check_enough_data.R b/tests/testthat/test-check_enough_data.R
index f31bc47ba..e2398e5c6 100644
--- a/tests/testthat/test-check_enough_data.R
+++ b/tests/testthat/test-check_enough_data.R
@@ -18,7 +18,7 @@ test_that("check_enough_data works on pooled data", {
   # Check both columns have enough data
   expect_no_error(
     epi_recipe(toy_epi_df) %>%
-      check_enough_data(x, y, n = 2 * n, drop_na = FALSE) %>%
+      check_enough_data(x, y, min_data_points = 2 * n, drop_na = FALSE) %>%
       prep(toy_epi_df) %>%
       bake(new_data = NULL)
   )
@@ -26,14 +26,14 @@ test_that("check_enough_data works on pooled data", {
   expect_snapshot(
     error = TRUE,
     epi_recipe(toy_epi_df) %>%
-      check_enough_data(x, y, n = 2 * n + 1, drop_na = FALSE) %>%
+      check_enough_data(x, y, min_data_points = 2 * n + 1, drop_na = FALSE) %>%
       prep(toy_epi_df)
   )
   # Check drop_na works
   expect_snapshot(
     error = TRUE,
     epi_recipe(toy_epi_df) %>%
-      check_enough_data(x, y, n = 2 * n - 1, drop_na = TRUE) %>%
+      check_enough_data(x, y, min_data_points = 2 * n - 1, drop_na = TRUE) %>%
       prep(toy_epi_df)
   )
 })
@@ -42,7 +42,7 @@ test_that("check_enough_data works on unpooled data", {
   # Check both columns have enough data
   expect_no_error(
     epi_recipe(toy_epi_df) %>%
-      check_enough_data(x, y, n = n, epi_keys = "geo_value", drop_na = FALSE) %>%
+      check_enough_data(x, y, min_data_points = n, epi_keys = "geo_value", drop_na = FALSE) %>%
       prep(toy_epi_df) %>%
       bake(new_data = NULL)
   )
@@ -50,14 +50,14 @@ test_that("check_enough_data works on unpooled data", {
   expect_snapshot(
     error = TRUE,
     epi_recipe(toy_epi_df) %>%
-      check_enough_data(x, y, n = n + 1, epi_keys = "geo_value", drop_na = FALSE) %>%
+      check_enough_data(x, y, min_data_points = n + 1, epi_keys = "geo_value", drop_na = FALSE) %>%
       prep(toy_epi_df)
   )
   # Check drop_na works
   expect_snapshot(
     error = TRUE,
     epi_recipe(toy_epi_df) %>%
-      check_enough_data(x, y, n = 2 * n - 3, epi_keys = "geo_value", drop_na = TRUE) %>%
+      check_enough_data(x, y, min_data_points = 2 * n - 3, epi_keys = "geo_value", drop_na = TRUE) %>%
       prep(toy_epi_df)
   )
 })
@@ -65,7 +65,7 @@ test_that("check_enough_data works on unpooled data", {
 test_that("check_enough_data outputs the correct recipe values", {
   expect_no_error(
     p <- epi_recipe(toy_epi_df) %>%
-      check_enough_data(x, y, n = 2 * n - 2) %>%
+      check_enough_data(x, y, min_data_points = 2 * n - 2) %>%
       prep(toy_epi_df) %>%
       bake(new_data = NULL)
   )
@@ -90,7 +90,7 @@ test_that("check_enough_data only checks train data when skip = FALSE", {
     epiprocess::as_epi_df()
   expect_no_error(
     epi_recipe(toy_epi_df) %>%
-      check_enough_data(x, y, n = n - 2, epi_keys = "geo_value") %>%
+      check_enough_data(x, y, min_data_points = n - 2, epi_keys = "geo_value") %>%
       prep(toy_epi_df) %>%
       bake(new_data = toy_test_data)
   )
@@ -98,7 +98,7 @@ test_that("check_enough_data only checks train data when skip = FALSE", {
   expect_no_error(
     epi_recipe(toy_epi_df) %>%
       add_role(y, new_role = "outcome") %>%
-      check_enough_data(x, n = n - 2, epi_keys = "geo_value") %>%
+      check_enough_data(x, min_data_points = n - 2, epi_keys = "geo_value") %>%
       epi_workflow(linear_reg()) %>%
       fit(toy_epi_df) %>%
       predict(new_data = toy_test_data %>% filter(time_value > "2020-01-08"))
@@ -108,7 +108,7 @@ test_that("check_enough_data only checks train data when skip = FALSE", {
   expect_no_error(
     forecaster <- epi_recipe(toy_epi_df) %>%
       add_role(y, new_role = "outcome") %>%
-      check_enough_data(x, n = 1, epi_keys = "geo_value", skip = FALSE) %>%
+      check_enough_data(x, min_data_points = 1, epi_keys = "geo_value", skip = FALSE) %>%
       epi_workflow(linear_reg()) %>%
       fit(toy_epi_df)
   )
@@ -125,7 +125,7 @@ test_that("check_enough_data works with all_predictors() downstream of construct
   expect_no_error(
     epi_recipe(toy_epi_df) %>%
       step_epi_lag(x, lag = c(1, 2)) %>%
-      check_enough_data(all_predictors(), y, n = 2 * n - 5) %>%
+      check_enough_data(all_predictors(), y, min_data_points = 2 * n - 5) %>%
       prep(toy_epi_df) %>%
       bake(new_data = NULL)
   )
@@ -133,7 +133,7 @@ test_that("check_enough_data works with all_predictors() downstream of construct
     error = TRUE,
     epi_recipe(toy_epi_df) %>%
       step_epi_lag(x, lag = c(1, 2)) %>%
-      check_enough_data(all_predictors(), y, n = 2 * n - 4) %>%
+      check_enough_data(all_predictors(), y, min_data_points = 2 * n - 4) %>%
       prep(toy_epi_df)
   )
 })

From 14992799db2c974151932f9b5f89678aa129a631 Mon Sep 17 00:00:00 2001
From: dsweber2 <david.weber2@pm.me>
Date: Thu, 27 Mar 2025 13:21:45 -0500
Subject: [PATCH 11/12] doc renaming

---
 man/check_enough_data.Rd | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/man/check_enough_data.Rd b/man/check_enough_data.Rd
index 57c321fd1..31a072e62 100644
--- a/man/check_enough_data.Rd
+++ b/man/check_enough_data.Rd
@@ -7,12 +7,11 @@
 check_enough_data(
   recipe,
   ...,
-  n = NULL,
+  min_data_points = NULL,
   epi_keys = NULL,
   drop_na = TRUE,
   role = NA,
   trained = FALSE,
-  columns = NULL,
   skip = TRUE,
   id = rand_id("enough_data")
 )
@@ -25,8 +24,8 @@ sequence of operations for this recipe.}
 See \code{\link[=selections]{selections()}} for more details. You will usually want to use
 \code{\link[recipes:has_role]{recipes::all_predictors()}} and/or \code{\link[recipes:has_role]{recipes::all_outcomes()}} here.}
 
-\item{n}{The minimum number of data points required for training. If this is
-NULL, the total number of predictors will be used.}
+\item{min_data_points}{The minimum number of data points required for
+training. If this is NULL, the total number of predictors will be used.}
 
 \item{epi_keys}{A character vector of column names on which to group the data
 and check threshold within each group. Useful if your forecaster trains
@@ -40,9 +39,6 @@ created.}
 \item{trained}{A logical for whether the selectors in \code{...}
 have been resolved by \code{\link[=prep]{prep()}}.}
 
-\item{columns}{An internal argument that tracks which columns are evaluated
-for this check. Should not be used by the user.}
-
 \item{skip}{A logical. If \code{TRUE}, only training data is checked, while if
 \code{FALSE}, both training and predicting data is checked. Technically, this
 answers the question "should the check be skipped when the recipe is baked

From 84f991d6fc1eed1e420f3f39065a6ea52107876f Mon Sep 17 00:00:00 2001
From: dsweber2 <david.weber2@pm.me>
Date: Fri, 28 Mar 2025 18:06:11 -0500
Subject: [PATCH 12/12] rename, check for `pre` in names, dplyr::between

---
 R/arx_forecaster.R                         |  4 ++--
 R/canned-epipred.R                         |  2 +-
 R/check_enough_data.R                      | 26 +++++++++++-----------
 man/check_enough_data.Rd                   |  4 ++--
 tests/testthat/_snaps/check_enough_data.md | 10 ++++-----
 tests/testthat/test-arx_forecaster.R       |  2 +-
 tests/testthat/test-check_enough_data.R    | 24 ++++++++++----------
 7 files changed, 36 insertions(+), 36 deletions(-)

diff --git a/R/arx_forecaster.R b/R/arx_forecaster.R
index 3cbcdeb4a..f988490fd 100644
--- a/R/arx_forecaster.R
+++ b/R/arx_forecaster.R
@@ -172,13 +172,13 @@ arx_fcast_epi_workflow <- function(
   r <- r %>%
     step_epi_naomit() %>%
     step_training_window(n_recent = args_list$n_training) %>%
-    check_enough_data(all_predictors(), min_data_points = 1, skip = FALSE)
+    check_enough_data(all_predictors(), min_observations = 1, skip = FALSE)
 
   if (!is.null(args_list$check_enough_data_n)) {
     r <- r %>% check_enough_data(
       all_predictors(),
       all_outcomes(),
-      min_data_points = args_list$check_enough_data_n,
+      min_observations = args_list$check_enough_data_n,
       epi_keys = args_list$check_enough_data_epi_keys,
       drop_na = FALSE
     )
diff --git a/R/canned-epipred.R b/R/canned-epipred.R
index 48e984168..7d53862c2 100644
--- a/R/canned-epipred.R
+++ b/R/canned-epipred.R
@@ -112,7 +112,7 @@ print.canned_epipred <- function(x, name, ...) {
     "At forecast date{?s}: {.val {fds}},",
     "For target date{?s}: {.val {tds}},"
   ))
-  if ("actions" %in% names(x$pre) && "recipe" %in% names(x$pre$actions)) {
+  if ("pre" %in% names(x) && "actions" %in% names(x$pre) && "recipe" %in% names(x$pre$actions)) {
     fit_recipe <- extract_recipe(x$epi_workflow)
     if (detect_step(fit_recipe, "adjust_latency")) {
       is_adj_latency <- map_lgl(fit_recipe$steps, function(x) inherits(x, "step_adjust_latency"))
diff --git a/R/check_enough_data.R b/R/check_enough_data.R
index 2a434f6a8..e830d5e54 100644
--- a/R/check_enough_data.R
+++ b/R/check_enough_data.R
@@ -8,7 +8,7 @@
 #' @param ... One or more selector functions to choose variables for this check.
 #'  See [selections()] for more details. You will usually want to use
 #'  [recipes::all_predictors()] and/or [recipes::all_outcomes()] here.
-#' @param min_data_points The minimum number of data points required for
+#' @param min_observations The minimum number of data points required for
 #'   training. If this is NULL, the total number of predictors will be used.
 #' @param epi_keys A character vector of column names on which to group the data
 #'   and check threshold within each group. Useful if your forecaster trains
@@ -44,7 +44,7 @@
 check_enough_data <-
   function(recipe,
            ...,
-           min_data_points = NULL,
+           min_observations = NULL,
            epi_keys = NULL,
            drop_na = TRUE,
            role = NA,
@@ -54,7 +54,7 @@ check_enough_data <-
     recipes::add_check(
       recipe,
       check_enough_data_new(
-        min_data_points = min_data_points,
+        min_observations = min_observations,
         epi_keys = epi_keys,
         drop_na = drop_na,
         terms = enquos(...),
@@ -68,12 +68,12 @@ check_enough_data <-
   }
 
 check_enough_data_new <-
-  function(min_data_points, epi_keys, drop_na, terms,
+  function(min_observations, epi_keys, drop_na, terms,
            role, trained, columns, skip, id) {
     recipes::check(
       subclass = "enough_data",
       prefix = "check_",
-      min_data_points = min_data_points,
+      min_observations = min_observations,
       epi_keys = epi_keys,
       drop_na = drop_na,
       terms = terms,
@@ -88,14 +88,14 @@ check_enough_data_new <-
 #' @export
 prep.check_enough_data <- function(x, training, info = NULL, ...) {
   col_names <- recipes::recipes_eval_select(x$terms, training, info)
-  if (is.null(x$min_data_points)) {
-    x$min_data_points <- length(col_names)
+  if (is.null(x$min_observations)) {
+    x$min_observations <- length(col_names)
   }
 
   check_enough_data_core(training, x, col_names, "train")
 
   check_enough_data_new(
-    min_data_points = x$min_data_points,
+    min_observations = x$min_observations,
     epi_keys = x$epi_keys,
     drop_na = x$drop_na,
     terms = x$terms,
@@ -116,7 +116,7 @@ bake.check_enough_data <- function(object, new_data, ...) {
 
 #' @export
 print.check_enough_data <- function(x, width = max(20, options()$width - 30), ...) {
-  title <- paste0("Check enough data (n = ", x$min_data_points, ") for ")
+  title <- paste0("Check enough data (n = ", x$min_observations, ") for ")
   recipes::print_step(x$columns, x$terms, x$trained, title, width)
   invisible(x)
 }
@@ -129,7 +129,7 @@ tidy.check_enough_data <- function(x, ...) {
     res <- tibble(terms = recipes::sel2char(x$terms))
   }
   res$id <- x$id
-  res$min_data_points <- x$min_data_points
+  res$min_observations <- x$min_observations
   res$epi_keys <- x$epi_keys
   res$drop_na <- x$drop_na
   res
@@ -142,7 +142,7 @@ check_enough_data_core <- function(epi_df, step_obj, col_names, train_or_predict
     any_missing_data <- epi_df %>%
       mutate(any_are_na = rowSums(across(any_of(.env$col_names), ~ is.na(.x))) > 0) %>%
       # count the number of rows where they're all not na
-      summarise(sum(any_are_na == 0) < .env$step_obj$min_data_points, .groups = "drop")
+      summarise(sum(any_are_na == 0) < .env$step_obj$min_observations, .groups = "drop")
     any_missing_data <- any_missing_data %>%
       summarize(across(all_of(setdiff(names(any_missing_data), step_obj$epi_keys)), any)) %>%
       any()
@@ -153,7 +153,7 @@ check_enough_data_core <- function(epi_df, step_obj, col_names, train_or_predict
       summarise(
         across(
           all_of(.env$col_names),
-          ~ sum(!is.na(.x)) < .env$step_obj$min_data_points
+          ~ sum(!is.na(.x)) < .env$step_obj$min_observations
         ),
         .groups = "drop"
       ) %>%
@@ -173,7 +173,7 @@ check_enough_data_core <- function(epi_df, step_obj, col_names, train_or_predict
   } else {
     # if we're not dropping na values, just count
     cols_not_enough_data <- epi_df %>%
-      summarise(across(all_of(.env$col_names), ~ dplyr::n() < .env$step_obj$min_data_points))
+      summarise(across(all_of(.env$col_names), ~ dplyr::n() < .env$step_obj$min_observations))
     any_missing_data <- cols_not_enough_data %>%
       summarize(across(all_of(.env$col_names), all)) %>%
       all()
diff --git a/man/check_enough_data.Rd b/man/check_enough_data.Rd
index 31a072e62..969caa1d2 100644
--- a/man/check_enough_data.Rd
+++ b/man/check_enough_data.Rd
@@ -7,7 +7,7 @@
 check_enough_data(
   recipe,
   ...,
-  min_data_points = NULL,
+  min_observations = NULL,
   epi_keys = NULL,
   drop_na = TRUE,
   role = NA,
@@ -24,7 +24,7 @@ sequence of operations for this recipe.}
 See \code{\link[=selections]{selections()}} for more details. You will usually want to use
 \code{\link[recipes:has_role]{recipes::all_predictors()}} and/or \code{\link[recipes:has_role]{recipes::all_outcomes()}} here.}
 
-\item{min_data_points}{The minimum number of data points required for
+\item{min_observations}{The minimum number of data points required for
 training. If this is NULL, the total number of predictors will be used.}
 
 \item{epi_keys}{A character vector of column names on which to group the data
diff --git a/tests/testthat/_snaps/check_enough_data.md b/tests/testthat/_snaps/check_enough_data.md
index f98293354..4a6ff336d 100644
--- a/tests/testthat/_snaps/check_enough_data.md
+++ b/tests/testthat/_snaps/check_enough_data.md
@@ -1,7 +1,7 @@
 # check_enough_data works on pooled data
 
     Code
-      epi_recipe(toy_epi_df) %>% check_enough_data(x, y, min_data_points = 2 * n + 1,
+      epi_recipe(toy_epi_df) %>% check_enough_data(x, y, min_observations = 2 * n + 1,
       drop_na = FALSE) %>% prep(toy_epi_df)
     Condition
       Error in `check_enough_data_core()`:
@@ -10,7 +10,7 @@
 ---
 
     Code
-      epi_recipe(toy_epi_df) %>% check_enough_data(x, y, min_data_points = 2 * n - 1,
+      epi_recipe(toy_epi_df) %>% check_enough_data(x, y, min_observations = 2 * n - 1,
       drop_na = TRUE) %>% prep(toy_epi_df)
     Condition
       Error in `check_enough_data_core()`:
@@ -19,7 +19,7 @@
 # check_enough_data works on unpooled data
 
     Code
-      epi_recipe(toy_epi_df) %>% check_enough_data(x, y, min_data_points = n + 1,
+      epi_recipe(toy_epi_df) %>% check_enough_data(x, y, min_observations = n + 1,
       epi_keys = "geo_value", drop_na = FALSE) %>% prep(toy_epi_df)
     Condition
       Error in `check_enough_data_core()`:
@@ -28,7 +28,7 @@
 ---
 
     Code
-      epi_recipe(toy_epi_df) %>% check_enough_data(x, y, min_data_points = 2 * n - 3,
+      epi_recipe(toy_epi_df) %>% check_enough_data(x, y, min_observations = 2 * n - 3,
       epi_keys = "geo_value", drop_na = TRUE) %>% prep(toy_epi_df)
     Condition
       Error in `check_enough_data_core()`:
@@ -47,7 +47,7 @@
 
     Code
       epi_recipe(toy_epi_df) %>% step_epi_lag(x, lag = c(1, 2)) %>% check_enough_data(
-        all_predictors(), y, min_data_points = 2 * n - 4) %>% prep(toy_epi_df)
+        all_predictors(), y, min_observations = 2 * n - 4) %>% prep(toy_epi_df)
     Condition
       Error in `check_enough_data_core()`:
       ! The following columns don't have enough data to train: no single column, but the combination of lag_1_x, lag_2_x, y.
diff --git a/tests/testthat/test-arx_forecaster.R b/tests/testthat/test-arx_forecaster.R
index a72222ddb..d13e6d2ea 100644
--- a/tests/testthat/test-arx_forecaster.R
+++ b/tests/testthat/test-arx_forecaster.R
@@ -32,7 +32,7 @@ test_that("warns if there's not enough data to predict", {
   ) %>%
     mutate(value = seq_len(nrow(.)) + rnorm(nrow(.))) %>%
     # Oct to May (flu season, ish) only:
-    filter(!between(as.POSIXlt(time_value)$mon + 1L, 6L, 9L)) %>%
+    filter(!dplyr::between(as.POSIXlt(time_value)$mon + 1L, 6L, 9L)) %>%
     # and actually, pretend we're around mid-October 2022:
     filter(time_value <= as.Date("2022-10-12")) %>%
     as_epi_df(as_of = as.Date("2022-10-12"))
diff --git a/tests/testthat/test-check_enough_data.R b/tests/testthat/test-check_enough_data.R
index e2398e5c6..3ca388afb 100644
--- a/tests/testthat/test-check_enough_data.R
+++ b/tests/testthat/test-check_enough_data.R
@@ -18,7 +18,7 @@ test_that("check_enough_data works on pooled data", {
   # Check both columns have enough data
   expect_no_error(
     epi_recipe(toy_epi_df) %>%
-      check_enough_data(x, y, min_data_points = 2 * n, drop_na = FALSE) %>%
+      check_enough_data(x, y, min_observations = 2 * n, drop_na = FALSE) %>%
       prep(toy_epi_df) %>%
       bake(new_data = NULL)
   )
@@ -26,14 +26,14 @@ test_that("check_enough_data works on pooled data", {
   expect_snapshot(
     error = TRUE,
     epi_recipe(toy_epi_df) %>%
-      check_enough_data(x, y, min_data_points = 2 * n + 1, drop_na = FALSE) %>%
+      check_enough_data(x, y, min_observations = 2 * n + 1, drop_na = FALSE) %>%
       prep(toy_epi_df)
   )
   # Check drop_na works
   expect_snapshot(
     error = TRUE,
     epi_recipe(toy_epi_df) %>%
-      check_enough_data(x, y, min_data_points = 2 * n - 1, drop_na = TRUE) %>%
+      check_enough_data(x, y, min_observations = 2 * n - 1, drop_na = TRUE) %>%
       prep(toy_epi_df)
   )
 })
@@ -42,7 +42,7 @@ test_that("check_enough_data works on unpooled data", {
   # Check both columns have enough data
   expect_no_error(
     epi_recipe(toy_epi_df) %>%
-      check_enough_data(x, y, min_data_points = n, epi_keys = "geo_value", drop_na = FALSE) %>%
+      check_enough_data(x, y, min_observations = n, epi_keys = "geo_value", drop_na = FALSE) %>%
       prep(toy_epi_df) %>%
       bake(new_data = NULL)
   )
@@ -50,14 +50,14 @@ test_that("check_enough_data works on unpooled data", {
   expect_snapshot(
     error = TRUE,
     epi_recipe(toy_epi_df) %>%
-      check_enough_data(x, y, min_data_points = n + 1, epi_keys = "geo_value", drop_na = FALSE) %>%
+      check_enough_data(x, y, min_observations = n + 1, epi_keys = "geo_value", drop_na = FALSE) %>%
       prep(toy_epi_df)
   )
   # Check drop_na works
   expect_snapshot(
     error = TRUE,
     epi_recipe(toy_epi_df) %>%
-      check_enough_data(x, y, min_data_points = 2 * n - 3, epi_keys = "geo_value", drop_na = TRUE) %>%
+      check_enough_data(x, y, min_observations = 2 * n - 3, epi_keys = "geo_value", drop_na = TRUE) %>%
       prep(toy_epi_df)
   )
 })
@@ -65,7 +65,7 @@ test_that("check_enough_data works on unpooled data", {
 test_that("check_enough_data outputs the correct recipe values", {
   expect_no_error(
     p <- epi_recipe(toy_epi_df) %>%
-      check_enough_data(x, y, min_data_points = 2 * n - 2) %>%
+      check_enough_data(x, y, min_observations = 2 * n - 2) %>%
       prep(toy_epi_df) %>%
       bake(new_data = NULL)
   )
@@ -90,7 +90,7 @@ test_that("check_enough_data only checks train data when skip = FALSE", {
     epiprocess::as_epi_df()
   expect_no_error(
     epi_recipe(toy_epi_df) %>%
-      check_enough_data(x, y, min_data_points = n - 2, epi_keys = "geo_value") %>%
+      check_enough_data(x, y, min_observations = n - 2, epi_keys = "geo_value") %>%
       prep(toy_epi_df) %>%
       bake(new_data = toy_test_data)
   )
@@ -98,7 +98,7 @@ test_that("check_enough_data only checks train data when skip = FALSE", {
   expect_no_error(
     epi_recipe(toy_epi_df) %>%
       add_role(y, new_role = "outcome") %>%
-      check_enough_data(x, min_data_points = n - 2, epi_keys = "geo_value") %>%
+      check_enough_data(x, min_observations = n - 2, epi_keys = "geo_value") %>%
       epi_workflow(linear_reg()) %>%
       fit(toy_epi_df) %>%
       predict(new_data = toy_test_data %>% filter(time_value > "2020-01-08"))
@@ -108,7 +108,7 @@ test_that("check_enough_data only checks train data when skip = FALSE", {
   expect_no_error(
     forecaster <- epi_recipe(toy_epi_df) %>%
       add_role(y, new_role = "outcome") %>%
-      check_enough_data(x, min_data_points = 1, epi_keys = "geo_value", skip = FALSE) %>%
+      check_enough_data(x, min_observations = 1, epi_keys = "geo_value", skip = FALSE) %>%
       epi_workflow(linear_reg()) %>%
       fit(toy_epi_df)
   )
@@ -125,7 +125,7 @@ test_that("check_enough_data works with all_predictors() downstream of construct
   expect_no_error(
     epi_recipe(toy_epi_df) %>%
       step_epi_lag(x, lag = c(1, 2)) %>%
-      check_enough_data(all_predictors(), y, min_data_points = 2 * n - 5) %>%
+      check_enough_data(all_predictors(), y, min_observations = 2 * n - 5) %>%
       prep(toy_epi_df) %>%
       bake(new_data = NULL)
   )
@@ -133,7 +133,7 @@ test_that("check_enough_data works with all_predictors() downstream of construct
     error = TRUE,
     epi_recipe(toy_epi_df) %>%
       step_epi_lag(x, lag = c(1, 2)) %>%
-      check_enough_data(all_predictors(), y, min_data_points = 2 * n - 4) %>%
+      check_enough_data(all_predictors(), y, min_observations = 2 * n - 4) %>%
       prep(toy_epi_df)
   )
 })