From f421a2d29be5774970b438c5a6c8ad0de35bbca2 Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Fri, 21 Mar 2025 17:56:25 -0500 Subject: [PATCH 01/12] using check_enough_train_data in practice --- R/arx_forecaster.R | 4 +++- R/check_enough_train_data.R | 24 +++++++++++++++++++++--- R/epi_workflow.R | 2 +- tests/testthat/test-arx_forecaster.R | 19 +++++++++++++++++++ 4 files changed, 44 insertions(+), 5 deletions(-) diff --git a/R/arx_forecaster.R b/R/arx_forecaster.R index fe9128c00..fd8303658 100644 --- a/R/arx_forecaster.R +++ b/R/arx_forecaster.R @@ -171,7 +171,9 @@ arx_fcast_epi_workflow <- function( step_epi_ahead(!!outcome, ahead = args_list$ahead) r <- r %>% step_epi_naomit() %>% - step_training_window(n_recent = args_list$n_training) + step_training_window(n_recent = args_list$n_training) %>% + check_enough_train_data(all_predictors(), skip = FALSE) + if (!is.null(args_list$check_enough_data_n)) { r <- r %>% check_enough_train_data( all_predictors(), diff --git a/R/check_enough_train_data.R b/R/check_enough_train_data.R index 1279a3712..d71daecc3 100644 --- a/R/check_enough_train_data.R +++ b/R/check_enough_train_data.R @@ -47,7 +47,7 @@ check_enough_train_data <- role = NA, trained = FALSE, columns = NULL, - skip = TRUE, + skip = FALSE, id = rand_id("enough_train_data")) { recipes::add_check( recipe, @@ -90,7 +90,7 @@ prep.check_enough_train_data <- function(x, training, info = NULL, ...) { } if (x$drop_na) { - training <- tidyr::drop_na(training) + training <- tidyr::drop_na(training, any_of(unname(col_names))) } cols_not_enough_data <- training %>% group_by(across(all_of(.env$x$epi_keys))) %>% @@ -101,7 +101,8 @@ prep.check_enough_train_data <- function(x, training, info = NULL, ...) { if (length(cols_not_enough_data) > 0) { cli_abort( - "The following columns don't have enough data to predict: {cols_not_enough_data}." + "The following columns don't have enough data to predict: {cols_not_enough_data}.", + class = "epipredict__not_enough_train_data" ) } @@ -120,6 +121,23 @@ prep.check_enough_train_data <- function(x, training, info = NULL, ...) { #' @export bake.check_enough_train_data <- function(object, new_data, ...) { + col_names <- object$columns + if (object$drop_na) { + newish_data <- tidyr::drop_na(new_data, any_of(unname(col_names))) + } + cols_not_enough_data <- newish_data %>% + group_by(across(all_of(.env$object$epi_keys))) %>% + summarise(across(all_of(.env$col_names), ~ dplyr::n() < .env$object$n), .groups = "drop") %>% + summarise(across(all_of(.env$col_names), any), .groups = "drop") %>% + unlist() %>% + names(.)[.] + + if (length(cols_not_enough_data) > 0) { + cli_abort( + "The following columns don't have enough data to predict: {cols_not_enough_data}.", + class = "epipredict__not_enough_train_data" + ) + } new_data } diff --git a/R/epi_workflow.R b/R/epi_workflow.R index 81b443e7b..826ae7f1b 100644 --- a/R/epi_workflow.R +++ b/R/epi_workflow.R @@ -270,6 +270,6 @@ forecast.epi_workflow <- function(object, ..., n_recent = NULL, forecast_date = hardhat::extract_preprocessor(object), object$original_data ) - + test_data predict(object, new_data = test_data) } diff --git a/tests/testthat/test-arx_forecaster.R b/tests/testthat/test-arx_forecaster.R index 0f2b9bd16..cdc21769d 100644 --- a/tests/testthat/test-arx_forecaster.R +++ b/tests/testthat/test-arx_forecaster.R @@ -24,3 +24,22 @@ test_that("arx_forecaster errors if forecast date, target date, and ahead are in class = "epipredict__arx_args__inconsistent_target_ahead_forecaste_date" ) }) + +test_that("warns if there's not enough data to predict", { + edf <- tibble( + geo_value = "ct", + time_value = seq(as.Date("2020-10-01"), as.Date("2023-05-31"), by = "day"), + ) %>% + mutate(value = seq_len(nrow(.)) + rnorm(nrow(.))) %>% + # Oct to May (flu season, ish) only: + filter(!between(as.POSIXlt(time_value)$mon + 1L, 6L, 9L)) %>% + # and actually, pretend we're around mid-October 2022: + filter(time_value <= as.Date("2022-10-12")) %>% + as_epi_df(as_of = as.Date("2022-10-12")) + edf %>% filter(time_value > "2022-08-01") + + expect_error( + edf %>% arx_forecaster("value"), + class = "epipredict__not_enough_train_data" + ) +}) From b06e7e68488c2153e868aa74259228a0072abc15 Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Fri, 21 Mar 2025 17:59:58 -0500 Subject: [PATCH 02/12] news & description --- DESCRIPTION | 2 +- NEWS.md | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index d6f37ef63..36a84f425 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: epipredict Title: Basic epidemiology forecasting methods -Version: 0.1.13 +Version: 0.1.14 Authors@R: c( person("Daniel J.", "McDonald", , "daniel@stat.ubc.ca", role = c("aut", "cre")), person("Ryan", "Tibshirani", , "ryantibs@cmu.edu", role = "aut"), diff --git a/NEWS.md b/NEWS.md index e117f3f52..79daf4fdf 100644 --- a/NEWS.md +++ b/NEWS.md @@ -33,6 +33,7 @@ Pre-1.0.0 numbering scheme: 0.x will indicate releases, while 0.0.x will indicat - Add `climatological_forecaster()` to automatically create climate baselines - Replace `dist_quantiles()` with `hardhat::quantile_pred()` - Allow `quantile()` to threshold to an interval if desired (#434) +- `arx_forecaster()` detects if there's enough data to predict ## Bug fixes From 2940f9576cbc8c4ba0784d03c05f2908b2b5dc54 Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Mon, 24 Mar 2025 10:39:08 -0500 Subject: [PATCH 03/12] actually passing all the tests --- R/arx_forecaster.R | 2 +- R/check_enough_train_data.R | 6 ++++-- tests/testthat/_snaps/check_enough_train_data.md | 2 +- tests/testthat/test-check_enough_train_data.R | 11 ++++++----- tests/testthat/test-layer_residual_quantiles.R | 2 +- 5 files changed, 13 insertions(+), 10 deletions(-) diff --git a/R/arx_forecaster.R b/R/arx_forecaster.R index fd8303658..97bd81f08 100644 --- a/R/arx_forecaster.R +++ b/R/arx_forecaster.R @@ -172,7 +172,7 @@ arx_fcast_epi_workflow <- function( r <- r %>% step_epi_naomit() %>% step_training_window(n_recent = args_list$n_training) %>% - check_enough_train_data(all_predictors(), skip = FALSE) + check_enough_train_data(all_predictors(), n = args_list$check_enough_data_n, skip = FALSE) if (!is.null(args_list$check_enough_data_n)) { r <- r %>% check_enough_train_data( diff --git a/R/check_enough_train_data.R b/R/check_enough_train_data.R index d71daecc3..3b2af6298 100644 --- a/R/check_enough_train_data.R +++ b/R/check_enough_train_data.R @@ -123,9 +123,11 @@ prep.check_enough_train_data <- function(x, training, info = NULL, ...) { bake.check_enough_train_data <- function(object, new_data, ...) { col_names <- object$columns if (object$drop_na) { - newish_data <- tidyr::drop_na(new_data, any_of(unname(col_names))) + non_na_data <- tidyr::drop_na(new_data, any_of(unname(col_names))) + } else { + non_na_data <- new_data } - cols_not_enough_data <- newish_data %>% + cols_not_enough_data <- non_na_data %>% group_by(across(all_of(.env$object$epi_keys))) %>% summarise(across(all_of(.env$col_names), ~ dplyr::n() < .env$object$n), .groups = "drop") %>% summarise(across(all_of(.env$col_names), any), .groups = "drop") %>% diff --git a/tests/testthat/_snaps/check_enough_train_data.md b/tests/testthat/_snaps/check_enough_train_data.md index 8f2389acb..8f852092a 100644 --- a/tests/testthat/_snaps/check_enough_train_data.md +++ b/tests/testthat/_snaps/check_enough_train_data.md @@ -38,7 +38,7 @@ Code epi_recipe(toy_epi_df) %>% step_epi_lag(x, lag = c(1, 2)) %>% - check_enough_train_data(all_predictors(), y, n = 2 * n - 5) %>% prep( + check_enough_train_data(all_predictors(), y, n = 2 * n - 4) %>% prep( toy_epi_df) %>% bake(new_data = NULL) Condition Error in `prep()`: diff --git a/tests/testthat/test-check_enough_train_data.R b/tests/testthat/test-check_enough_train_data.R index 9b2ef5f34..11e86a915 100644 --- a/tests/testthat/test-check_enough_train_data.R +++ b/tests/testthat/test-check_enough_train_data.R @@ -94,25 +94,26 @@ test_that("check_enough_train_data only checks train data", { epiprocess::as_epi_df() expect_no_error( epi_recipe(toy_epi_df) %>% - check_enough_train_data(x, y, n = n - 2, epi_keys = "geo_value") %>% + check_enough_train_data(x, y, n = n - 2, epi_keys = "geo_value", skip = TRUE) %>% prep(toy_epi_df) %>% bake(new_data = toy_test_data) ) # Same thing, but skip = FALSE expect_no_error( epi_recipe(toy_epi_df) %>% - check_enough_train_data(y, n = n - 2, epi_keys = "geo_value", skip = FALSE) %>% + check_enough_train_data(y, n = n - 2, epi_keys = "geo_value") %>% prep(toy_epi_df) %>% bake(new_data = toy_test_data) ) }) test_that("check_enough_train_data works with all_predictors() downstream of constructed terms", { - # With a lag of 2, we will get 2 * n - 6 non-NA rows + # With a lag of 2, we will get 2 * n - 5 non-NA rows (NA's in x but not in the + # lags don't count) expect_no_error( epi_recipe(toy_epi_df) %>% step_epi_lag(x, lag = c(1, 2)) %>% - check_enough_train_data(all_predictors(), y, n = 2 * n - 6) %>% + check_enough_train_data(all_predictors(), y, n = 2 * n - 5) %>% prep(toy_epi_df) %>% bake(new_data = NULL) ) @@ -120,7 +121,7 @@ test_that("check_enough_train_data works with all_predictors() downstream of con error = TRUE, epi_recipe(toy_epi_df) %>% step_epi_lag(x, lag = c(1, 2)) %>% - check_enough_train_data(all_predictors(), y, n = 2 * n - 5) %>% + check_enough_train_data(all_predictors(), y, n = 2 * n - 4) %>% prep(toy_epi_df) %>% bake(new_data = NULL) ) diff --git a/tests/testthat/test-layer_residual_quantiles.R b/tests/testthat/test-layer_residual_quantiles.R index 3d5883c72..2421b8a1c 100644 --- a/tests/testthat/test-layer_residual_quantiles.R +++ b/tests/testthat/test-layer_residual_quantiles.R @@ -103,7 +103,7 @@ test_that("Canned forecasters work with / without", { ) expect_silent( - arx_forecaster(jhu, "death_rate", c("case_rate", "death_rate")) + arx_forecaster(jhu, "death_rate", c("case_rate", "death_rate"), args_list = arx_args_list(check_enough_data_n = 1)) ) expect_silent( flatline_forecaster( From 5ef9823708c2a4abf84099da4d7e4fe720f71a4f Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Mon, 24 Mar 2025 19:25:21 -0500 Subject: [PATCH 04/12] rename, old default skip, news, all_outcomes() for test, docs --- NAMESPACE | 10 ++-- NEWS.md | 3 ++ R/arx_classifier.R | 2 +- R/arx_forecaster.R | 6 +-- ...nough_train_data.R => check_enough_data.R} | 53 ++++++++++--------- _pkgdown.yml | 2 +- ...ugh_train_data.Rd => check_enough_data.Rd} | 37 +++++++------ man/step_adjust_latency.Rd | 10 ++-- ...ugh_train_data.md => check_enough_data.md} | 25 +++++---- tests/testthat/test-arx_forecaster.R | 2 +- ..._train_data.R => test-check_enough_data.R} | 26 ++++----- 11 files changed, 94 insertions(+), 82 deletions(-) rename R/{check_enough_train_data.R => check_enough_data.R} (71%) rename man/{check_enough_train_data.Rd => check_enough_data.Rd} (54%) rename tests/testthat/_snaps/{check_enough_train_data.md => check_enough_data.md} (51%) rename tests/testthat/{test-check_enough_train_data.R => test-check_enough_data.R} (74%) diff --git a/NAMESPACE b/NAMESPACE index 939a1f01a..f39d9bfbc 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -15,7 +15,7 @@ S3method(apply_frosting,epi_workflow) S3method(augment,epi_workflow) S3method(autoplot,canned_epipred) S3method(autoplot,epi_workflow) -S3method(bake,check_enough_train_data) +S3method(bake,check_enough_data) S3method(bake,epi_recipe) S3method(bake,step_adjust_latency) S3method(bake,step_climate) @@ -49,7 +49,7 @@ S3method(key_colnames,recipe) S3method(mean,quantile_pred) S3method(predict,epi_workflow) S3method(predict,flatline) -S3method(prep,check_enough_train_data) +S3method(prep,check_enough_data) S3method(prep,epi_recipe) S3method(prep,step_adjust_latency) S3method(prep,step_climate) @@ -65,7 +65,7 @@ S3method(print,arx_class) S3method(print,arx_fcast) S3method(print,canned_epipred) S3method(print,cdc_baseline_fcast) -S3method(print,check_enough_train_data) +S3method(print,check_enough_data) S3method(print,climate_fcast) S3method(print,epi_recipe) S3method(print,epi_workflow) @@ -109,7 +109,7 @@ S3method(slather,layer_threshold) S3method(slather,layer_unnest) S3method(snap,default) S3method(snap,quantile_pred) -S3method(tidy,check_enough_train_data) +S3method(tidy,check_enough_data) S3method(tidy,frosting) S3method(tidy,layer) S3method(update,layer) @@ -142,7 +142,7 @@ export(autoplot) export(bake) export(cdc_baseline_args_list) export(cdc_baseline_forecaster) -export(check_enough_train_data) +export(check_enough_data) export(clean_f_name) export(climate_args_list) export(climatological_forecaster) diff --git a/NEWS.md b/NEWS.md index 79daf4fdf..24c8e3b73 100644 --- a/NEWS.md +++ b/NEWS.md @@ -20,6 +20,9 @@ Pre-1.0.0 numbering scheme: 0.x will indicate releases, while 0.0.x will indicat - Removes dependence on the `distributional` package, replacing the quantiles with `hardhat::quantile_pred()`. Some associated functions are deprecated with `lifecycle` messages. +- Rename `check_enough_train_data()` to `check_enough_data()`, and generalize it + enough to use as a check on either training or testing. +- Add check for enough data to predict in `arx_forecaster()` ## Improvements diff --git a/R/arx_classifier.R b/R/arx_classifier.R index d1aa292dd..bc8783610 100644 --- a/R/arx_classifier.R +++ b/R/arx_classifier.R @@ -222,7 +222,7 @@ arx_class_epi_workflow <- function( step_training_window(n_recent = args_list$n_training) if (!is.null(args_list$check_enough_data_n)) { - r <- check_enough_train_data( + r <- check_enough_data( r, recipes::all_predictors(), recipes::all_outcomes(), diff --git a/R/arx_forecaster.R b/R/arx_forecaster.R index 97bd81f08..3a87719eb 100644 --- a/R/arx_forecaster.R +++ b/R/arx_forecaster.R @@ -172,12 +172,12 @@ arx_fcast_epi_workflow <- function( r <- r %>% step_epi_naomit() %>% step_training_window(n_recent = args_list$n_training) %>% - check_enough_train_data(all_predictors(), n = args_list$check_enough_data_n, skip = FALSE) + check_enough_data(all_predictors(), n = 1, skip = FALSE) if (!is.null(args_list$check_enough_data_n)) { - r <- r %>% check_enough_train_data( + r <- r %>% check_enough_data( all_predictors(), - !!outcome, + all_outcomes(), n = args_list$check_enough_data_n, epi_keys = args_list$check_enough_data_epi_keys, drop_na = FALSE diff --git a/R/check_enough_train_data.R b/R/check_enough_data.R similarity index 71% rename from R/check_enough_train_data.R rename to R/check_enough_data.R index 3b2af6298..e87984e49 100644 --- a/R/check_enough_train_data.R +++ b/R/check_enough_data.R @@ -1,13 +1,13 @@ #' Check the dataset contains enough data points. #' -#' `check_enough_train_data` creates a *specification* of a recipe +#' `check_enough_data` creates a *specification* of a recipe #' operation that will check if variables contain enough data. #' #' @param recipe A recipe object. The check will be added to the #' sequence of operations for this recipe. #' @param ... One or more selector functions to choose variables for this check. #' See [selections()] for more details. You will usually want to use -#' [recipes::all_predictors()] here. +#' [recipes::all_predictors()] and/or [recipes::all_outcomes()] here. #' @param n The minimum number of data points required for training. If this is #' NULL, the total number of predictors will be used. #' @param epi_keys A character vector of column names on which to group the data @@ -21,24 +21,29 @@ #' @param columns An internal argument that tracks which columns are evaluated #' for this check. Should not be used by the user. #' @param id A character string that is unique to this check to identify it. -#' @param skip A logical. Should the check be skipped when the -#' recipe is baked by [bake()]? While all operations are baked -#' when [prep()] is run, some operations may not be able to be -#' conducted on new data (e.g. processing the outcome variable(s)). -#' Care should be taken when using `skip = TRUE` as it may affect -#' the computations for subsequent operations. +#' @param skip A logical. If `TRUE`, only training data is checked, while if +#' `FALSE`, both training and predicting data is checked. Technically, this +#' answers the question "should the check be skipped when the recipe is baked +#' by [bake()]?" While all operations are baked when [prep()] is run, some +#' operations may not be able to be conducted on new data (e.g. processing the +#' outcome variable(s)). Care should be taken when using `skip = TRUE` as it +#' may affect the computations for subsequent operations. #' @family checks #' @export -#' @details This check will break the `bake` function if any of the checked -#' columns have not enough non-NA values. If the check passes, nothing is -#' changed to the data. +#' @details This check will break the `prep` and/or bake function if any of the +#' checked columns have not enough non-NA values. If the check passes, nothing +#' is changed in the data. It is best used after every other step. +#' +#' For checking training data, it is best to set `...` to be +#' `all_predictors(), all_outcomes()`, while for checking prediction data, it +#' is best to set `...` to be `all_predictors()` only, with `n = 1`. #' #' # tidy() results #' #' When you [`tidy()`][tidy.recipe()] this check, a tibble with column #' `terms` (the selectors or variables selected) is returned. #' -check_enough_train_data <- +check_enough_data <- function(recipe, ..., n = NULL, @@ -47,11 +52,11 @@ check_enough_train_data <- role = NA, trained = FALSE, columns = NULL, - skip = FALSE, - id = rand_id("enough_train_data")) { + skip = TRUE, + id = rand_id("enough_data")) { recipes::add_check( recipe, - check_enough_train_data_new( + check_enough_data_new( n = n, epi_keys = epi_keys, drop_na = drop_na, @@ -65,10 +70,10 @@ check_enough_train_data <- ) } -check_enough_train_data_new <- +check_enough_data_new <- function(n, epi_keys, drop_na, terms, role, trained, columns, skip, id) { recipes::check( - subclass = "enough_train_data", + subclass = "enough_data", prefix = "check_", n = n, epi_keys = epi_keys, @@ -83,7 +88,7 @@ check_enough_train_data_new <- } #' @export -prep.check_enough_train_data <- function(x, training, info = NULL, ...) { +prep.check_enough_data <- function(x, training, info = NULL, ...) { col_names <- recipes::recipes_eval_select(x$terms, training, info) if (is.null(x$n)) { x$n <- length(col_names) @@ -102,11 +107,11 @@ prep.check_enough_train_data <- function(x, training, info = NULL, ...) { if (length(cols_not_enough_data) > 0) { cli_abort( "The following columns don't have enough data to predict: {cols_not_enough_data}.", - class = "epipredict__not_enough_train_data" + class = "epipredict__not_enough_data" ) } - check_enough_train_data_new( + check_enough_data_new( n = x$n, epi_keys = x$epi_keys, drop_na = x$drop_na, @@ -120,7 +125,7 @@ prep.check_enough_train_data <- function(x, training, info = NULL, ...) { } #' @export -bake.check_enough_train_data <- function(object, new_data, ...) { +bake.check_enough_data <- function(object, new_data, ...) { col_names <- object$columns if (object$drop_na) { non_na_data <- tidyr::drop_na(new_data, any_of(unname(col_names))) @@ -137,21 +142,21 @@ bake.check_enough_train_data <- function(object, new_data, ...) { if (length(cols_not_enough_data) > 0) { cli_abort( "The following columns don't have enough data to predict: {cols_not_enough_data}.", - class = "epipredict__not_enough_train_data" + class = "epipredict__not_enough_data" ) } new_data } #' @export -print.check_enough_train_data <- function(x, width = max(20, options()$width - 30), ...) { +print.check_enough_data <- function(x, width = max(20, options()$width - 30), ...) { title <- paste0("Check enough data (n = ", x$n, ") for ") recipes::print_step(x$columns, x$terms, x$trained, title, width) invisible(x) } #' @export -tidy.check_enough_train_data <- function(x, ...) { +tidy.check_enough_data <- function(x, ...) { if (recipes::is_trained(x)) { res <- tibble(terms = unname(x$columns)) } else { diff --git a/_pkgdown.yml b/_pkgdown.yml index fe34c3b82..814bf6aa4 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -77,7 +77,7 @@ reference: - title: Epi recipe verification checks contents: - - check_enough_train_data + - check_enough_data - title: Forecast postprocessing desc: Create a series of postprocessing operations diff --git a/man/check_enough_train_data.Rd b/man/check_enough_data.Rd similarity index 54% rename from man/check_enough_train_data.Rd rename to man/check_enough_data.Rd index 57a4a9d78..57c321fd1 100644 --- a/man/check_enough_train_data.Rd +++ b/man/check_enough_data.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/check_enough_train_data.R -\name{check_enough_train_data} -\alias{check_enough_train_data} +% Please edit documentation in R/check_enough_data.R +\name{check_enough_data} +\alias{check_enough_data} \title{Check the dataset contains enough data points.} \usage{ -check_enough_train_data( +check_enough_data( recipe, ..., n = NULL, @@ -14,7 +14,7 @@ check_enough_train_data( trained = FALSE, columns = NULL, skip = TRUE, - id = rand_id("enough_train_data") + id = rand_id("enough_data") ) } \arguments{ @@ -23,7 +23,7 @@ sequence of operations for this recipe.} \item{...}{One or more selector functions to choose variables for this check. See \code{\link[=selections]{selections()}} for more details. You will usually want to use -\code{\link[recipes:has_role]{recipes::all_predictors()}} here.} +\code{\link[recipes:has_role]{recipes::all_predictors()}} and/or \code{\link[recipes:has_role]{recipes::all_outcomes()}} here.} \item{n}{The minimum number of data points required for training. If this is NULL, the total number of predictors will be used.} @@ -43,23 +43,28 @@ have been resolved by \code{\link[=prep]{prep()}}.} \item{columns}{An internal argument that tracks which columns are evaluated for this check. Should not be used by the user.} -\item{skip}{A logical. Should the check be skipped when the -recipe is baked by \code{\link[=bake]{bake()}}? While all operations are baked -when \code{\link[=prep]{prep()}} is run, some operations may not be able to be -conducted on new data (e.g. processing the outcome variable(s)). -Care should be taken when using \code{skip = TRUE} as it may affect -the computations for subsequent operations.} +\item{skip}{A logical. If \code{TRUE}, only training data is checked, while if +\code{FALSE}, both training and predicting data is checked. Technically, this +answers the question "should the check be skipped when the recipe is baked +by \code{\link[=bake]{bake()}}?" While all operations are baked when \code{\link[=prep]{prep()}} is run, some +operations may not be able to be conducted on new data (e.g. processing the +outcome variable(s)). Care should be taken when using \code{skip = TRUE} as it +may affect the computations for subsequent operations.} \item{id}{A character string that is unique to this check to identify it.} } \description{ -\code{check_enough_train_data} creates a \emph{specification} of a recipe +\code{check_enough_data} creates a \emph{specification} of a recipe operation that will check if variables contain enough data. } \details{ -This check will break the \code{bake} function if any of the checked -columns have not enough non-NA values. If the check passes, nothing is -changed to the data. +This check will break the \code{prep} and/or bake function if any of the +checked columns have not enough non-NA values. If the check passes, nothing +is changed in the data. It is best used after every other step. + +For checking training data, it is best to set \code{...} to be +\verb{all_predictors(), all_outcomes()}, while for checking prediction data, it +is best to set \code{...} to be \code{all_predictors()} only, with \code{n = 1}. } \section{tidy() results}{ When you \code{\link[=tidy.recipe]{tidy()}} this check, a tibble with column diff --git a/man/step_adjust_latency.Rd b/man/step_adjust_latency.Rd index 75d674472..9e1bafbd5 100644 --- a/man/step_adjust_latency.Rd +++ b/man/step_adjust_latency.Rd @@ -143,7 +143,7 @@ toy_recipe \%>\% #> #> # A tibble: 8 x 4 #> geo_value time_value a b -#> * +#> #> 1 ca 2015-01-11 100 5 #> 2 ca 2015-01-12 103 10 #> 3 ca 2015-01-13 103 10 @@ -179,7 +179,7 @@ toy_recipe \%>\% #> #> # A tibble: 21 x 7 #> geo_value time_value a b lag_3_a lag_4_b ahead_1_a -#> * +#> #> 1 ca 2015-01-10 NA NA NA NA 100 #> 2 ca 2015-01-11 100 5 NA NA 103 #> 3 ca 2015-01-12 103 10 NA NA NA @@ -227,7 +227,7 @@ toy_recipe \%>\% #> #> # A tibble: 10 x 6 #> geo_value time_value a b lag_0_a ahead_3_a -#> * +#> #> 1 ca 2015-01-08 NA NA NA 100 #> 2 ca 2015-01-09 NA NA NA 103 #> 3 ca 2015-01-11 100 5 100 NA @@ -267,8 +267,8 @@ while this will not: \if{html}{\out{
}}\preformatted{toy_recipe <- epi_recipe(toy_df) \%>\% step_epi_lag(a, lag=0) \%>\% step_adjust_latency(a, method = "extend_lags") -#> Warning: If `method` is "extend_lags" or "locf", then the previous `step_epi_lag`s won't -#> work with modified data. +#> Warning: If `method` is "extend_lags" or "locf", then the previous `step_epi_lag`s won't work with +#> modified data. }\if{html}{\out{
}} If you create columns that you then apply lags to (such as diff --git a/tests/testthat/_snaps/check_enough_train_data.md b/tests/testthat/_snaps/check_enough_data.md similarity index 51% rename from tests/testthat/_snaps/check_enough_train_data.md rename to tests/testthat/_snaps/check_enough_data.md index 8f852092a..37c13b3c3 100644 --- a/tests/testthat/_snaps/check_enough_train_data.md +++ b/tests/testthat/_snaps/check_enough_data.md @@ -1,8 +1,8 @@ -# check_enough_train_data works on pooled data +# check_enough_data works on pooled data Code - epi_recipe(toy_epi_df) %>% check_enough_train_data(x, y, n = 2 * n + 1, - drop_na = FALSE) %>% prep(toy_epi_df) %>% bake(new_data = NULL) + epi_recipe(toy_epi_df) %>% check_enough_data(x, y, n = 2 * n + 1, drop_na = FALSE) %>% + prep(toy_epi_df) %>% bake(new_data = NULL) Condition Error in `prep()`: ! The following columns don't have enough data to predict: x and y. @@ -10,16 +10,16 @@ --- Code - epi_recipe(toy_epi_df) %>% check_enough_train_data(x, y, n = 2 * n - 1, - drop_na = TRUE) %>% prep(toy_epi_df) %>% bake(new_data = NULL) + epi_recipe(toy_epi_df) %>% check_enough_data(x, y, n = 2 * n - 1, drop_na = TRUE) %>% + prep(toy_epi_df) %>% bake(new_data = NULL) Condition Error in `prep()`: ! The following columns don't have enough data to predict: x and y. -# check_enough_train_data works on unpooled data +# check_enough_data works on unpooled data Code - epi_recipe(toy_epi_df) %>% check_enough_train_data(x, y, n = n + 1, epi_keys = "geo_value", + epi_recipe(toy_epi_df) %>% check_enough_data(x, y, n = n + 1, epi_keys = "geo_value", drop_na = FALSE) %>% prep(toy_epi_df) %>% bake(new_data = NULL) Condition Error in `prep()`: @@ -28,18 +28,17 @@ --- Code - epi_recipe(toy_epi_df) %>% check_enough_train_data(x, y, n = 2 * n - 3, - epi_keys = "geo_value", drop_na = TRUE) %>% prep(toy_epi_df) %>% bake(new_data = NULL) + epi_recipe(toy_epi_df) %>% check_enough_data(x, y, n = 2 * n - 3, epi_keys = "geo_value", + drop_na = TRUE) %>% prep(toy_epi_df) %>% bake(new_data = NULL) Condition Error in `prep()`: ! The following columns don't have enough data to predict: x and y. -# check_enough_train_data works with all_predictors() downstream of constructed terms +# check_enough_data works with all_predictors() downstream of constructed terms Code - epi_recipe(toy_epi_df) %>% step_epi_lag(x, lag = c(1, 2)) %>% - check_enough_train_data(all_predictors(), y, n = 2 * n - 4) %>% prep( - toy_epi_df) %>% bake(new_data = NULL) + epi_recipe(toy_epi_df) %>% step_epi_lag(x, lag = c(1, 2)) %>% check_enough_data( + all_predictors(), y, n = 2 * n - 4) %>% prep(toy_epi_df) %>% bake(new_data = NULL) Condition Error in `prep()`: ! The following columns don't have enough data to predict: lag_1_x, lag_2_x, and y. diff --git a/tests/testthat/test-arx_forecaster.R b/tests/testthat/test-arx_forecaster.R index cdc21769d..a72222ddb 100644 --- a/tests/testthat/test-arx_forecaster.R +++ b/tests/testthat/test-arx_forecaster.R @@ -40,6 +40,6 @@ test_that("warns if there's not enough data to predict", { expect_error( edf %>% arx_forecaster("value"), - class = "epipredict__not_enough_train_data" + class = "epipredict__not_enough_data" ) }) diff --git a/tests/testthat/test-check_enough_train_data.R b/tests/testthat/test-check_enough_data.R similarity index 74% rename from tests/testthat/test-check_enough_train_data.R rename to tests/testthat/test-check_enough_data.R index 11e86a915..06526ace7 100644 --- a/tests/testthat/test-check_enough_train_data.R +++ b/tests/testthat/test-check_enough_data.R @@ -14,11 +14,11 @@ toy_epi_df <- tibble::tibble( y = 1:(2 * n) ) %>% epiprocess::as_epi_df() -test_that("check_enough_train_data works on pooled data", { +test_that("check_enough_data works on pooled data", { # Check both columns have enough data expect_no_error( epi_recipe(toy_epi_df) %>% - check_enough_train_data(x, y, n = 2 * n, drop_na = FALSE) %>% + check_enough_data(x, y, n = 2 * n, drop_na = FALSE) %>% prep(toy_epi_df) %>% bake(new_data = NULL) ) @@ -26,7 +26,7 @@ test_that("check_enough_train_data works on pooled data", { expect_snapshot( error = TRUE, epi_recipe(toy_epi_df) %>% - check_enough_train_data(x, y, n = 2 * n + 1, drop_na = FALSE) %>% + check_enough_data(x, y, n = 2 * n + 1, drop_na = FALSE) %>% prep(toy_epi_df) %>% bake(new_data = NULL) ) @@ -34,17 +34,17 @@ test_that("check_enough_train_data works on pooled data", { expect_snapshot( error = TRUE, epi_recipe(toy_epi_df) %>% - check_enough_train_data(x, y, n = 2 * n - 1, drop_na = TRUE) %>% + check_enough_data(x, y, n = 2 * n - 1, drop_na = TRUE) %>% prep(toy_epi_df) %>% bake(new_data = NULL) ) }) -test_that("check_enough_train_data works on unpooled data", { +test_that("check_enough_data works on unpooled data", { # Check both columns have enough data expect_no_error( epi_recipe(toy_epi_df) %>% - check_enough_train_data(x, y, n = n, epi_keys = "geo_value", drop_na = FALSE) %>% + check_enough_data(x, y, n = n, epi_keys = "geo_value", drop_na = FALSE) %>% prep(toy_epi_df) %>% bake(new_data = NULL) ) @@ -52,7 +52,7 @@ test_that("check_enough_train_data works on unpooled data", { expect_snapshot( error = TRUE, epi_recipe(toy_epi_df) %>% - check_enough_train_data(x, y, n = n + 1, epi_keys = "geo_value", drop_na = FALSE) %>% + check_enough_data(x, y, n = n + 1, epi_keys = "geo_value", drop_na = FALSE) %>% prep(toy_epi_df) %>% bake(new_data = NULL) ) @@ -60,16 +60,16 @@ test_that("check_enough_train_data works on unpooled data", { expect_snapshot( error = TRUE, epi_recipe(toy_epi_df) %>% - check_enough_train_data(x, y, n = 2 * n - 3, epi_keys = "geo_value", drop_na = TRUE) %>% + check_enough_data(x, y, n = 2 * n - 3, epi_keys = "geo_value", drop_na = TRUE) %>% prep(toy_epi_df) %>% bake(new_data = NULL) ) }) -test_that("check_enough_train_data outputs the correct recipe values", { +test_that("check_enough_data outputs the correct recipe values", { expect_no_error( p <- epi_recipe(toy_epi_df) %>% - check_enough_train_data(x, y, n = 2 * n - 2) %>% + check_enough_data(x, y, n = 2 * n - 2) %>% prep(toy_epi_df) %>% bake(new_data = NULL) ) @@ -107,13 +107,13 @@ test_that("check_enough_train_data only checks train data", { ) }) -test_that("check_enough_train_data works with all_predictors() downstream of constructed terms", { +test_that("check_enough_data works with all_predictors() downstream of constructed terms", { # With a lag of 2, we will get 2 * n - 5 non-NA rows (NA's in x but not in the # lags don't count) expect_no_error( epi_recipe(toy_epi_df) %>% step_epi_lag(x, lag = c(1, 2)) %>% - check_enough_train_data(all_predictors(), y, n = 2 * n - 5) %>% + check_enough_data(all_predictors(), y, n = 2 * n - 5) %>% prep(toy_epi_df) %>% bake(new_data = NULL) ) @@ -121,7 +121,7 @@ test_that("check_enough_train_data works with all_predictors() downstream of con error = TRUE, epi_recipe(toy_epi_df) %>% step_epi_lag(x, lag = c(1, 2)) %>% - check_enough_train_data(all_predictors(), y, n = 2 * n - 4) %>% + check_enough_data(all_predictors(), y, n = 2 * n - 4) %>% prep(toy_epi_df) %>% bake(new_data = NULL) ) From c5b25681b3221253aa11c18c75b4aa7dc164d25a Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Tue, 25 Mar 2025 11:45:16 -0500 Subject: [PATCH 05/12] fix drop na subtleties, unify prep/bake usage A couple of simultaneous problems that were making this tricky: 1. drop_na can completely remove states 2. checking each column individually misses cases where combinations of the states cause the signal to be left out. 3. checking all columns simultaneously doesn't let the user know which columns to check. --- R/check_enough_data.R | 91 ++++++++++++++-------- tests/testthat/_snaps/check_enough_data.md | 39 ++++++---- tests/testthat/test-check_enough_data.R | 43 ++++++---- 3 files changed, 109 insertions(+), 64 deletions(-) diff --git a/R/check_enough_data.R b/R/check_enough_data.R index e87984e49..a43a93c7a 100644 --- a/R/check_enough_data.R +++ b/R/check_enough_data.R @@ -94,22 +94,8 @@ prep.check_enough_data <- function(x, training, info = NULL, ...) { x$n <- length(col_names) } - if (x$drop_na) { - training <- tidyr::drop_na(training, any_of(unname(col_names))) - } - cols_not_enough_data <- training %>% - group_by(across(all_of(.env$x$epi_keys))) %>% - summarise(across(all_of(.env$col_names), ~ dplyr::n() < .env$x$n), .groups = "drop") %>% - summarise(across(all_of(.env$col_names), any), .groups = "drop") %>% - unlist() %>% - names(.)[.] + check_enough_data_core(training, x, col_names, "train") - if (length(cols_not_enough_data) > 0) { - cli_abort( - "The following columns don't have enough data to predict: {cols_not_enough_data}.", - class = "epipredict__not_enough_data" - ) - } check_enough_data_new( n = x$n, @@ -127,24 +113,7 @@ prep.check_enough_data <- function(x, training, info = NULL, ...) { #' @export bake.check_enough_data <- function(object, new_data, ...) { col_names <- object$columns - if (object$drop_na) { - non_na_data <- tidyr::drop_na(new_data, any_of(unname(col_names))) - } else { - non_na_data <- new_data - } - cols_not_enough_data <- non_na_data %>% - group_by(across(all_of(.env$object$epi_keys))) %>% - summarise(across(all_of(.env$col_names), ~ dplyr::n() < .env$object$n), .groups = "drop") %>% - summarise(across(all_of(.env$col_names), any), .groups = "drop") %>% - unlist() %>% - names(.)[.] - - if (length(cols_not_enough_data) > 0) { - cli_abort( - "The following columns don't have enough data to predict: {cols_not_enough_data}.", - class = "epipredict__not_enough_data" - ) - } + check_enough_data_core(new_data, object, col_names, "predict") new_data } @@ -168,3 +137,59 @@ tidy.check_enough_data <- function(x, ...) { res$drop_na <- x$drop_na res } + +check_enough_data_core <- function(epi_df, step_obj, col_names, train_or_predict) { + epi_df <- epi_df %>% + group_by(across(all_of(.env$step_obj$epi_keys))) + if (step_obj$drop_na) { + any_missing_data <- epi_df %>% + mutate(any_are_na = rowSums(across(any_of(.env$col_names), ~ is.na(.x))) > 0) %>% + # count the number of rows where they're all not na + summarise(sum(any_are_na == 0) < .env$step_obj$n, .groups = "drop") + any_missing_data <- any_missing_data %>% + summarize(across(all_of(setdiff(names(any_missing_data), step_obj$epi_keys)), any)) %>% + any() + + # figuring out which individual columns (if any) are to blame for this darth + # of data + cols_not_enough_data <- epi_df %>% + summarise( + across( + all_of(.env$col_names), + ~ sum(!is.na(.x)) < .env$step_obj$n + ), + .groups = "drop" + ) %>% + summarise(across(all_of(.env$col_names), any), .groups = "drop") %>% + unlist() %>% + names(.)[.] + + if (length(cols_not_enough_data) == 0) { + cols_not_enough_data <- + glue::glue("no single column, but the combination of {paste0(col_names, collapse = ', ')}") + } + } else { + # if we're not dropping na values, just count + cols_not_enough_data <- epi_df %>% + summarise( + across( + all_of(.env$col_names), + ~ dplyr::n() < .env$step_obj$n + ) + ) + any_missing_data <- cols_not_enough_data %>% + summarize(across(all_of(.env$col_names), all)) %>% + all() + cols_not_enough_data <- cols_not_enough_data %>% + summarise(across(all_of(.env$col_names), any), .groups = "drop") %>% + unlist() %>% + names(.)[.] + } + + if (any_missing_data) { + cli_abort( + "The following columns don't have enough data to {train_or_predict}: {cols_not_enough_data}.", + class = "epipredict__not_enough_data" + ) + } +} diff --git a/tests/testthat/_snaps/check_enough_data.md b/tests/testthat/_snaps/check_enough_data.md index 37c13b3c3..72ce8af99 100644 --- a/tests/testthat/_snaps/check_enough_data.md +++ b/tests/testthat/_snaps/check_enough_data.md @@ -2,44 +2,53 @@ Code epi_recipe(toy_epi_df) %>% check_enough_data(x, y, n = 2 * n + 1, drop_na = FALSE) %>% - prep(toy_epi_df) %>% bake(new_data = NULL) + prep(toy_epi_df) Condition - Error in `prep()`: - ! The following columns don't have enough data to predict: x and y. + Error in `check_enough_data_core()`: + ! The following columns don't have enough data to train: x and y. --- Code epi_recipe(toy_epi_df) %>% check_enough_data(x, y, n = 2 * n - 1, drop_na = TRUE) %>% - prep(toy_epi_df) %>% bake(new_data = NULL) + prep(toy_epi_df) Condition - Error in `prep()`: - ! The following columns don't have enough data to predict: x and y. + Error in `check_enough_data_core()`: + ! The following columns don't have enough data to train: x. # check_enough_data works on unpooled data Code epi_recipe(toy_epi_df) %>% check_enough_data(x, y, n = n + 1, epi_keys = "geo_value", - drop_na = FALSE) %>% prep(toy_epi_df) %>% bake(new_data = NULL) + drop_na = FALSE) %>% prep(toy_epi_df) Condition - Error in `prep()`: - ! The following columns don't have enough data to predict: x and y. + Error in `check_enough_data_core()`: + ! The following columns don't have enough data to train: x and y. --- Code epi_recipe(toy_epi_df) %>% check_enough_data(x, y, n = 2 * n - 3, epi_keys = "geo_value", - drop_na = TRUE) %>% prep(toy_epi_df) %>% bake(new_data = NULL) + drop_na = TRUE) %>% prep(toy_epi_df) Condition - Error in `prep()`: - ! The following columns don't have enough data to predict: x and y. + Error in `check_enough_data_core()`: + ! The following columns don't have enough data to train: x and y. + +# check_enough_data only checks train data when skip = FALSE + + Code + forecaster %>% predict(new_data = toy_test_data %>% filter(time_value > + "2020-01-08")) + Condition + Error in `check_enough_data_core()`: + ! The following columns don't have enough data to predict: x. # check_enough_data works with all_predictors() downstream of constructed terms Code epi_recipe(toy_epi_df) %>% step_epi_lag(x, lag = c(1, 2)) %>% check_enough_data( - all_predictors(), y, n = 2 * n - 4) %>% prep(toy_epi_df) %>% bake(new_data = NULL) + all_predictors(), y, n = 2 * n - 4) %>% prep(toy_epi_df) Condition - Error in `prep()`: - ! The following columns don't have enough data to predict: lag_1_x, lag_2_x, and y. + Error in `check_enough_data_core()`: + ! The following columns don't have enough data to train: no single column, but the combination of lag_1_x, lag_2_x, y. diff --git a/tests/testthat/test-check_enough_data.R b/tests/testthat/test-check_enough_data.R index 06526ace7..f31bc47ba 100644 --- a/tests/testthat/test-check_enough_data.R +++ b/tests/testthat/test-check_enough_data.R @@ -27,16 +27,14 @@ test_that("check_enough_data works on pooled data", { error = TRUE, epi_recipe(toy_epi_df) %>% check_enough_data(x, y, n = 2 * n + 1, drop_na = FALSE) %>% - prep(toy_epi_df) %>% - bake(new_data = NULL) + prep(toy_epi_df) ) # Check drop_na works expect_snapshot( error = TRUE, epi_recipe(toy_epi_df) %>% check_enough_data(x, y, n = 2 * n - 1, drop_na = TRUE) %>% - prep(toy_epi_df) %>% - bake(new_data = NULL) + prep(toy_epi_df) ) }) @@ -53,16 +51,14 @@ test_that("check_enough_data works on unpooled data", { error = TRUE, epi_recipe(toy_epi_df) %>% check_enough_data(x, y, n = n + 1, epi_keys = "geo_value", drop_na = FALSE) %>% - prep(toy_epi_df) %>% - bake(new_data = NULL) + prep(toy_epi_df) ) # Check drop_na works expect_snapshot( error = TRUE, epi_recipe(toy_epi_df) %>% check_enough_data(x, y, n = 2 * n - 3, epi_keys = "geo_value", drop_na = TRUE) %>% - prep(toy_epi_df) %>% - bake(new_data = NULL) + prep(toy_epi_df) ) }) @@ -85,7 +81,7 @@ test_that("check_enough_data outputs the correct recipe values", { expect_equal(p$geo_value, rep(c("ca", "hi"), each = n)) }) -test_that("check_enough_train_data only checks train data", { +test_that("check_enough_data only checks train data when skip = FALSE", { # Check that the train data has enough data, the test data does not, but # the check passes anyway (because it should be applied to training data) toy_test_data <- toy_epi_df %>% @@ -94,16 +90,32 @@ test_that("check_enough_train_data only checks train data", { epiprocess::as_epi_df() expect_no_error( epi_recipe(toy_epi_df) %>% - check_enough_train_data(x, y, n = n - 2, epi_keys = "geo_value", skip = TRUE) %>% + check_enough_data(x, y, n = n - 2, epi_keys = "geo_value") %>% prep(toy_epi_df) %>% bake(new_data = toy_test_data) ) - # Same thing, but skip = FALSE + # Making sure `skip = TRUE` is working correctly in `predict` expect_no_error( epi_recipe(toy_epi_df) %>% - check_enough_train_data(y, n = n - 2, epi_keys = "geo_value") %>% - prep(toy_epi_df) %>% - bake(new_data = toy_test_data) + add_role(y, new_role = "outcome") %>% + check_enough_data(x, n = n - 2, epi_keys = "geo_value") %>% + epi_workflow(linear_reg()) %>% + fit(toy_epi_df) %>% + predict(new_data = toy_test_data %>% filter(time_value > "2020-01-08")) + ) + # making sure it works for skip = FALSE, where there's enough data to train + # but not enough to predict + expect_no_error( + forecaster <- epi_recipe(toy_epi_df) %>% + add_role(y, new_role = "outcome") %>% + check_enough_data(x, n = 1, epi_keys = "geo_value", skip = FALSE) %>% + epi_workflow(linear_reg()) %>% + fit(toy_epi_df) + ) + expect_snapshot( + error = TRUE, + forecaster %>% + predict(new_data = toy_test_data %>% filter(time_value > "2020-01-08")) ) }) @@ -122,7 +134,6 @@ test_that("check_enough_data works with all_predictors() downstream of construct epi_recipe(toy_epi_df) %>% step_epi_lag(x, lag = c(1, 2)) %>% check_enough_data(all_predictors(), y, n = 2 * n - 4) %>% - prep(toy_epi_df) %>% - bake(new_data = NULL) + prep(toy_epi_df) ) }) From bba711e5ac26a7e506faf786016d043ae02e20d3 Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Tue, 25 Mar 2025 13:09:00 -0500 Subject: [PATCH 06/12] note about weird branch --- R/check_enough_data.R | 1 + 1 file changed, 1 insertion(+) diff --git a/R/check_enough_data.R b/R/check_enough_data.R index a43a93c7a..823485ba6 100644 --- a/R/check_enough_data.R +++ b/R/check_enough_data.R @@ -164,6 +164,7 @@ check_enough_data_core <- function(epi_df, step_obj, col_names, train_or_predict unlist() %>% names(.)[.] + # if none of the single columns have enough data, that means its the combination of all of them if (length(cols_not_enough_data) == 0) { cols_not_enough_data <- glue::glue("no single column, but the combination of {paste0(col_names, collapse = ', ')}") From a899188a1d51debe4b08d6371514277a99ec84c7 Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Tue, 25 Mar 2025 11:25:57 -0700 Subject: [PATCH 07/12] doc: add comments --- R/check_enough_data.R | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/R/check_enough_data.R b/R/check_enough_data.R index 823485ba6..f0d594ec2 100644 --- a/R/check_enough_data.R +++ b/R/check_enough_data.R @@ -160,11 +160,14 @@ check_enough_data_core <- function(epi_df, step_obj, col_names, train_or_predict ), .groups = "drop" ) %>% + # Aggregate across keys (if present) summarise(across(all_of(.env$col_names), any), .groups = "drop") %>% unlist() %>% names(.)[.] - # if none of the single columns have enough data, that means its the combination of all of them + # Either all columns have enough data, in which case this message won't be + # sent later or none of the single columns have enough data, that means its + # the combination of all of them. if (length(cols_not_enough_data) == 0) { cols_not_enough_data <- glue::glue("no single column, but the combination of {paste0(col_names, collapse = ', ')}") From 4e37fe632f69e15c23a2429268b4ef714b133b03 Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Tue, 25 Mar 2025 11:35:57 -0700 Subject: [PATCH 08/12] doc: another comment for a weird R idiom --- R/check_enough_data.R | 2 ++ 1 file changed, 2 insertions(+) diff --git a/R/check_enough_data.R b/R/check_enough_data.R index f0d594ec2..ace7e4e25 100644 --- a/R/check_enough_data.R +++ b/R/check_enough_data.R @@ -163,6 +163,7 @@ check_enough_data_core <- function(epi_df, step_obj, col_names, train_or_predict # Aggregate across keys (if present) summarise(across(all_of(.env$col_names), any), .groups = "drop") %>% unlist() %>% + # Select the names of the columns that are TRUE names(.)[.] # Either all columns have enough data, in which case this message won't be @@ -187,6 +188,7 @@ check_enough_data_core <- function(epi_df, step_obj, col_names, train_or_predict cols_not_enough_data <- cols_not_enough_data %>% summarise(across(all_of(.env$col_names), any), .groups = "drop") %>% unlist() %>% + # Select the names of the columns that are TRUE names(.)[.] } From 51a1c6c713f57c51823b03469ad0f3e20b8c84d5 Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Wed, 26 Mar 2025 11:15:49 -0500 Subject: [PATCH 09/12] extraneous test_data --- R/epi_workflow.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/epi_workflow.R b/R/epi_workflow.R index 826ae7f1b..81b443e7b 100644 --- a/R/epi_workflow.R +++ b/R/epi_workflow.R @@ -270,6 +270,6 @@ forecast.epi_workflow <- function(object, ..., n_recent = NULL, forecast_date = hardhat::extract_preprocessor(object), object$original_data ) - test_data + predict(object, new_data = test_data) } From 1b05e99f96d9d4510b3c67c956e1f9ba82813031 Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Thu, 27 Mar 2025 12:48:53 -0500 Subject: [PATCH 10/12] suggestions from Dan --- R/arx_forecaster.R | 4 +-- R/check_enough_data.R | 42 +++++++++------------- tests/testthat/_snaps/check_enough_data.md | 18 +++++----- tests/testthat/test-check_enough_data.R | 24 ++++++------- 4 files changed, 40 insertions(+), 48 deletions(-) diff --git a/R/arx_forecaster.R b/R/arx_forecaster.R index 3a87719eb..3cbcdeb4a 100644 --- a/R/arx_forecaster.R +++ b/R/arx_forecaster.R @@ -172,13 +172,13 @@ arx_fcast_epi_workflow <- function( r <- r %>% step_epi_naomit() %>% step_training_window(n_recent = args_list$n_training) %>% - check_enough_data(all_predictors(), n = 1, skip = FALSE) + check_enough_data(all_predictors(), min_data_points = 1, skip = FALSE) if (!is.null(args_list$check_enough_data_n)) { r <- r %>% check_enough_data( all_predictors(), all_outcomes(), - n = args_list$check_enough_data_n, + min_data_points = args_list$check_enough_data_n, epi_keys = args_list$check_enough_data_epi_keys, drop_na = FALSE ) diff --git a/R/check_enough_data.R b/R/check_enough_data.R index ace7e4e25..2a434f6a8 100644 --- a/R/check_enough_data.R +++ b/R/check_enough_data.R @@ -8,8 +8,8 @@ #' @param ... One or more selector functions to choose variables for this check. #' See [selections()] for more details. You will usually want to use #' [recipes::all_predictors()] and/or [recipes::all_outcomes()] here. -#' @param n The minimum number of data points required for training. If this is -#' NULL, the total number of predictors will be used. +#' @param min_data_points The minimum number of data points required for +#' training. If this is NULL, the total number of predictors will be used. #' @param epi_keys A character vector of column names on which to group the data #' and check threshold within each group. Useful if your forecaster trains #' per group (for example, per geo_value). @@ -18,8 +18,6 @@ #' created. #' @param trained A logical for whether the selectors in `...` #' have been resolved by [prep()]. -#' @param columns An internal argument that tracks which columns are evaluated -#' for this check. Should not be used by the user. #' @param id A character string that is unique to this check to identify it. #' @param skip A logical. If `TRUE`, only training data is checked, while if #' `FALSE`, both training and predicting data is checked. Technically, this @@ -46,24 +44,23 @@ check_enough_data <- function(recipe, ..., - n = NULL, + min_data_points = NULL, epi_keys = NULL, drop_na = TRUE, role = NA, trained = FALSE, - columns = NULL, skip = TRUE, id = rand_id("enough_data")) { recipes::add_check( recipe, check_enough_data_new( - n = n, + min_data_points = min_data_points, epi_keys = epi_keys, drop_na = drop_na, terms = enquos(...), role = role, trained = trained, - columns = columns, + columns = NULL, skip = skip, id = id ) @@ -71,11 +68,12 @@ check_enough_data <- } check_enough_data_new <- - function(n, epi_keys, drop_na, terms, role, trained, columns, skip, id) { + function(min_data_points, epi_keys, drop_na, terms, + role, trained, columns, skip, id) { recipes::check( subclass = "enough_data", prefix = "check_", - n = n, + min_data_points = min_data_points, epi_keys = epi_keys, drop_na = drop_na, terms = terms, @@ -90,15 +88,14 @@ check_enough_data_new <- #' @export prep.check_enough_data <- function(x, training, info = NULL, ...) { col_names <- recipes::recipes_eval_select(x$terms, training, info) - if (is.null(x$n)) { - x$n <- length(col_names) + if (is.null(x$min_data_points)) { + x$min_data_points <- length(col_names) } check_enough_data_core(training, x, col_names, "train") - check_enough_data_new( - n = x$n, + min_data_points = x$min_data_points, epi_keys = x$epi_keys, drop_na = x$drop_na, terms = x$terms, @@ -119,7 +116,7 @@ bake.check_enough_data <- function(object, new_data, ...) { #' @export print.check_enough_data <- function(x, width = max(20, options()$width - 30), ...) { - title <- paste0("Check enough data (n = ", x$n, ") for ") + title <- paste0("Check enough data (n = ", x$min_data_points, ") for ") recipes::print_step(x$columns, x$terms, x$trained, title, width) invisible(x) } @@ -132,7 +129,7 @@ tidy.check_enough_data <- function(x, ...) { res <- tibble(terms = recipes::sel2char(x$terms)) } res$id <- x$id - res$n <- x$n + res$min_data_points <- x$min_data_points res$epi_keys <- x$epi_keys res$drop_na <- x$drop_na res @@ -145,18 +142,18 @@ check_enough_data_core <- function(epi_df, step_obj, col_names, train_or_predict any_missing_data <- epi_df %>% mutate(any_are_na = rowSums(across(any_of(.env$col_names), ~ is.na(.x))) > 0) %>% # count the number of rows where they're all not na - summarise(sum(any_are_na == 0) < .env$step_obj$n, .groups = "drop") + summarise(sum(any_are_na == 0) < .env$step_obj$min_data_points, .groups = "drop") any_missing_data <- any_missing_data %>% summarize(across(all_of(setdiff(names(any_missing_data), step_obj$epi_keys)), any)) %>% any() - # figuring out which individual columns (if any) are to blame for this darth + # figuring out which individual columns (if any) are to blame for this dearth # of data cols_not_enough_data <- epi_df %>% summarise( across( all_of(.env$col_names), - ~ sum(!is.na(.x)) < .env$step_obj$n + ~ sum(!is.na(.x)) < .env$step_obj$min_data_points ), .groups = "drop" ) %>% @@ -176,12 +173,7 @@ check_enough_data_core <- function(epi_df, step_obj, col_names, train_or_predict } else { # if we're not dropping na values, just count cols_not_enough_data <- epi_df %>% - summarise( - across( - all_of(.env$col_names), - ~ dplyr::n() < .env$step_obj$n - ) - ) + summarise(across(all_of(.env$col_names), ~ dplyr::n() < .env$step_obj$min_data_points)) any_missing_data <- cols_not_enough_data %>% summarize(across(all_of(.env$col_names), all)) %>% all() diff --git a/tests/testthat/_snaps/check_enough_data.md b/tests/testthat/_snaps/check_enough_data.md index 72ce8af99..f98293354 100644 --- a/tests/testthat/_snaps/check_enough_data.md +++ b/tests/testthat/_snaps/check_enough_data.md @@ -1,8 +1,8 @@ # check_enough_data works on pooled data Code - epi_recipe(toy_epi_df) %>% check_enough_data(x, y, n = 2 * n + 1, drop_na = FALSE) %>% - prep(toy_epi_df) + epi_recipe(toy_epi_df) %>% check_enough_data(x, y, min_data_points = 2 * n + 1, + drop_na = FALSE) %>% prep(toy_epi_df) Condition Error in `check_enough_data_core()`: ! The following columns don't have enough data to train: x and y. @@ -10,8 +10,8 @@ --- Code - epi_recipe(toy_epi_df) %>% check_enough_data(x, y, n = 2 * n - 1, drop_na = TRUE) %>% - prep(toy_epi_df) + epi_recipe(toy_epi_df) %>% check_enough_data(x, y, min_data_points = 2 * n - 1, + drop_na = TRUE) %>% prep(toy_epi_df) Condition Error in `check_enough_data_core()`: ! The following columns don't have enough data to train: x. @@ -19,8 +19,8 @@ # check_enough_data works on unpooled data Code - epi_recipe(toy_epi_df) %>% check_enough_data(x, y, n = n + 1, epi_keys = "geo_value", - drop_na = FALSE) %>% prep(toy_epi_df) + epi_recipe(toy_epi_df) %>% check_enough_data(x, y, min_data_points = n + 1, + epi_keys = "geo_value", drop_na = FALSE) %>% prep(toy_epi_df) Condition Error in `check_enough_data_core()`: ! The following columns don't have enough data to train: x and y. @@ -28,8 +28,8 @@ --- Code - epi_recipe(toy_epi_df) %>% check_enough_data(x, y, n = 2 * n - 3, epi_keys = "geo_value", - drop_na = TRUE) %>% prep(toy_epi_df) + epi_recipe(toy_epi_df) %>% check_enough_data(x, y, min_data_points = 2 * n - 3, + epi_keys = "geo_value", drop_na = TRUE) %>% prep(toy_epi_df) Condition Error in `check_enough_data_core()`: ! The following columns don't have enough data to train: x and y. @@ -47,7 +47,7 @@ Code epi_recipe(toy_epi_df) %>% step_epi_lag(x, lag = c(1, 2)) %>% check_enough_data( - all_predictors(), y, n = 2 * n - 4) %>% prep(toy_epi_df) + all_predictors(), y, min_data_points = 2 * n - 4) %>% prep(toy_epi_df) Condition Error in `check_enough_data_core()`: ! The following columns don't have enough data to train: no single column, but the combination of lag_1_x, lag_2_x, y. diff --git a/tests/testthat/test-check_enough_data.R b/tests/testthat/test-check_enough_data.R index f31bc47ba..e2398e5c6 100644 --- a/tests/testthat/test-check_enough_data.R +++ b/tests/testthat/test-check_enough_data.R @@ -18,7 +18,7 @@ test_that("check_enough_data works on pooled data", { # Check both columns have enough data expect_no_error( epi_recipe(toy_epi_df) %>% - check_enough_data(x, y, n = 2 * n, drop_na = FALSE) %>% + check_enough_data(x, y, min_data_points = 2 * n, drop_na = FALSE) %>% prep(toy_epi_df) %>% bake(new_data = NULL) ) @@ -26,14 +26,14 @@ test_that("check_enough_data works on pooled data", { expect_snapshot( error = TRUE, epi_recipe(toy_epi_df) %>% - check_enough_data(x, y, n = 2 * n + 1, drop_na = FALSE) %>% + check_enough_data(x, y, min_data_points = 2 * n + 1, drop_na = FALSE) %>% prep(toy_epi_df) ) # Check drop_na works expect_snapshot( error = TRUE, epi_recipe(toy_epi_df) %>% - check_enough_data(x, y, n = 2 * n - 1, drop_na = TRUE) %>% + check_enough_data(x, y, min_data_points = 2 * n - 1, drop_na = TRUE) %>% prep(toy_epi_df) ) }) @@ -42,7 +42,7 @@ test_that("check_enough_data works on unpooled data", { # Check both columns have enough data expect_no_error( epi_recipe(toy_epi_df) %>% - check_enough_data(x, y, n = n, epi_keys = "geo_value", drop_na = FALSE) %>% + check_enough_data(x, y, min_data_points = n, epi_keys = "geo_value", drop_na = FALSE) %>% prep(toy_epi_df) %>% bake(new_data = NULL) ) @@ -50,14 +50,14 @@ test_that("check_enough_data works on unpooled data", { expect_snapshot( error = TRUE, epi_recipe(toy_epi_df) %>% - check_enough_data(x, y, n = n + 1, epi_keys = "geo_value", drop_na = FALSE) %>% + check_enough_data(x, y, min_data_points = n + 1, epi_keys = "geo_value", drop_na = FALSE) %>% prep(toy_epi_df) ) # Check drop_na works expect_snapshot( error = TRUE, epi_recipe(toy_epi_df) %>% - check_enough_data(x, y, n = 2 * n - 3, epi_keys = "geo_value", drop_na = TRUE) %>% + check_enough_data(x, y, min_data_points = 2 * n - 3, epi_keys = "geo_value", drop_na = TRUE) %>% prep(toy_epi_df) ) }) @@ -65,7 +65,7 @@ test_that("check_enough_data works on unpooled data", { test_that("check_enough_data outputs the correct recipe values", { expect_no_error( p <- epi_recipe(toy_epi_df) %>% - check_enough_data(x, y, n = 2 * n - 2) %>% + check_enough_data(x, y, min_data_points = 2 * n - 2) %>% prep(toy_epi_df) %>% bake(new_data = NULL) ) @@ -90,7 +90,7 @@ test_that("check_enough_data only checks train data when skip = FALSE", { epiprocess::as_epi_df() expect_no_error( epi_recipe(toy_epi_df) %>% - check_enough_data(x, y, n = n - 2, epi_keys = "geo_value") %>% + check_enough_data(x, y, min_data_points = n - 2, epi_keys = "geo_value") %>% prep(toy_epi_df) %>% bake(new_data = toy_test_data) ) @@ -98,7 +98,7 @@ test_that("check_enough_data only checks train data when skip = FALSE", { expect_no_error( epi_recipe(toy_epi_df) %>% add_role(y, new_role = "outcome") %>% - check_enough_data(x, n = n - 2, epi_keys = "geo_value") %>% + check_enough_data(x, min_data_points = n - 2, epi_keys = "geo_value") %>% epi_workflow(linear_reg()) %>% fit(toy_epi_df) %>% predict(new_data = toy_test_data %>% filter(time_value > "2020-01-08")) @@ -108,7 +108,7 @@ test_that("check_enough_data only checks train data when skip = FALSE", { expect_no_error( forecaster <- epi_recipe(toy_epi_df) %>% add_role(y, new_role = "outcome") %>% - check_enough_data(x, n = 1, epi_keys = "geo_value", skip = FALSE) %>% + check_enough_data(x, min_data_points = 1, epi_keys = "geo_value", skip = FALSE) %>% epi_workflow(linear_reg()) %>% fit(toy_epi_df) ) @@ -125,7 +125,7 @@ test_that("check_enough_data works with all_predictors() downstream of construct expect_no_error( epi_recipe(toy_epi_df) %>% step_epi_lag(x, lag = c(1, 2)) %>% - check_enough_data(all_predictors(), y, n = 2 * n - 5) %>% + check_enough_data(all_predictors(), y, min_data_points = 2 * n - 5) %>% prep(toy_epi_df) %>% bake(new_data = NULL) ) @@ -133,7 +133,7 @@ test_that("check_enough_data works with all_predictors() downstream of construct error = TRUE, epi_recipe(toy_epi_df) %>% step_epi_lag(x, lag = c(1, 2)) %>% - check_enough_data(all_predictors(), y, n = 2 * n - 4) %>% + check_enough_data(all_predictors(), y, min_data_points = 2 * n - 4) %>% prep(toy_epi_df) ) }) From 14992799db2c974151932f9b5f89678aa129a631 Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Thu, 27 Mar 2025 13:21:45 -0500 Subject: [PATCH 11/12] doc renaming --- man/check_enough_data.Rd | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/man/check_enough_data.Rd b/man/check_enough_data.Rd index 57c321fd1..31a072e62 100644 --- a/man/check_enough_data.Rd +++ b/man/check_enough_data.Rd @@ -7,12 +7,11 @@ check_enough_data( recipe, ..., - n = NULL, + min_data_points = NULL, epi_keys = NULL, drop_na = TRUE, role = NA, trained = FALSE, - columns = NULL, skip = TRUE, id = rand_id("enough_data") ) @@ -25,8 +24,8 @@ sequence of operations for this recipe.} See \code{\link[=selections]{selections()}} for more details. You will usually want to use \code{\link[recipes:has_role]{recipes::all_predictors()}} and/or \code{\link[recipes:has_role]{recipes::all_outcomes()}} here.} -\item{n}{The minimum number of data points required for training. If this is -NULL, the total number of predictors will be used.} +\item{min_data_points}{The minimum number of data points required for +training. If this is NULL, the total number of predictors will be used.} \item{epi_keys}{A character vector of column names on which to group the data and check threshold within each group. Useful if your forecaster trains @@ -40,9 +39,6 @@ created.} \item{trained}{A logical for whether the selectors in \code{...} have been resolved by \code{\link[=prep]{prep()}}.} -\item{columns}{An internal argument that tracks which columns are evaluated -for this check. Should not be used by the user.} - \item{skip}{A logical. If \code{TRUE}, only training data is checked, while if \code{FALSE}, both training and predicting data is checked. Technically, this answers the question "should the check be skipped when the recipe is baked From 84f991d6fc1eed1e420f3f39065a6ea52107876f Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Fri, 28 Mar 2025 18:06:11 -0500 Subject: [PATCH 12/12] rename, check for `pre` in names, dplyr::between --- R/arx_forecaster.R | 4 ++-- R/canned-epipred.R | 2 +- R/check_enough_data.R | 26 +++++++++++----------- man/check_enough_data.Rd | 4 ++-- tests/testthat/_snaps/check_enough_data.md | 10 ++++----- tests/testthat/test-arx_forecaster.R | 2 +- tests/testthat/test-check_enough_data.R | 24 ++++++++++---------- 7 files changed, 36 insertions(+), 36 deletions(-) diff --git a/R/arx_forecaster.R b/R/arx_forecaster.R index 3cbcdeb4a..f988490fd 100644 --- a/R/arx_forecaster.R +++ b/R/arx_forecaster.R @@ -172,13 +172,13 @@ arx_fcast_epi_workflow <- function( r <- r %>% step_epi_naomit() %>% step_training_window(n_recent = args_list$n_training) %>% - check_enough_data(all_predictors(), min_data_points = 1, skip = FALSE) + check_enough_data(all_predictors(), min_observations = 1, skip = FALSE) if (!is.null(args_list$check_enough_data_n)) { r <- r %>% check_enough_data( all_predictors(), all_outcomes(), - min_data_points = args_list$check_enough_data_n, + min_observations = args_list$check_enough_data_n, epi_keys = args_list$check_enough_data_epi_keys, drop_na = FALSE ) diff --git a/R/canned-epipred.R b/R/canned-epipred.R index 48e984168..7d53862c2 100644 --- a/R/canned-epipred.R +++ b/R/canned-epipred.R @@ -112,7 +112,7 @@ print.canned_epipred <- function(x, name, ...) { "At forecast date{?s}: {.val {fds}},", "For target date{?s}: {.val {tds}}," )) - if ("actions" %in% names(x$pre) && "recipe" %in% names(x$pre$actions)) { + if ("pre" %in% names(x) && "actions" %in% names(x$pre) && "recipe" %in% names(x$pre$actions)) { fit_recipe <- extract_recipe(x$epi_workflow) if (detect_step(fit_recipe, "adjust_latency")) { is_adj_latency <- map_lgl(fit_recipe$steps, function(x) inherits(x, "step_adjust_latency")) diff --git a/R/check_enough_data.R b/R/check_enough_data.R index 2a434f6a8..e830d5e54 100644 --- a/R/check_enough_data.R +++ b/R/check_enough_data.R @@ -8,7 +8,7 @@ #' @param ... One or more selector functions to choose variables for this check. #' See [selections()] for more details. You will usually want to use #' [recipes::all_predictors()] and/or [recipes::all_outcomes()] here. -#' @param min_data_points The minimum number of data points required for +#' @param min_observations The minimum number of data points required for #' training. If this is NULL, the total number of predictors will be used. #' @param epi_keys A character vector of column names on which to group the data #' and check threshold within each group. Useful if your forecaster trains @@ -44,7 +44,7 @@ check_enough_data <- function(recipe, ..., - min_data_points = NULL, + min_observations = NULL, epi_keys = NULL, drop_na = TRUE, role = NA, @@ -54,7 +54,7 @@ check_enough_data <- recipes::add_check( recipe, check_enough_data_new( - min_data_points = min_data_points, + min_observations = min_observations, epi_keys = epi_keys, drop_na = drop_na, terms = enquos(...), @@ -68,12 +68,12 @@ check_enough_data <- } check_enough_data_new <- - function(min_data_points, epi_keys, drop_na, terms, + function(min_observations, epi_keys, drop_na, terms, role, trained, columns, skip, id) { recipes::check( subclass = "enough_data", prefix = "check_", - min_data_points = min_data_points, + min_observations = min_observations, epi_keys = epi_keys, drop_na = drop_na, terms = terms, @@ -88,14 +88,14 @@ check_enough_data_new <- #' @export prep.check_enough_data <- function(x, training, info = NULL, ...) { col_names <- recipes::recipes_eval_select(x$terms, training, info) - if (is.null(x$min_data_points)) { - x$min_data_points <- length(col_names) + if (is.null(x$min_observations)) { + x$min_observations <- length(col_names) } check_enough_data_core(training, x, col_names, "train") check_enough_data_new( - min_data_points = x$min_data_points, + min_observations = x$min_observations, epi_keys = x$epi_keys, drop_na = x$drop_na, terms = x$terms, @@ -116,7 +116,7 @@ bake.check_enough_data <- function(object, new_data, ...) { #' @export print.check_enough_data <- function(x, width = max(20, options()$width - 30), ...) { - title <- paste0("Check enough data (n = ", x$min_data_points, ") for ") + title <- paste0("Check enough data (n = ", x$min_observations, ") for ") recipes::print_step(x$columns, x$terms, x$trained, title, width) invisible(x) } @@ -129,7 +129,7 @@ tidy.check_enough_data <- function(x, ...) { res <- tibble(terms = recipes::sel2char(x$terms)) } res$id <- x$id - res$min_data_points <- x$min_data_points + res$min_observations <- x$min_observations res$epi_keys <- x$epi_keys res$drop_na <- x$drop_na res @@ -142,7 +142,7 @@ check_enough_data_core <- function(epi_df, step_obj, col_names, train_or_predict any_missing_data <- epi_df %>% mutate(any_are_na = rowSums(across(any_of(.env$col_names), ~ is.na(.x))) > 0) %>% # count the number of rows where they're all not na - summarise(sum(any_are_na == 0) < .env$step_obj$min_data_points, .groups = "drop") + summarise(sum(any_are_na == 0) < .env$step_obj$min_observations, .groups = "drop") any_missing_data <- any_missing_data %>% summarize(across(all_of(setdiff(names(any_missing_data), step_obj$epi_keys)), any)) %>% any() @@ -153,7 +153,7 @@ check_enough_data_core <- function(epi_df, step_obj, col_names, train_or_predict summarise( across( all_of(.env$col_names), - ~ sum(!is.na(.x)) < .env$step_obj$min_data_points + ~ sum(!is.na(.x)) < .env$step_obj$min_observations ), .groups = "drop" ) %>% @@ -173,7 +173,7 @@ check_enough_data_core <- function(epi_df, step_obj, col_names, train_or_predict } else { # if we're not dropping na values, just count cols_not_enough_data <- epi_df %>% - summarise(across(all_of(.env$col_names), ~ dplyr::n() < .env$step_obj$min_data_points)) + summarise(across(all_of(.env$col_names), ~ dplyr::n() < .env$step_obj$min_observations)) any_missing_data <- cols_not_enough_data %>% summarize(across(all_of(.env$col_names), all)) %>% all() diff --git a/man/check_enough_data.Rd b/man/check_enough_data.Rd index 31a072e62..969caa1d2 100644 --- a/man/check_enough_data.Rd +++ b/man/check_enough_data.Rd @@ -7,7 +7,7 @@ check_enough_data( recipe, ..., - min_data_points = NULL, + min_observations = NULL, epi_keys = NULL, drop_na = TRUE, role = NA, @@ -24,7 +24,7 @@ sequence of operations for this recipe.} See \code{\link[=selections]{selections()}} for more details. You will usually want to use \code{\link[recipes:has_role]{recipes::all_predictors()}} and/or \code{\link[recipes:has_role]{recipes::all_outcomes()}} here.} -\item{min_data_points}{The minimum number of data points required for +\item{min_observations}{The minimum number of data points required for training. If this is NULL, the total number of predictors will be used.} \item{epi_keys}{A character vector of column names on which to group the data diff --git a/tests/testthat/_snaps/check_enough_data.md b/tests/testthat/_snaps/check_enough_data.md index f98293354..4a6ff336d 100644 --- a/tests/testthat/_snaps/check_enough_data.md +++ b/tests/testthat/_snaps/check_enough_data.md @@ -1,7 +1,7 @@ # check_enough_data works on pooled data Code - epi_recipe(toy_epi_df) %>% check_enough_data(x, y, min_data_points = 2 * n + 1, + epi_recipe(toy_epi_df) %>% check_enough_data(x, y, min_observations = 2 * n + 1, drop_na = FALSE) %>% prep(toy_epi_df) Condition Error in `check_enough_data_core()`: @@ -10,7 +10,7 @@ --- Code - epi_recipe(toy_epi_df) %>% check_enough_data(x, y, min_data_points = 2 * n - 1, + epi_recipe(toy_epi_df) %>% check_enough_data(x, y, min_observations = 2 * n - 1, drop_na = TRUE) %>% prep(toy_epi_df) Condition Error in `check_enough_data_core()`: @@ -19,7 +19,7 @@ # check_enough_data works on unpooled data Code - epi_recipe(toy_epi_df) %>% check_enough_data(x, y, min_data_points = n + 1, + epi_recipe(toy_epi_df) %>% check_enough_data(x, y, min_observations = n + 1, epi_keys = "geo_value", drop_na = FALSE) %>% prep(toy_epi_df) Condition Error in `check_enough_data_core()`: @@ -28,7 +28,7 @@ --- Code - epi_recipe(toy_epi_df) %>% check_enough_data(x, y, min_data_points = 2 * n - 3, + epi_recipe(toy_epi_df) %>% check_enough_data(x, y, min_observations = 2 * n - 3, epi_keys = "geo_value", drop_na = TRUE) %>% prep(toy_epi_df) Condition Error in `check_enough_data_core()`: @@ -47,7 +47,7 @@ Code epi_recipe(toy_epi_df) %>% step_epi_lag(x, lag = c(1, 2)) %>% check_enough_data( - all_predictors(), y, min_data_points = 2 * n - 4) %>% prep(toy_epi_df) + all_predictors(), y, min_observations = 2 * n - 4) %>% prep(toy_epi_df) Condition Error in `check_enough_data_core()`: ! The following columns don't have enough data to train: no single column, but the combination of lag_1_x, lag_2_x, y. diff --git a/tests/testthat/test-arx_forecaster.R b/tests/testthat/test-arx_forecaster.R index a72222ddb..d13e6d2ea 100644 --- a/tests/testthat/test-arx_forecaster.R +++ b/tests/testthat/test-arx_forecaster.R @@ -32,7 +32,7 @@ test_that("warns if there's not enough data to predict", { ) %>% mutate(value = seq_len(nrow(.)) + rnorm(nrow(.))) %>% # Oct to May (flu season, ish) only: - filter(!between(as.POSIXlt(time_value)$mon + 1L, 6L, 9L)) %>% + filter(!dplyr::between(as.POSIXlt(time_value)$mon + 1L, 6L, 9L)) %>% # and actually, pretend we're around mid-October 2022: filter(time_value <= as.Date("2022-10-12")) %>% as_epi_df(as_of = as.Date("2022-10-12")) diff --git a/tests/testthat/test-check_enough_data.R b/tests/testthat/test-check_enough_data.R index e2398e5c6..3ca388afb 100644 --- a/tests/testthat/test-check_enough_data.R +++ b/tests/testthat/test-check_enough_data.R @@ -18,7 +18,7 @@ test_that("check_enough_data works on pooled data", { # Check both columns have enough data expect_no_error( epi_recipe(toy_epi_df) %>% - check_enough_data(x, y, min_data_points = 2 * n, drop_na = FALSE) %>% + check_enough_data(x, y, min_observations = 2 * n, drop_na = FALSE) %>% prep(toy_epi_df) %>% bake(new_data = NULL) ) @@ -26,14 +26,14 @@ test_that("check_enough_data works on pooled data", { expect_snapshot( error = TRUE, epi_recipe(toy_epi_df) %>% - check_enough_data(x, y, min_data_points = 2 * n + 1, drop_na = FALSE) %>% + check_enough_data(x, y, min_observations = 2 * n + 1, drop_na = FALSE) %>% prep(toy_epi_df) ) # Check drop_na works expect_snapshot( error = TRUE, epi_recipe(toy_epi_df) %>% - check_enough_data(x, y, min_data_points = 2 * n - 1, drop_na = TRUE) %>% + check_enough_data(x, y, min_observations = 2 * n - 1, drop_na = TRUE) %>% prep(toy_epi_df) ) }) @@ -42,7 +42,7 @@ test_that("check_enough_data works on unpooled data", { # Check both columns have enough data expect_no_error( epi_recipe(toy_epi_df) %>% - check_enough_data(x, y, min_data_points = n, epi_keys = "geo_value", drop_na = FALSE) %>% + check_enough_data(x, y, min_observations = n, epi_keys = "geo_value", drop_na = FALSE) %>% prep(toy_epi_df) %>% bake(new_data = NULL) ) @@ -50,14 +50,14 @@ test_that("check_enough_data works on unpooled data", { expect_snapshot( error = TRUE, epi_recipe(toy_epi_df) %>% - check_enough_data(x, y, min_data_points = n + 1, epi_keys = "geo_value", drop_na = FALSE) %>% + check_enough_data(x, y, min_observations = n + 1, epi_keys = "geo_value", drop_na = FALSE) %>% prep(toy_epi_df) ) # Check drop_na works expect_snapshot( error = TRUE, epi_recipe(toy_epi_df) %>% - check_enough_data(x, y, min_data_points = 2 * n - 3, epi_keys = "geo_value", drop_na = TRUE) %>% + check_enough_data(x, y, min_observations = 2 * n - 3, epi_keys = "geo_value", drop_na = TRUE) %>% prep(toy_epi_df) ) }) @@ -65,7 +65,7 @@ test_that("check_enough_data works on unpooled data", { test_that("check_enough_data outputs the correct recipe values", { expect_no_error( p <- epi_recipe(toy_epi_df) %>% - check_enough_data(x, y, min_data_points = 2 * n - 2) %>% + check_enough_data(x, y, min_observations = 2 * n - 2) %>% prep(toy_epi_df) %>% bake(new_data = NULL) ) @@ -90,7 +90,7 @@ test_that("check_enough_data only checks train data when skip = FALSE", { epiprocess::as_epi_df() expect_no_error( epi_recipe(toy_epi_df) %>% - check_enough_data(x, y, min_data_points = n - 2, epi_keys = "geo_value") %>% + check_enough_data(x, y, min_observations = n - 2, epi_keys = "geo_value") %>% prep(toy_epi_df) %>% bake(new_data = toy_test_data) ) @@ -98,7 +98,7 @@ test_that("check_enough_data only checks train data when skip = FALSE", { expect_no_error( epi_recipe(toy_epi_df) %>% add_role(y, new_role = "outcome") %>% - check_enough_data(x, min_data_points = n - 2, epi_keys = "geo_value") %>% + check_enough_data(x, min_observations = n - 2, epi_keys = "geo_value") %>% epi_workflow(linear_reg()) %>% fit(toy_epi_df) %>% predict(new_data = toy_test_data %>% filter(time_value > "2020-01-08")) @@ -108,7 +108,7 @@ test_that("check_enough_data only checks train data when skip = FALSE", { expect_no_error( forecaster <- epi_recipe(toy_epi_df) %>% add_role(y, new_role = "outcome") %>% - check_enough_data(x, min_data_points = 1, epi_keys = "geo_value", skip = FALSE) %>% + check_enough_data(x, min_observations = 1, epi_keys = "geo_value", skip = FALSE) %>% epi_workflow(linear_reg()) %>% fit(toy_epi_df) ) @@ -125,7 +125,7 @@ test_that("check_enough_data works with all_predictors() downstream of construct expect_no_error( epi_recipe(toy_epi_df) %>% step_epi_lag(x, lag = c(1, 2)) %>% - check_enough_data(all_predictors(), y, min_data_points = 2 * n - 5) %>% + check_enough_data(all_predictors(), y, min_observations = 2 * n - 5) %>% prep(toy_epi_df) %>% bake(new_data = NULL) ) @@ -133,7 +133,7 @@ test_that("check_enough_data works with all_predictors() downstream of construct error = TRUE, epi_recipe(toy_epi_df) %>% step_epi_lag(x, lag = c(1, 2)) %>% - check_enough_data(all_predictors(), y, min_data_points = 2 * n - 4) %>% + check_enough_data(all_predictors(), y, min_observations = 2 * n - 4) %>% prep(toy_epi_df) ) })