cmu-delphi
diff --git a/‎DESCRIPTION
+3-3 b/‎DESCRIPTION
+3-3
diff --git a/‎NAMESPACE
+7 b/‎NAMESPACE
+7
diff --git a/‎NEWS.md
+7 b/‎NEWS.md
+7
diff --git a/‎R/arx_classifier.R
+14-6 b/‎R/arx_classifier.R
+14-6
diff --git a/‎R/arx_forecaster.R
+20-14 b/‎R/arx_forecaster.R
+20-14
diff --git a/‎R/epi_check_training_set.R
+52 b/‎R/epi_check_training_set.R
+52
diff --git a/‎R/epi_keys.R
+4 b/‎R/epi_keys.R
+4
diff --git a/‎R/epi_recipe.R
+14-3 b/‎R/epi_recipe.R
+14-3
diff --git a/‎R/epi_shift.R
+1-1 b/‎R/epi_shift.R
+1-1
diff --git a/‎R/flatline.R
+2-2 b/‎R/flatline.R
+2-2
@@ -1,6 +1,6 @@
 Package: epipredict
 Title: Basic epidemiology forecasting methods
-Version: 0.0.3.9999
+Version: 0.0.4
 Authors@R: c(
     person("Daniel", "McDonald", , "[email protected]", role = c("aut", "cre")),
     person("Ryan", "Tibshirani", , "[email protected]", role = "aut"),
@@ -21,7 +21,7 @@ URL: https://github.com/cmu-delphi/epipredict/,
     https://cmu-delphi.github.io/epipredict
 BugReports: https://github.com/cmu-delphi/epipredict/issues/
 Depends: 
-    epiprocess,
+    epiprocess (>= 0.6.0),
     parsnip (>= 1.0.0),
     R (>= 3.5.0)
 Imports: 
@@ -61,7 +61,7 @@ VignetteBuilder:
     knitr
 Remotes:
     cmu-delphi/epidatr,
-    cmu-delphi/epiprocess
+    cmu-delphi/epiprocess@dev
 Config/testthat/edition: 3
 Encoding: UTF-8
 LazyData: true
 
@@ -44,7 +44,10 @@ S3method(prep,step_growth_rate)
 S3method(prep,step_lag_difference)
 S3method(prep,step_population_scaling)
 S3method(prep,step_training_window)
+S3method(print,arx_clist)
+S3method(print,arx_flist)
 S3method(print,epi_workflow)
+S3method(print,flatline_alist)
 S3method(print,frosting)
 S3method(print,layer_add_forecast_date)
 S3method(print,layer_add_target_date)
@@ -91,6 +94,7 @@ export(arx_args_list)
 export(arx_class_args_list)
 export(arx_classifier)
 export(arx_forecaster)
+export(bake)
 export(create_layer)
 export(default_epi_recipe_blueprint)
 export(detect_layer)
@@ -128,6 +132,7 @@ export(layer_threshold)
 export(nested_quantiles)
 export(new_default_epi_recipe_blueprint)
 export(new_epi_recipe_blueprint)
+export(prep)
 export(quantile_reg)
 export(remove_frosting)
 export(slather)
@@ -152,6 +157,8 @@ importFrom(hardhat,run_mold)
 importFrom(magrittr,"%>%")
 importFrom(methods,is)
 importFrom(quantreg,rq)
+importFrom(recipes,bake)
+importFrom(recipes,prep)
 importFrom(rlang,"!!")
 importFrom(rlang,":=")
 importFrom(rlang,`%||%`)
 
@@ -1,7 +1,14 @@
 # epipredict (development)
 
+
+# epipredict 0.0.4
+
 * add quantile_reg()
 * clean up documentation bugs
+* add smooth_quantile_reg()
+* add classifier
+* training window step debugged
+* `min_train_window` argument removed from canned forecasters
 
 # epipredict 0.0.3
 
 
@@ -97,7 +97,8 @@ arx_classifier <- function(epi_data,
     step_epi_ahead(!!o, ahead = args_list$ahead, role = "pre-outcome") %>%
     step_mutate(outcome_class = cut(!!o2, breaks = args_list$breaks),
                 role = "outcome") %>%
-    step_epi_naomit()
+    step_epi_naomit() %>%
+    step_training_window(n_recent = args_list$n_training)
 
   forecast_date <- args_list$forecast_date %||% max(epi_data$time_value)
   target_date <- args_list$target_date %||% forecast_date + args_list$ahead
@@ -152,7 +153,7 @@ arx_classifier <- function(epi_data,
 #'   calculation. See [epiprocess::growth_rate()] and the related Vignette for
 #'   more details.
 #'
-#' @return A list containing updated parameter choices with class `arx_alist`.
+#' @return A list containing updated parameter choices with class `arx_clist`.
 #' @export
 #'
 #' @examples
@@ -164,7 +165,7 @@ arx_classifier <- function(epi_data,
 arx_class_args_list <- function(
     lags = c(0L, 7L, 14L),
     ahead = 7L,
-    min_train_window = 20L,
+    n_training = Inf,
     forecast_date = NULL,
     target_date = NULL,
     outcome_transform = c("growth_rate", "lag_difference"),
@@ -180,12 +181,14 @@ arx_class_args_list <- function(
   method <- match.arg(method)
   outcome_transform <- match.arg(outcome_transform)
 
-  arg_is_scalar(ahead, min_train_window, horizon, log_scale)
+  arg_is_scalar(ahead, n_training, horizon, log_scale)
   arg_is_scalar(forecast_date, target_date, allow_null = TRUE)
   arg_is_date(forecast_date, target_date, allow_null = TRUE)
-  arg_is_nonneg_int(ahead, min_train_window, lags, horizon)
+  arg_is_nonneg_int(ahead, lags, horizon)
   arg_is_numeric(breaks)
   arg_is_lgl(log_scale)
+  arg_is_pos(n_training)
+  if (is.finite(n_training)) arg_is_pos_int(n_training)
   if (!is.list(additional_gr_args)) {
     rlang::abort(
       c("`additional_gr_args` must be a list.",
@@ -202,7 +205,7 @@ arx_class_args_list <- function(
   structure(
     enlist(lags = .lags,
            ahead,
-           min_train_window,
+           n_training,
            breaks,
            forecast_date,
            target_date,
@@ -216,3 +219,8 @@ arx_class_args_list <- function(
     class = "arx_clist"
   )
 }
+
+# Print method for `arx_clist` argument lists: displays the compact
+# structure of the list with utils::str().
+#' @export
+print.arx_clist <- function(x, ...) {
+  utils::str(x)
+  # Print methods should hand back their argument invisibly so the object
+  # can still be assigned or piped after printing.
+  invisible(x)
+}
@@ -38,7 +38,7 @@ arx_forecaster <- function(epi_data,
 
   # --- validation
   validate_forecaster_inputs(epi_data, outcome, predictors)
-  if (!inherits(args_list, "arx_alist"))
+  if (!inherits(args_list, "arx_flist"))
     cli_stop("args_list was not created using `arx_args_list().")
   if (!is_regression(trainer))
     cli_stop("{trainer} must be a `parsnip` method of mode 'regression'.")
@@ -52,9 +52,8 @@ arx_forecaster <- function(epi_data,
   }
   r <- r %>%
     step_epi_ahead(!!outcome, ahead = args_list$ahead) %>%
-    step_epi_naomit()
-  # should limit the training window here (in an open PR)
-  # What to do if insufficient training data? Add issue.
+    step_epi_naomit() %>%
+    step_training_window(n_recent = args_list$n_training)
 
   forecast_date <- args_list$forecast_date %||% max(epi_data$time_value)
   target_date <- args_list$target_date %||% forecast_date + args_list$ahead
@@ -105,9 +104,9 @@ arx_lags_validator <- function(predictors, lags) {
 #'   in autoregressive-type models (in days).
 #' @param ahead Integer. Number of time steps ahead (in days) of the forecast
 #'   date for which forecasts should be produced.
-#' @param min_train_window Integer. The minimal amount of training
-#'   data (in the time unit of the `epi_df`) needed to produce a forecast.
-#'   If smaller, the forecaster will return `NA` predictions.
+#' @param n_training Positive integer or `Inf` (the default). An upper
+#'   limit for the number of rows per key that are used for training
+#'   (in the time unit of the `epi_df`).
 #' @param forecast_date Date. The date on which the forecast is created.
 #'   The default `NULL` will attempt to determine this automatically.
 #' @param target_date Date. The date for which the forecast is intended.
@@ -124,16 +123,16 @@ arx_lags_validator <- function(predictors, lags) {
 #'   [layer_residual_quantiles()] for more information. The default,
 #'   `character(0)` performs no grouping.
 #'
-#' @return A list containing updated parameter choices with class `arx_alist`.
+#' @return A list containing updated parameter choices with class `arx_flist`.
 #' @export
 #'
 #' @examples
 #' arx_args_list()
 #' arx_args_list(symmetrize = FALSE)
-#' arx_args_list(levels = c(.1, .3, .7, .9), min_train_window = 120)
+#' arx_args_list(levels = c(.1, .3, .7, .9), n_training = 120)
 arx_args_list <- function(lags = c(0L, 7L, 14L),
                           ahead = 7L,
-                          min_train_window = 20L,
+                          n_training = Inf,
                           forecast_date = NULL,
                           target_date = NULL,
                           levels = c(0.05, 0.95),
@@ -145,24 +144,31 @@ arx_args_list <- function(lags = c(0L, 7L, 14L),
   .lags <- lags
   if (is.list(lags)) lags <- unlist(lags)
 
-  arg_is_scalar(ahead, min_train_window, symmetrize, nonneg)
+  arg_is_scalar(ahead, n_training, symmetrize, nonneg)
   arg_is_chr(quantile_by_key, allow_null = TRUE)
   arg_is_scalar(forecast_date, target_date, allow_null = TRUE)
   arg_is_date(forecast_date, target_date, allow_null = TRUE)
-  arg_is_nonneg_int(ahead, min_train_window, lags)
+  arg_is_nonneg_int(ahead, lags)
   arg_is_lgl(symmetrize, nonneg)
   arg_is_probabilities(levels, allow_null = TRUE)
+  arg_is_pos(n_training)
+  if (is.finite(n_training)) arg_is_pos_int(n_training)
 
   max_lags <- max(lags)
   structure(enlist(lags = .lags,
                    ahead,
-                   min_train_window,
+                   n_training,
                    levels,
                    forecast_date,
                    target_date,
                    symmetrize,
                    nonneg,
                    max_lags,
                    quantile_by_key),
-            class = "arx_alist")
+            class = "arx_flist")
+}
+
+# Print method for `arx_flist` argument lists: displays the compact
+# structure of the list with utils::str().
+#' @export
+print.arx_flist <- function(x, ...) {
+  utils::str(x)
+  # Print methods should hand back their argument invisibly so the object
+  # can still be assigned or piped after printing.
+  invisible(x)
 }
@@ -0,0 +1,52 @@
+# Reconcile the metadata of training data `x` with the template stored in
+# the recipe `rec`, returning `x` (possibly with extra `other_keys`
+# recorded in its metadata).
+#
+# The guiding idea is to let the model be fit, with warnings, whenever that
+# is possible:
+#   * a geo_type / time_type mismatch only produces a warning;
+#   * template keys that exist as columns of `x` but are not yet recorded
+#     in its metadata are added, with a warning;
+#   * template keys with no corresponding column in `x` are an error;
+#   * keys present only in `x` are left alone.
+epi_check_training_set <- function(x, rec) {
+  validate_meta_match(x, rec$template, "geo_type", "warn")
+  validate_meta_match(x, rec$template, "time_type", "warn")
+
+  template_keys <- attr(rec$template, "metadata")$other_keys
+  # No additional keys on the template: nothing further to reconcile.
+  if (is.null(template_keys)) {
+    return(x)
+  }
+
+  # Keys the recipe needs that have no column in the training data.
+  missing_ok <- setdiff(template_keys, colnames(x))
+  if (length(missing_ok) > 0) {
+    cli::cli_abort(c(
+      "The recipe specifies keys which are not in the training data.",
+      i = "The training set is missing columns for {missing_ok}."
+    ))
+  }
+
+  # Every template key is a column of `x`; make sure each one is also
+  # labelled as a key in the metadata of `x`.
+  training_keys <- attr(x, "metadata")$other_keys
+  if (!all(template_keys %in% training_keys)) {
+    cli::cli_warn(c(
+      "The recipe specifies additional keys. Because these are available,",
+      "they are being added to the metadata of the training data."
+    ))
+    attr(x, "metadata")$other_keys <- union(training_keys, template_keys)
+  }
+  x
+}
+
+# Compare a single metadata field between the training data and the
+# recipe template, signalling a condition when the two values differ.
+#
+# @param x Training data carrying a "metadata" attribute.
+# @param template Recipe template carrying a "metadata" attribute.
+# @param meta Name of the metadata field to compare (the visible callers
+#   pass "geo_type" and "time_type").
+# @param warn_or_abort "warn" (default) to warn on a mismatch, "abort" to
+#   raise an error instead.
+# @return Called for its side effect; nothing is signalled when the two
+#   values agree.
+validate_meta_match <- function(x, template, meta, warn_or_abort = "warn") {
+  new_meta <- attr(x, "metadata")[[meta]]
+  old_meta <- attr(template, "metadata")[[meta]]
+  # The bullets interpolate `meta` so the message names the field that
+  # actually differs rather than always saying `geo_type`.
+  msg <- c(
+    "The `{meta}` of the training data appears to be different from that",
+    "used to construct the recipe. This may result in unexpected consequences.",
+    i = "Training `{meta}` is '{new_meta}'.",
+    i = "Originally, it was '{old_meta}'."
+  )
+  # identical() always yields a single TRUE/FALSE, so a field that is absent
+  # (NULL) on one side is reported as a mismatch instead of breaking `if`
+  # with a zero-length condition.
+  if (!identical(new_meta, old_meta)) {
+    switch(warn_or_abort,
+           warn = cli::cli_warn(msg),
+           abort = cli::cli_abort(msg)
+    )
+  }
+}
@@ -33,3 +33,7 @@ epi_keys_mold <- function(mold) {
   unname(unlist(mold_keys))
 }
 
+# Drop every "time_value" entry from a character vector of key names,
+# after checking the input with arg_is_chr().
+kill_time_value <- function(v) {
+  arg_is_chr(v)
+  is_time_value <- v == "time_value"
+  v[!is_time_value]
+}
@@ -146,7 +146,7 @@ epi_recipe.formula <- function(formula, data, ...) {
   # we ensure that there's only 1 row in the template
   data <- data[1,]
   # check for minus:
-  if (! epiprocess::is_epi_df(data)) {
+  if (!epiprocess::is_epi_df(data)) {
     return(recipes::recipe(formula, data, ...))
   }
 
@@ -280,14 +280,17 @@ add_epi_recipe <- function(
 
 
 
-# unfortunately, everything the same as in prep.recipe except string/fctr handling
+# Unfortunately, almost everything is the same as in prep.recipe,
+# except for the string/factor handling.
 #' @export
 prep.epi_recipe <- function(
     x, training = NULL, fresh = FALSE, verbose = FALSE,
     retain = TRUE, log_changes = FALSE, strings_as_factors = TRUE, ...) {
   training <- recipes:::check_training_set(training, x, fresh)
+  training <- epi_check_training_set(training, x)
+  training <- dplyr::relocate(training, tidyselect::all_of(epi_keys(training)))
   tr_data <- recipes:::train_info(training)
-  keys <- epi_keys(training)
+  keys <- epi_keys(x)
+
   orig_lvls <- lapply(training, recipes:::get_levels)
   orig_lvls <- kill_levels(orig_lvls, keys)
   if (strings_as_factors) {
@@ -322,12 +325,20 @@ prep.epi_recipe <- function(
         cat(note, "[training]", "\n")
       }
       before_nms <- names(training)
+      before_template <- training[1, ]
       x$steps[[i]] <- prep(x$steps[[i]], training = training,
                            info = x$term_info)
       training <- bake(x$steps[[i]], new_data = training)
       if (!tibble::is_tibble(training)) {
         abort("bake() methods should always return tibbles")
       }
+      if (!is_epi_df(training)) {
+        # tidymodels killed our class
+        # for now, we only allow step_epi_* to alter the metadata
+        training <- dplyr::dplyr_reconstruct(
+          epiprocess::as_epi_df(training), before_template)
+      }
+      training <- dplyr::relocate(training, tidyselect::all_of(epi_keys(training)))
       x$term_info <- recipes:::merge_term_info(get_types(training), x$term_info)
       if (!is.na(x$steps[[i]]$role)) {
         new_vars <- setdiff(x$term_info$variable, running_info$variable)
 
@@ -36,5 +36,5 @@ epi_shift_single <- function(x, col, shift_val, newname, key_cols) {
   x %>%
     dplyr::select(tidyselect::all_of(c(key_cols, col))) %>%
     dplyr::mutate(time_value = time_value + shift_val) %>%
-    dplyr::rename(!!newname := col)
+    dplyr::rename(!!newname := {{ col }})
 }
@@ -43,7 +43,7 @@ flatline <- function(formula, data) {
   ek <- rhs[-n]
   if (length(response) > 1)
     cli_stop("flatline forecaster can accept only 1 observed time series.")
-  keys <- ek[ek != "time_value"]
+  keys <- kill_time_value(ek)
 
   preds <- data %>%
     dplyr::mutate(.pred = !!rlang::sym(observed),
@@ -54,7 +54,7 @@ flatline <- function(formula, data) {
     dplyr::arrange(time_value) %>%
     dplyr::slice_tail(n = 1L) %>%
     dplyr::ungroup() %>%
-    dplyr::select(dplyr::all_of(c(keys, ".pred")))
+    dplyr::select(tidyselect::all_of(c(keys, ".pred")))
 
   structure(list(
     residuals = dplyr::select(preds, dplyr::all_of(c(keys, ".resid"))),
Original file line number	Diff line number	Diff line change
`@@ -33,3 +33,7 @@ epi_keys_mold <- function(mold) {`
`33`	`33`	`unname(unlist(mold_keys))`
`34`	`34`	`}`
`35`	`35`
	`36`	`+kill_time_value <- function(v) {`
	`37`	`+ arg_is_chr(v)`
	`38`	`+ v[v != "time_value"]`
	`39`	`+}`
Original file line number	Diff line number	Diff line change
`@@ -36,5 +36,5 @@ epi_shift_single <- function(x, col, shift_val, newname, key_cols) {`
`36`	`36`	`x %>%`
`37`	`37`	`dplyr::select(tidyselect::all_of(c(key_cols, col))) %>%`
`38`	`38`	`dplyr::mutate(time_value = time_value + shift_val) %>%`
`39`		`- dplyr::rename(!!newname := col)`
	`39`	`+ dplyr::rename(!!newname := {{ col }})`
`40`	`40`	`}`