cmu-delphi · dajmcdon · Feb 4, 2025 · Jan 17, 2025 · Jan 24, 2025 · Jan 27, 2025
@@ -1,6 +1,6 @@
 Package: epipredict
 Title: Basic epidemiology forecasting methods
-Version: 0.1.6
+Version: 0.1.7
 Authors@R: c(
     person("Daniel J.", "McDonald", , "[email protected]", role = c("aut", "cre")),
     person("Ryan", "Tibshirani", , "[email protected]", role = "aut"),

@@ -29,7 +29,6 @@ S3method(bake,step_population_scaling)
 S3method(bake,step_training_window)
 S3method(detect_layer,frosting)
 S3method(detect_layer,workflow)
-S3method(epi_recipe,default)
 S3method(epi_recipe,epi_df)
 S3method(epi_recipe,formula)
 S3method(extract_argument,epi_workflow)

@@ -19,6 +19,7 @@ Pre-1.0.0 numbering scheme: 0.x will indicate releases, while 0.0.x will indicat
 - Make key column inference more consistent within the package and with current `epiprocess`.
 - Fix `quantile_reg()` producing error when asked to output just median-level predictions.
 - (temporary) a negative `ahead` is allowed in `step_epi_ahead` until we have `step_epi_shift`
+- Add `reference_date` as an argument to `epi_recipe()`
 
 ## Bug fixes
 - Shifting no columns results in no error for either `step_epi_ahead` or `step_epi_lag`

@@ -13,30 +13,34 @@ epi_recipe <- function(x, ...) {
 }
 
 
-#' @rdname epi_recipe
-#' @export
-epi_recipe.default <- function(x, ...) {
-  cli_abort(paste(
-    "`x` must be an {.cls epi_df} or a {.cls formula},",
-    "not a {.cls {class(x)[[1]]}}."
-  ))
-}
 
 #' @rdname epi_recipe
 #' @inheritParams recipes::recipe
 #' @param roles A character string (the same length as `vars`) that
 #'   describes a single role that the variable will take. This value could be
 #'   anything but common roles are `"outcome"`, `"predictor"`,
 #'   `"time_value"`, and `"geo_value"`
+#' @param reference_date Either a date of the same class as the `time_value`
+#'   column in the `epi_df` or `NULL`. If a date, it gives the date to which all
+#'   operations are relative. Typically, in real-time tasks this is the date that
+#'   the model is created (and presumably trained). In forecasting, this is
+#'   often the same as the most recent date of
+#'   data availability, but when data is "latent" (reported after the date to
+#'   which it corresponds), or if performing a nowcast, the `reference_date` may
+#'   be later than this. Setting `reference_date`
+#'   to a value BEFORE the most recent data is not a true "forecast",
+#'   because future data is being used to create the model, but this may be
+#'   reasonable in model building, nowcasting (predicting finalized values from
+#'   preliminary data), or if producing a backcast. If `NULL`, it will be set
+#'   to the `as_of` date of the `epi_df`.
 #' @param ... Further arguments passed to or from other methods (not currently
 #'   used).
 #' @param formula A model formula. No in-line functions should be used here
 #'  (e.g. `log(x)`, `x:y`, etc.) and minus signs are not allowed. These types of
 #'  transformations should be enacted using `step` functions in this package.
 #'  Dots are allowed as are simple multivariate outcome terms (i.e. no need for
 #'  `cbind`; see Examples).
-#' @param x,data A data frame, tibble, or epi_df of the *template* data set
-#'   (see below). This is always coerced to the first row to avoid memory issues
+#' @param x,data An epi_df of the *template* data set (see below).
 #' @inherit recipes::recipe return
 #'
 #' @export
@@ -56,100 +60,107 @@ epi_recipe.default <- function(x, ...) {
 #'   step_naomit(all_outcomes(), skip = TRUE)
 #'
 #' r
-epi_recipe.epi_df <-
-  function(x, formula = NULL, ..., vars = NULL, roles = NULL) {
-    attr(x, "decay_to_tibble") <- FALSE
-    if (!is.null(formula)) {
-      if (!is.null(vars)) {
-        cli_abort(paste0(
-          "This `vars` specification will be ignored ",
+epi_recipe.epi_df <- function(x,
+                              reference_date = NULL,
+                              formula = NULL,
+                              ...,
+                              vars = NULL,
+                              roles = NULL) {
+  attr(x, "decay_to_tibble") <- FALSE
+  if (!is.null(formula)) {
+    if (!is.null(vars)) {
+      cli_abort(paste0(
+        "This `vars` specification will be ignored ",
+        "when a formula is used"
+      ))
+    }
+    if (!is.null(roles)) {
+      cli_abort(
+        paste0(
+          "This `roles` specification will be ignored ",
           "when a formula is used"
-        ))
-      }
-      if (!is.null(roles)) {
-        cli_abort(
-          paste0(
-            "This `roles` specification will be ignored ",
-            "when a formula is used"
-          )
         )
-      }
-
-      obj <- epi_recipe.formula(formula, x, ...)
-      return(obj)
-    }
-    if (is.null(vars)) vars <- colnames(x)
-    if (any(table(vars) > 1)) {
-      cli_abort("`vars` should have unique members")
-    }
-    if (any(!(vars %in% colnames(x)))) {
-      cli_abort("1 or more elements of `vars` are not in the data")
+      )
     }
 
-    keys <- key_colnames(x) # we know x is an epi_df
+    obj <- epi_recipe.formula(formula, x, ...)
+    return(obj)
+  }
+  if (is.null(vars)) vars <- colnames(x)
+  if (any(table(vars) > 1)) {
+    cli_abort("`vars` should have unique members")
+  }
+  if (any(!(vars %in% colnames(x)))) {
+    cli_abort("1 or more elements of `vars` are not in the data")
+  }
 
-    var_info <- tibble(variable = vars)
-    key_roles <- c("geo_value", rep("key", length(keys) - 2), "time_value")
+  keys <- key_colnames(x) # we know x is an epi_df
 
-    ## Check and add roles when available
-    if (!is.null(roles)) {
-      if (length(roles) != length(vars)) {
-        cli_abort(paste0(
-          "The number of roles should be the same as the number of ",
-          "variables."
-        ))
-      }
-      var_info$role <- roles
-    } else {
-      var_info <- var_info %>% filter(!(variable %in% keys))
-      var_info$role <- "raw"
-    }
-    ## Now we add the keys when necessary
-    var_info <- dplyr::union(
-      var_info,
-      tibble::tibble(variable = keys, role = key_roles)
-    )
+  var_info <- tibble(variable = vars)
+  key_roles <- c("geo_value", rep("key", length(keys) - 2), "time_value")
 
-    ## Add types
-    var_info <- full_join(recipes:::get_types(x), var_info, by = "variable")
-    var_info$source <- "original"
-
-    ## arrange to easy order
-    var_info <- var_info %>%
-      arrange(factor(
-        role,
-        levels = union(
-          c("predictor", "outcome", "time_value", "geo_value", "key"),
-          unique(role)
-        ) # anything else
+  ## Check and add roles when available
+  if (!is.null(roles)) {
+    if (length(roles) != length(vars)) {
+      cli_abort(paste0(
+        "The number of roles should be the same as the number of ",
+        "variables."
       ))
-
-    ## Return final object of class `recipe`
-    out <- list(
-      var_info = var_info,
-      term_info = var_info,
-      steps = NULL,
-      template = x[1, ],
-      max_time_value = max(x$time_value),
-      levels = NULL,
-      retained = NA
-    )
-    class(out) <- c("epi_recipe", "recipe")
-    out
+    }
+    var_info$role <- roles
+  } else {
+    var_info <- var_info %>% filter(!(variable %in% keys))
+    var_info$role <- "raw"
   }
+  ## Now we add the keys when necessary
+  var_info <- dplyr::union(
+    var_info,
+    tibble::tibble(variable = keys, role = key_roles)
+  )
+
+  ## Add types
+  var_info <- full_join(recipes:::get_types(x), var_info, by = "variable")
+  var_info$source <- "original"
+
+  ## arrange to easy order
+  var_info <- var_info %>%
+    arrange(factor(
+      role,
+      levels = union(
+        c("predictor", "outcome", "time_value", "geo_value", "key"),
+        unique(role)
+      ) # anything else
+    ))
+
+  ## Return final object of class `recipe`
+  max_time_value <- max(x$time_value)
+  reference_date <- reference_date %||% attr(x, "metadata")$as_of
+  out <- list(
+    var_info = var_info,
+    term_info = var_info,
+    steps = NULL,
+    template = x[1, ],
+    max_time_value = max_time_value,
+    reference_date = reference_date,
+    levels = NULL,
+    retained = NA
+  )
+  class(out) <- c("epi_recipe", "recipe")
+  out
+}
 
 
 #' @rdname epi_recipe
 #' @export
-epi_recipe.formula <- function(formula, data, ...) {
+epi_recipe.formula <- function(formula, data, reference_date = NULL, ...) {
   # we ensure that there's only 1 row in the template
   data <- data[1, ]
   # only epi_df inputs are supported; anything else is rejected here
   if (!epiprocess::is_epi_df(data)) {
-    cli_abort(paste(
-      "`epi_recipe()` has been called with a non-{.cls epi_df} object.",
-      "Use `recipe()` instead."
-    ))
+    cli_abort(
+      "`epi_recipe()` has been called with a non-{.cls epi_df} object.
+      Use `recipe()` instead."
+    )
   }
 
   attr(data, "decay_to_tibble") <- FALSE

@@ -78,7 +78,7 @@ covid_case_death_rates
 #> An `epi_df` object, 20,496 x 4 with metadata:
 #> * geo_type  = state
 #> * time_type = day
-#> * as_of     = 2022-05-31 12:08:25.791826
+#> * as_of     = 2022-05-31
 #> 
 #> # A tibble: 20,496 × 4
 #>    geo_value time_value case_rate death_rate
@@ -92,7 +92,7 @@ covid_case_death_rates
 #>  7 co        2020-12-31      35.8      0.649
 #>  8 ct        2020-12-31      52.1      0.819
 #>  9 dc        2020-12-31      31.0      0.601
-#> 10 de        2020-12-31      65.2      0.807
+#> 10 de        2020-12-31      64.3      0.912
 #> # ℹ 20,486 more rows
 ```
 
@@ -113,19 +113,20 @@ two_week_ahead <- arx_forecaster(
 two_week_ahead
 #> ══ A basic forecaster of type ARX Forecaster ═══════════════════════════════
 #> 
-#> This forecaster was fit on 2024-11-11 11:38:31.
+#> This forecaster was fit on 2025-01-23 14:01:04.
 #> 
 #> Training data was an <epi_df> with:
 #> • Geography: state,
 #> • Time type: day,
-#> • Using data up-to-date as of: 2022-05-31 12:08:25.
+#> • Using data up-to-date as of: 2022-05-31.
+#> • With the last data available on 2021-12-31
 #> 
 #> ── Predictions ─────────────────────────────────────────────────────────────
 #> 
 #> A total of 56 predictions are available for
 #> • 56 unique geographic regions,
 #> • At forecast date: 2021-12-31,
-#> • For target date: 2022-01-14.
+#> • For target date: 2022-01-14,
 #> 
 ```
 
@@ -161,11 +162,11 @@ two_week_ahead$epi_workflow
 #> 
 #> Coefficients:
 #>       (Intercept)    lag_0_case_rate    lag_1_case_rate    lag_2_case_rate  
-#>        -0.0073358          0.0030365          0.0012467          0.0009536  
+#>        -0.0072151          0.0030311          0.0012525          0.0009551  
 #>   lag_3_case_rate    lag_7_case_rate   lag_14_case_rate   lag_0_death_rate  
-#>         0.0011425          0.0012481          0.0003041          0.1351769  
+#>         0.0011488          0.0012238          0.0003301          0.1348459  
 #>  lag_7_death_rate  lag_14_death_rate  
-#>         0.1471127          0.1062473
+#>         0.1468325          0.1056316
 #> 
 #> ── Postprocessor ───────────────────────────────────────────────────────────
 #> 
@@ -188,11 +189,11 @@ two_week_ahead$predictions
 #> # A tibble: 56 × 5
 #>    geo_value .pred        .pred_distn forecast_date target_date
 #>    <chr>     <dbl>             <dist> <date>        <date>     
-#>  1 ak        0.449 quantiles(0.45)[2] 2021-12-31    2022-01-14 
+#>  1 ak        0.448 quantiles(0.45)[2] 2021-12-31    2022-01-14 
 #>  2 al        0.574 quantiles(0.57)[2] 2021-12-31    2022-01-14 
 #>  3 ar        0.673 quantiles(0.67)[2] 2021-12-31    2022-01-14 
 #>  4 as        0     quantiles(0.12)[2] 2021-12-31    2022-01-14 
-#>  5 az        0.679 quantiles(0.68)[2] 2021-12-31    2022-01-14 
+#>  5 az        0.678 quantiles(0.68)[2] 2021-12-31    2022-01-14 
 #>  6 ca        0.575 quantiles(0.57)[2] 2021-12-31    2022-01-14 
 #>  7 co        0.862 quantiles(0.86)[2] 2021-12-31    2022-01-14 
 #>  8 ct        1.07  quantiles(1.07)[2] 2021-12-31    2022-01-14 

@@ -1,4 +1,5 @@
 Version: 1.0
+ProjectId: 51641a05-0347-438c-a50e-466a31e886c2
 
 RestoreWorkspace: No
 SaveWorkspace: No