Merge pull request #7 from kenmawer/km-issue_22_fix

kenmawer · web-flow · commit 492234ba8c73 · 2022-05-27T11:30:16.000-07:00
Km issue 22 fix
diff --git a/R/epi_ahead.R b/R/epi_ahead.R
@@ -0,0 +1,163 @@
+#' Create a leading outcome
+#'
+#' `step_epi_ahead` creates a *specification* of a recipe step that
+#'   will add new columns of leading data. Leading data will
+#'   by default include NA values where the lead was induced.
+#'   These can be removed with [step_naomit()], or you may
+#'   specify an alternative filler value with the `default`
+#'   argument.
+#'
+#' @param recipe A recipe object. The step will be added to the
+#'  sequence of operations for this recipe.
+#' @param ... One or more selector functions to choose variables
+#'  for this step. See [selections()] for more details.
+#' @param role For model terms created by this step, what analysis role should
+#'  they be assigned?
+#' @param trained A logical to indicate if the quantities for
+#'  preprocessing have been estimated.
+#' @param ahead A vector of positive integers. Each specified column will be
+#'  led once for each value in the vector.
+#' @param prefix A prefix for generated column names; defaults to "ahead_".
+#' @param default Determines what fills empty rows
+#'   left by leading/lagging (defaults to NA).
+#' @param keys A character vector of the keys in an `epi_df`.
+#' @param columns A character string of variable names that will
+#'  be populated (eventually) by the `terms` argument.
+#' @param skip A logical. Should the step be skipped when the
+#'  recipe is baked by [bake()]? While all operations are baked
+#'  when [prep()] is run, some operations may not be able to be
+#'  conducted on new data (e.g. processing the outcome variable(s)).
+#'  Care should be taken when using `skip = TRUE` as it may affect
+#'  the computations for subsequent operations.
+#' @param id A character string that is unique to this step to identify it.
+#' @template step-return
+#'
+#' @details The step assumes that the data are already _in the proper sequential
+#'  order_ for leading.
+#'
+#' @family row operation steps
+#' @export
+#'
+#' @examples
+#' tib <- tibble::tibble(
+#'   x = 1:5, y = 1:5,
+#'   time_value = seq(as.Date("2020-01-01"), by = 1, length.out = 5),
+#'   geo_value = "ca"
+#'   ) %>% epiprocess::as_epi_df()
+#'
+#' library(recipes)
+#' epi_recipe(y ~ x, data = tib) %>%
+#'   step_epi_lag(x, lag = 2:3) %>%
+#'   step_epi_ahead(y, ahead = 1) %>%
+#'   prep(tib) %>%
+#'   bake(tib)
+step_epi_ahead <-
+  function(recipe,
+           ...,
+           role = "outcome",
+           trained = FALSE,
+           ahead = 1,
+           prefix = "ahead_",
+           default = NA,
+           keys = epi_keys(recipe),
+           columns = NULL,
+           skip = FALSE,
+           id = rand_id("epi_ahead")) {
+    add_step(
+      recipe,
+      step_epi_ahead_new(
+        terms = dplyr::enquos(...),
+        role = role,
+        trained = trained,
+        ahead = ahead,
+        prefix = prefix,
+        default = default,
+        keys = keys,
+        columns = columns,
+        skip = skip,
+        id = id
+      )
+    )
+  }
+
+step_epi_ahead_new <-
+  function(terms, role, trained, ahead, prefix, default, keys,
+           columns, skip, id) {
+    step(
+      subclass = "epi_ahead",
+      terms = terms,
+      role = role,
+      trained = trained,
+      ahead = ahead,
+      prefix = prefix,
+      default = default,
+      keys = keys,
+      columns = columns,
+      skip = skip,
+      id = id
+    )
+  }
+
+#' @export
+prep.step_epi_ahead <- function(x, training, info = NULL, ...) {
+  step_epi_ahead_new(
+    terms = x$terms,
+    role = x$role,
+    trained = TRUE,
+    ahead = x$ahead,
+    prefix = x$prefix,
+    default = x$default,
+    keys = x$keys,
+    columns = recipes_eval_select(x$terms, training, info),
+    skip = x$skip,
+    id = x$id
+  )
+}
+
+#' @export
+bake.step_epi_ahead <- function(object, new_data, ...) {
+  if (!all(object$ahead == as.integer(object$ahead))) {
+    rlang::abort("step_epi_ahead requires 'ahead' argument to be integer valued.")
+  }
+
+  grid <- tidyr::expand_grid(
+    col = object$columns, lag_val = -object$ahead) %>%
+    dplyr::mutate(
+      ahead_val = -lag_val,
+      newname = glue::glue("{object$prefix}{ahead_val}_{col}")
+    ) %>%
+    dplyr::select(-ahead_val)
+
+  ## ensure no name clashes
+  new_data_names <- colnames(new_data)
+  intersection <- new_data_names %in% grid$newname
+  if (any(intersection)) {
+    rlang::abort(
+      paste0("Name collision occured in `", class(object)[1],
+             "`. The following variable names already exists: ",
+             paste0(new_data_names[intersection], collapse = ", "),
+             "."))
+  }
+
+  ok <- object$keys
+  lagged <- purrr::reduce(
+    purrr::pmap(grid, epi_shift_single, x = new_data, key_cols = ok),
+    dplyr::full_join,
+    by = ok
+  )
+
+  dplyr::full_join(new_data, lagged, by = ok) %>%
+    dplyr::group_by(dplyr::across(dplyr::all_of(ok[-1]))) %>%
+    dplyr::arrange(time_value) %>%
+    dplyr::ungroup()
+
+}
+
+#' @export
+print.step_epi_ahead <-
+  function(x, width = max(20, options()$width - 30), ...) {
+    ## TODO add printing of the lags
+    title <- "Leading "
+    recipes::print_step(x$columns, x$terms, x$trained, title, width)
+    invisible(x)
+  }
diff --git a/R/epi_lag.R b/R/epi_lag.R
@@ -16,19 +16,19 @@
 #'
 #' @family row operation steps
 #' @export
-#' @rdname step_epi_lag
+#' @rdname step_epi_ahead
 step_epi_lag <-
   function(recipe,
            ...,
            role = "predictor",
            trained = FALSE,
            lag = 1,
-           prefix = ifelse(lag >= 0, "lag_","ahead_"),
+           prefix = "lag_",
            default = NA,
            keys = epi_keys(recipe),
            columns = NULL,
            skip = FALSE,
-           id = rand_id(ifelse(lag >= 0, "epi_lag","epi_ahead"))) {
+           id = rand_id("epi_lag")) {
     add_step(
       recipe,
       step_epi_lag_new(
@@ -50,7 +50,7 @@ step_epi_lag_new <-
   function(terms, role, trained, lag, prefix, default, keys,
            columns, skip, id) {
     step(
-      subclass = "step_epi_lag",
+      subclass = "epi_lag",
       terms = terms,
       role = role,
       trained = trained,
@@ -86,10 +86,7 @@ bake.step_epi_lag <- function(object, new_data, ...) {
     rlang::abort("step_epi_lag requires 'lag' argument to be integer valued.")
   }
 
-  is_neg <- object$lag < 0
-
-  grid <- tidyr::expand_grid(col = object$columns,
-                             lag_val = object$lag) %>%
+  grid <- tidyr::expand_grid(col = object$columns, lag_val = object$lag) %>%
     dplyr::mutate(newname = glue::glue("{object$prefix}{lag_val}_{col}"))
 
   ## ensure no name clashes
@@ -120,7 +117,7 @@ bake.step_epi_lag <- function(object, new_data, ...) {
 print.step_epi_lag <-
   function(x, width = max(20, options()$width - 30), ...) {
     ## TODO add printing of the lags
-    title <- ifelse(x$lag >= 0, "Lagging", "Leading")
+    title <- "Lagging "
     recipes::print_step(x$columns, x$terms, x$trained, title, width)
     invisible(x)
   }
diff --git a/musings/example-recipe.R b/musings/example-recipe.R
@@ -36,7 +36,7 @@ xx <- x %>% filter(time_value > "2021-12-01")
 # Baseline AR3
 r <- epi_recipe(x) %>% # if we add this as a class, maybe we get better
                        # behaviour downstream?
-  step_epi_lag(death_rate, lag = -7) %>%
+  step_epi_ahead(death_rate, ahead = 7) %>%
   step_epi_lag(death_rate, lag = c(0, 7, 14)) %>%
   step_epi_lag(case_rate, lag = c(0, 7, 14)) %>%
   step_naomit(all_predictors()) %>%