From 7089c2f4c6d66e2cbaf4fd44db44a32a60e16cc6 Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Wed, 12 Feb 2025 10:44:00 -0600 Subject: [PATCH 1/3] David's suggestions --- R/climatological_forecaster.R | 2 ++ R/layer_threshold_preds.R | 1 + R/step_climate.R | 35 +++++++++++++++++++++++------- man/step_adjust_latency.Rd | 4 ++-- tests/testthat/test-step_climate.R | 7 ++++-- 5 files changed, 37 insertions(+), 12 deletions(-) diff --git a/R/climatological_forecaster.R b/R/climatological_forecaster.R index fb65d3e8a..f1f7d66e1 100644 --- a/R/climatological_forecaster.R +++ b/R/climatological_forecaster.R @@ -67,6 +67,8 @@ climatological_forecaster <- function(epi_data, horizon <- args_list$forecast_horizon window_size <- args_list$window_size time_type <- args_list$time_type + # check that the prediction time type is more granular than epi_data's + # time_type ttype_ord <- match(time_type, c("day", "epiweek", "week", "month")) ttype_ord <- ttype_ord - as.integer(ttype_ord > 2) edf_ttype_ord <- match(edf_time_type, c("day", "week", "yearmonth")) diff --git a/R/layer_threshold_preds.R b/R/layer_threshold_preds.R index 7b8ca0252..99f016128 100644 --- a/R/layer_threshold_preds.R +++ b/R/layer_threshold_preds.R @@ -61,6 +61,7 @@ layer_threshold_new <- +#' restrict various objects to the interval [lower, upper] snap <- function(x, lower, upper, ...) { UseMethod("snap") } diff --git a/R/step_climate.R b/R/step_climate.R index 2f88ca94c..38a5a0ec5 100644 --- a/R/step_climate.R +++ b/R/step_climate.R @@ -1,13 +1,14 @@ #' Calculate a climatological variable based on the history #' -#' `step_climate()` creates a *specification* of a recipe step -#' that will generate one or more new columns of derived data. This step -#' examines all available seasons in the training data and calculates the -#' a measure of center for the "typical" season. Think of this like with -#' the weather: to predict the temperature in January in Pittsburgh, PA, -#' I might look at all previous January's on record, average their temperatures, -#' and include that in my model. So it is important to _align_ the forecast -#' horizon with the climate. See the details for more information. +#' `step_climate()` creates a *specification* of a recipe step that will +#' generate one or more new columns of derived data. This step examines all +#' available seasons in the training data and calculates the a measure of center +#' for the "typical" season. Think of this like with the weather: to predict the +#' temperature in January in Pittsburgh, PA, I might look at all previous +#' January's on record, average their temperatures, and include that in my +#' model. So it is important to _align_ the forecast horizon with the climate. +#' This step will work best if added after `step_epi_ahead()`, but that is not +#' strictly required. See the details for more information. #' #' @details #' Construction of a climate predictor can be helpful with strongly seasonal @@ -101,6 +102,16 @@ #' r %>% #' prep(covid_case_death_rates) %>% #' bake(new_data = NULL) +#' +#' # switching the order is possible if you specify `forecast_ahead` +#' r <- epi_recipe(covid_case_death_rates) %>% +#' step_climate(death_rate, forecast_ahead = 7, time_type = "day") %>% +#' step_epi_ahead(death_rate, ahead = 7) %>% +#' r() +#' +#' r %>% +#' prep(covid_case_death_rates) %>% +#' bake(new_data = NULL) step_climate <- function(recipe, ..., @@ -315,6 +326,14 @@ print.step_climate <- function(x, width = max(20, options()$width - 30), ...) { invisible(x) } +#' group col by .idx values and sum windows around each .idx value +#' @param .idx the relevant periodic part of time value, e.g. the week number +#' @param col the list of values indexed by `.idx` +#' @param weights how much to weigh each particular datapoint +#' @param aggr the aggregation function, probably Quantile, mean or median +#' @param window_size the number of .idx entries before and after to include in +#' the aggregation +#' @param modulus the maximum value of `.idx` roll_modular_multivec <- function(col, .idx, weights, aggr, window_size, modulus) { tib <- tibble(col = col, weights = weights, .idx = .idx) |> arrange(.idx) |> diff --git a/man/step_adjust_latency.Rd b/man/step_adjust_latency.Rd index 75d674472..1fadd3ab3 100644 --- a/man/step_adjust_latency.Rd +++ b/man/step_adjust_latency.Rd @@ -267,8 +267,8 @@ while this will not: \if{html}{\out{
}}\preformatted{toy_recipe <- epi_recipe(toy_df) \%>\% step_epi_lag(a, lag=0) \%>\% step_adjust_latency(a, method = "extend_lags") -#> Warning: If `method` is "extend_lags" or "locf", then the previous `step_epi_lag`s won't -#> work with modified data. +#> Warning: If `method` is "extend_lags" or "locf", then the previous `step_epi_lag`s won't work with +#> modified data. }\if{html}{\out{
}} If you create columns that you then apply lags to (such as diff --git a/tests/testthat/test-step_climate.R b/tests/testthat/test-step_climate.R index 045789483..480eb62b5 100644 --- a/tests/testthat/test-step_climate.R +++ b/tests/testthat/test-step_climate.R @@ -10,6 +10,7 @@ test_that("roll_modular_multivec works", { Median <- function(x, w) median(x, na.rm = TRUE) # unweighted mean + # window of size 0 expected_res <- tib |> mutate(.idx = .idx %% modulus, .idx = .idx + (.idx == 0) * modulus) |> summarise(climate_pred = weighted.mean(col, w = w), .by = .idx) @@ -17,13 +18,15 @@ test_that("roll_modular_multivec works", { roll_modular_multivec(tib$col, tib$.idx, tib$w, Mean, 0, modulus), expected_res ) - w_size <- 1L + # window of size 1, which includes everything expected_res <- tibble(.idx = as.double(1:3), climate_pred = mean(tib$col)) expect_equal( roll_modular_multivec(tib$col, tib$.idx, tib$w, Mean, 1L, modulus), expected_res ) + # weighted mean + # window of size 0 tib$w <- c(1, 2, 3, 1, 2, 1, 1, 2, 2, 1) expected_res <- tib |> mutate(.idx = .idx %% modulus, .idx = .idx + (.idx == 0) * modulus) |> @@ -32,7 +35,7 @@ test_that("roll_modular_multivec works", { roll_modular_multivec(tib$col, tib$.idx, tib$w, Mean, 0, modulus), expected_res ) - tib$w <- c(1, 2, 3, 1, 2, 1, 1, 2, 2, 1) + # window of size 1 expected_res <- tibble( .idx = as.double(1:3), climate_pred = weighted.mean(tib$col, tib$w) From 60d6aeb5dff90e7ae3f7d2a647ee4472d76250fa Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Thu, 13 Feb 2025 14:14:39 -0600 Subject: [PATCH 2/3] time_type defaulting to the epi_df's time_type --- R/step_climate.R | 7 ++++++- tests/testthat/test-step_climate.R | 4 ++-- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/R/step_climate.R b/R/step_climate.R index 38a5a0ec5..6169bd2a7 100644 --- a/R/step_climate.R +++ b/R/step_climate.R @@ -117,7 +117,7 @@ step_climate <- ..., forecast_ahead = "detect", role = "predictor", - time_type = c("epiweek", "week", "month", "day"), + time_type = c("detect", "epiweek", "week", "month", "day"), center_method = c("median", "mean"), window_size = 3L, epi_keys = NULL, @@ -132,6 +132,11 @@ step_climate <- n_outcomes <- sum(recipe$var_info$role == "outcome") time_type <- rlang::arg_match(time_type) edf_time_type <- attr(recipe$template, "metadata")$time_type + if (time_type == "detect") { + time_type <- edf_time_type + } else { + edf_time_type <- attr(recipe$template, "metadata")$time_type + } if (edf_time_type == "custom") { cli_abort("This step only works with daily, weekly, or yearmonth data.") } diff --git a/tests/testthat/test-step_climate.R b/tests/testthat/test-step_climate.R index 480eb62b5..2544a77ae 100644 --- a/tests/testthat/test-step_climate.R +++ b/tests/testthat/test-step_climate.R @@ -69,7 +69,7 @@ test_that("prep/bake steps create the correct training data", { ) %>% as_epi_df() # epiweeks 1, 52, and 53 are all 1, note that there are days in wk 52, 2 in wk 53 - r <- epi_recipe(x) %>% step_climate(y) + r <- epi_recipe(x) %>% step_climate(y, time_type = "epiweek") p <- prep(r, x) expected_res <- tibble(.idx = 1:53, climate_y = c(2, 2:25, 25, 25, 25:2, 2, 2)) @@ -97,7 +97,7 @@ test_that("leading the climate predictor works as expected", { r <- epi_recipe(x) %>% step_epi_ahead(y, ahead = 14L) %>% step_epi_lag(y, lag = c(0, 7L, 14L)) %>% - step_climate(y, forecast_ahead = 2L) %>% + step_climate(y, forecast_ahead = 2L, time_type = "epiweek") %>% # matches the response step_epi_naomit() p <- prep(r, x) From 759a3b80ea02c389b58cae729c7283c71b58e453 Mon Sep 17 00:00:00 2001 From: David Weber Date: Fri, 14 Feb 2025 12:41:33 -0800 Subject: [PATCH 3/3] unneeded else branch Co-authored-by: Daniel McDonald --- R/step_climate.R | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/R/step_climate.R b/R/step_climate.R index 6169bd2a7..db66aa32d 100644 --- a/R/step_climate.R +++ b/R/step_climate.R @@ -132,11 +132,7 @@ step_climate <- n_outcomes <- sum(recipe$var_info$role == "outcome") time_type <- rlang::arg_match(time_type) edf_time_type <- attr(recipe$template, "metadata")$time_type - if (time_type == "detect") { - time_type <- edf_time_type - } else { - edf_time_type <- attr(recipe$template, "metadata")$time_type - } + if (time_type == "detect") time_type <- edf_time_type if (edf_time_type == "custom") { cli_abort("This step only works with daily, weekly, or yearmonth data.") }