diff --git a/R/archive.R b/R/archive.R index 2f6af5e0..61eca1af 100644 --- a/R/archive.R +++ b/R/archive.R @@ -584,7 +584,7 @@ epi_archive = #' details. #' @importFrom data.table key #' @importFrom rlang !! !!! enquo quo_is_missing enquos is_quosure sym syms - slide = function(f, ..., n, group_by, ref_time_values, + slide = function(f, ..., before, group_by, ref_time_values, time_step, new_col_name = "slide_value", as_list_col = FALSE, names_sep = "_", all_rows = FALSE) { @@ -597,10 +597,22 @@ epi_archive = ref_time_values = ref_time_values[ref_time_values %in% unique(self$DT$time_value)] } - + + # Validate and pre-process `before`: + if (missing(before)) { + Abort("`before` is required (and must be passed by name); + if you did not want to apply a sliding window but rather + to map `as_of` and `f` across various `ref_time_values`, + pass a large `before` value (e.g., if time steps are days, + `before=365000`).") + } + before <- vctrs::vec_cast(before, integer()) + if (length(before) != 1L || is.na(before) || before < 0L) { + Abort("`before` must be length-1, non-NA, non-negative") + } + # If a custom time step is specified, then redefine units - before_num = n-1 - if (!missing(time_step)) before_num = time_step(n-1) + if (!missing(time_step)) before <- time_step(before) # What to group by? If missing, set according to internal keys; # otherwise, tidyselect. @@ -673,12 +685,13 @@ epi_archive = if (!missing(f)) { if (rlang::is_formula(f)) f = rlang::as_function(f) - x = purrr::map_dfr(ref_time_values, function(t) { - self$as_of(t, min_time_value = t - before_num) %>% + x = purrr::map_dfr(ref_time_values, function(ref_time_value) { + self$as_of(ref_time_value, + min_time_value = ref_time_value - before) %>% dplyr::group_by(!!!group_by) %>% dplyr::group_modify(comp_one_grp, f = f, ..., - time_value = t, + time_value = ref_time_value, key_vars = key_vars, new_col = new_col, .keep = TRUE) %>% @@ -700,12 +713,13 @@ epi_archive = f = function(x, quo, ...) rlang::eval_tidy(quo, x) new_col = sym(names(rlang::quos_auto_name(quos))) - x = purrr::map_dfr(ref_time_values, function(t) { - self$as_of(t, min_time_value = t - before_num) %>% + x = purrr::map_dfr(ref_time_values, function(ref_time_value) { + self$as_of(ref_time_value, + min_time_value = ref_time_value - before) %>% dplyr::group_by(!!!group_by) %>% dplyr::group_modify(comp_one_grp, f = f, quo = quo, - time_value = t, + time_value = ref_time_value, key_vars = key_vars, new_col = new_col, .keep = TRUE) %>% diff --git a/R/methods-epi_archive.R b/R/methods-epi_archive.R index 3387c935..6cffa144 100644 --- a/R/methods-epi_archive.R +++ b/R/methods-epi_archive.R @@ -358,10 +358,21 @@ epix_merge = function(x, y, #' @param ... Additional arguments to pass to the function or formula specified #' via `f`. Alternatively, if `f` is missing, then the current argument is #' interpreted as an expression for tidy evaluation. -#' @param n Number of time steps to use in the running window. For example, if -#' `n = 7`, and one time step is one day, then to produce a value on January 7 -#' we apply the given function or formula to data in between January 1 and -#' 7. +#' @param before How far `before` each `ref_time_value` should the sliding +#' window extend? If provided, should be a single, non-NA, +#' [integer-compatible][vctrs::vec_cast] number of time steps. This window +#' endpoint is inclusive. For example, if `before = 7`, and one time step is +#' one day, then to produce a value for a `ref_time_value` of January 8, we +#' apply the given function or formula to data (for each group present) with +#' `time_value`s from January 1 onward, as they were reported on January 8. +#' For typical disease surveillance sources, this will not include any data +#' with a `time_value` of January 8, and, depending on the amount of reporting +#' latency, may not include January 7 or even earlier `time_value`s. (If +#' instead the archive were to hold nowcasts instead of regular surveillance +#' data, then we would indeed expect data for `time_value` January 8. If it +#' were to hold forecasts, then we would expect data for `time_value`s after +#' January 8, and the sliding window would extend as far after each +#' `ref_time_value` as needed to include all such `time_value`s.) #' @param group_by The variable(s) to group by before slide computation. If #' missing, then the keys in the underlying data table, excluding `time_value` #' and `version`, will be used for grouping. To omit a grouping entirely, use @@ -396,10 +407,14 @@ epix_merge = function(x, y, #' values. #' #' @details Two key distinctions between inputs to the current function and -#' `epi_slide()`: -#' 1. `epix_slide()` uses windows that are **always right-aligned** (in -#' `epi_slide()`, custom alignments could be specified using the `align` or -#' `before` arguments). +#' [`epi_slide()`]: +#' 1. `epix_slide()` doesn't accept an `after` argument; its windows extend +#' from `before` time steps before a given `ref_time_value` through the last +#' `time_value` available as of version `ref_time_value` (typically, this +#' won't include `ref_time_value` itself, as observations about a particular +#' time interval (e.g., day) are only published after that time interval ends); +#' `epi_slide` windows extend from `before` time steps before a +#' `ref_time_value` through `after` time steps after `ref_time_value`. #' 2. `epix_slide()` uses a `group_by` to specify the grouping upfront (in #' `epi_slide()`, this would be accomplished by a preceding function call to #' `dplyr::group_by()`). @@ -422,11 +437,11 @@ epix_merge = function(x, y, #' Finally, this is simply a wrapper around the `slide()` method of the #' `epi_archive` class, so if `x` is an `epi_archive` object, then: #' ``` -#' epix_slide(x, new_var = comp(old_var), n = 120) +#' epix_slide(x, new_var = comp(old_var), before = 119) #' ``` #' is equivalent to: #' ``` -#' x$slide(x, new_var = comp(old_var), n = 120) +#' x$slide(new_var = comp(old_var), before = 119) #' ``` #' #' @importFrom rlang enquo @@ -444,15 +459,15 @@ epix_merge = function(x, y, #' by = "1 day") #' epix_slide(x = archive_cases_dv_subset, #' f = ~ mean(.x$case_rate_7d_av), -#' n = 3, +#' before = 2, #' group_by = geo_value, #' ref_time_values = time_values, #' new_col_name = 'case_rate_3d_av') -epix_slide = function(x, f, ..., n, group_by, ref_time_values, +epix_slide = function(x, f, ..., before, group_by, ref_time_values, time_step, new_col_name = "slide_value", as_list_col = FALSE, names_sep = "_", all_rows = FALSE) { if (!inherits(x, "epi_archive")) Abort("`x` must be of class `epi_archive`.") - return(x$slide(f, ..., n = n, + return(x$slide(f, ..., before = before, group_by = {{group_by}}, ref_time_values = ref_time_values, time_step = time_step, diff --git a/man/epi_archive.Rd b/man/epi_archive.Rd index 026f27e1..998ade9e 100644 --- a/man/epi_archive.Rd +++ b/man/epi_archive.Rd @@ -277,7 +277,7 @@ details. \if{html}{\out{
}}\preformatted{epi_archive$slide( f, ..., - n, + before, group_by, ref_time_values, time_step, diff --git a/man/epix_slide.Rd b/man/epix_slide.Rd index 79e9c1c3..bbeb51d4 100644 --- a/man/epix_slide.Rd +++ b/man/epix_slide.Rd @@ -8,7 +8,7 @@ epix_slide( x, f, ..., - n, + before, group_by, ref_time_values, time_step, @@ -34,10 +34,21 @@ sliding window of \code{n} time steps.} via \code{f}. Alternatively, if \code{f} is missing, then the current argument is interpreted as an expression for tidy evaluation.} -\item{n}{Number of time steps to use in the running window. For example, if -\code{n = 7}, and one time step is one day, then to produce a value on January 7 -we apply the given function or formula to data in between January 1 and -7.} +\item{before}{How far \code{before} each \code{ref_time_value} should the sliding +window extend? If provided, should be a single, non-NA, +\link[vctrs:vec_cast]{integer-compatible} number of time steps. This window +endpoint is inclusive. For example, if \code{before = 7}, and one time step is +one day, then to produce a value for a \code{ref_time_value} of January 8, we +apply the given function or formula to data (for each group present) with +\code{time_value}s from January 1 onward, as they were reported on January 8. +For typical disease surveillance sources, this will not include any data +with a \code{time_value} of January 8, and, depending on the amount of reporting +latency, may not include January 7 or even earlier \code{time_value}s. (If +instead the archive were to hold nowcasts instead of regular surveillance +data, then we would indeed expect data for \code{time_value} January 8. If it +were to hold forecasts, then we would expect data for \code{time_value}s after +January 8, and the sliding window would extend as far after each +\code{ref_time_value} as needed to include all such \code{time_value}s.)} \item{group_by}{The variable(s) to group by before slide computation. If missing, then the keys in the underlying data table, excluding \code{time_value} @@ -89,11 +100,15 @@ examples. } \details{ Two key distinctions between inputs to the current function and -\code{epi_slide()}: +\code{\link[=epi_slide]{epi_slide()}}: \enumerate{ -\item \code{epix_slide()} uses windows that are \strong{always right-aligned} (in -\code{epi_slide()}, custom alignments could be specified using the \code{align} or -\code{before} arguments). +\item \code{epix_slide()} doesn't accept an \code{after} argument; its windows extend +from \code{before} time steps before a given \code{ref_time_value} through the last +\code{time_value} available as of version \code{ref_time_value} (typically, this +won't include \code{ref_time_value} itself, as observations about a particular +time interval (e.g., day) are only published after that time interval ends); +\code{epi_slide} windows extend from \code{before} time steps before a +\code{ref_time_value} through \code{after} time steps after \code{ref_time_value}. \item \code{epix_slide()} uses a \code{group_by} to specify the grouping upfront (in \code{epi_slide()}, this would be accomplished by a preceding function call to \code{dplyr::group_by()}). @@ -117,12 +132,12 @@ version-aware sliding is necessary (as it its purpose). Finally, this is simply a wrapper around the \code{slide()} method of the \code{epi_archive} class, so if \code{x} is an \code{epi_archive} object, then: -\if{html}{\out{
}}\preformatted{epix_slide(x, new_var = comp(old_var), n = 120) +\if{html}{\out{
}}\preformatted{epix_slide(x, new_var = comp(old_var), before = 119) }\if{html}{\out{
}} is equivalent to: -\if{html}{\out{
}}\preformatted{x$slide(x, new_var = comp(old_var), n = 120) +\if{html}{\out{
}}\preformatted{x$slide(new_var = comp(old_var), before = 119) }\if{html}{\out{
}} } \examples{ @@ -138,7 +153,7 @@ time_values <- seq(as.Date("2020-06-01"), by = "1 day") epix_slide(x = archive_cases_dv_subset, f = ~ mean(.x$case_rate_7d_av), - n = 3, + before = 2, group_by = geo_value, ref_time_values = time_values, new_col_name = 'case_rate_3d_av') diff --git a/tests/testthat/test-epix_fill_through_version.R b/tests/testthat/test-epix_fill_through_version.R index 03e9c504..1d78bf49 100644 --- a/tests/testthat/test-epix_fill_through_version.R +++ b/tests/testthat/test-epix_fill_through_version.R @@ -11,9 +11,10 @@ test_that("epix_fill_through_version mirrors input when it is sufficiently up to # edition 3, which is based on `waldo::compare` rather than `base::identical`; # `waldo::compare` in waldo >=0.3.1 appears (as of 0.4.0) to compare R6 # objects by contents rather than address (in a way that is tested but maybe - # not guaranteed via user docs). Use `local_edition` to ensure we use edition - # 3 here. - local_edition(3) + # not guaranteed via user docs). Use `testthat::local_edition` to ensure we + # use testthat edition 3 here (use `testthat::` to prevent ambiguity with + # `readr`). + testthat::local_edition(3) expect_identical(ea_orig, ea_trivial_fill_na1) expect_identical(ea_orig, ea_trivial_fill_na2) expect_identical(ea_orig, ea_trivial_fill_locf) @@ -30,9 +31,9 @@ test_that("epix_fill_through_version can extend observed versions, gives expecte ea_fill_na = epix_fill_through_version(ea_orig, later_unobserved_version, "na") ea_fill_locf = epix_fill_through_version(ea_orig, later_unobserved_version, "locf") - # We use edition 3 features here, passing `ignore_attr` to `waldo::compare`. - # Ensure we are using edition 3: - local_edition(3) + # We use testthat edition 3 features here, passing `ignore_attr` to + # `waldo::compare`. Ensure we are using edition 3: + testthat::local_edition(3) withCallingHandlers({ expect_identical(ea_fill_na$versions_end, later_unobserved_version) expect_identical(tibble::as_tibble(ea_fill_na$as_of(first_unobserved_version)), diff --git a/tests/testthat/test-epix_merge.R b/tests/testthat/test-epix_merge.R index 51f2c3c6..8a873926 100644 --- a/tests/testthat/test-epix_merge.R +++ b/tests/testthat/test-epix_merge.R @@ -58,7 +58,7 @@ test_that("epix_merge merges and carries forward updates properly", { ) # We rely on testthat edition 3 expect_identical using waldo, not identical. See # test-epix_fill_through_version.R comments for details. - local_edition(3) + testthat::local_edition(3) expect_identical(xy, xy_expected) }) diff --git a/tests/testthat/test-epix_slide.R b/tests/testthat/test-epix_slide.R new file mode 100644 index 00000000..9a11c64a --- /dev/null +++ b/tests/testthat/test-epix_slide.R @@ -0,0 +1,65 @@ +library(dplyr) + +test_that("epix_slide only works on an epi_archive",{ + expect_error(epix_slide(data.frame(x=1))) +}) + +x <- tibble::tribble(~version, ~time_value, ~binary, + 4, c(1:3), 2^(1:3), + 5, c(1:2,4), 2^(4:6), + 6, c(1:2,4:5), 2^(7:10), + 7, 2:6, 2^(11:15)) %>% + tidyr::unnest(c(time_value,binary)) + +xx <- bind_cols(geo_value = rep("x",15), x) %>% + as_epi_archive() + +test_that("epix_slide works as intended",{ + xx1 <- epix_slide(x = xx, + f = ~ sum(.x$binary), + before = 2, + group_by = geo_value, + new_col_name = "sum_binary") + + xx2 <- tibble(geo_value = rep("x",3), + # 7 should also be there below; this is a bug on issue #153 + time_value = c(4,5,6), + sum_binary = c(2^3+2^2, + 2^6+2^3, + 2^10+2^9)) %>% + as_epi_df(as_of = 1) # Also a bug (issue #213) + + expect_identical(xx1,xx2) # * + + xx3 <- xx$slide(f = ~ sum(.x$binary), + before = 2, + group_by = "geo_value", + new_col_name = 'sum_binary') + + expect_identical(xx1,xx3) # This and * Imply xx2 and xx3 are identical +}) + +test_that("epix_slide `before` validation works", { + expect_error(xx$slide(f = ~ sum(.x$binary)), + "`before` is required") + expect_error(xx$slide(f = ~ sum(.x$binary), before=NA), + "`before`.*NA") + expect_error(xx$slide(f = ~ sum(.x$binary), before=-1), + "`before`.*negative") + expect_error(xx$slide(f = ~ sum(.x$binary), before=1.5), + regexp="before", + class="vctrs_error_incompatible_type") + # We might want to allow this at some point (issue #219): + expect_error(xx$slide(f = ~ sum(.x$binary), before=Inf), + regexp="before", + class="vctrs_error_incompatible_type") + # (wrapper shouldn't introduce a value:) + expect_error(epix_slide(xx, f = ~ sum(.x$binary)), "`before` is required") + # These `before` values should be accepted: + expect_error(xx$slide(f = ~ sum(.x$binary), before=0), + NA) + expect_error(xx$slide(f = ~ sum(.x$binary), before=2L), + NA) + expect_error(xx$slide(f = ~ sum(.x$binary), before=365000), + NA) +}) diff --git a/tests/testthat/test-methods-epi_archive.R b/tests/testthat/test-methods-epi_archive.R index d0434f59..02630595 100644 --- a/tests/testthat/test-methods-epi_archive.R +++ b/tests/testthat/test-methods-epi_archive.R @@ -71,13 +71,13 @@ test_that("quosure passing issue in epix_slide is resolved + other potential iss compactify = TRUE) reference_by_modulus = epix_slide(x = ea, f = ~ mean(.x$case_rate_7d_av), - n = 3, + before = 2, group_by = modulus, ref_time_values = time_values, new_col_name = 'case_rate_3d_av') reference_by_both = epix_slide(x = ea, f = ~ mean(.x$case_rate_7d_av), - n = 3, + before = 2, group_by = c(geo_value, modulus), ref_time_values = time_values, new_col_name = 'case_rate_3d_av') @@ -85,7 +85,7 @@ test_that("quosure passing issue in epix_slide is resolved + other potential iss expect_identical( ea$slide( f = ~ mean(.x$case_rate_7d_av), - n = 3, + before = 2, group_by = modulus, ref_time_values = time_values, new_col_name = 'case_rate_3d_av' @@ -96,7 +96,7 @@ test_that("quosure passing issue in epix_slide is resolved + other potential iss expect_identical( epix_slide(x = ea, f = ~ mean(.x$case_rate_7d_av), - n = 3, + before = 2, group_by = "modulus", ref_time_values = time_values, new_col_name = 'case_rate_3d_av'), @@ -105,7 +105,7 @@ test_that("quosure passing issue in epix_slide is resolved + other potential iss expect_identical( ea$slide( f = ~ mean(.x$case_rate_7d_av), - n = 3, + before = 2, group_by = "modulus", ref_time_values = time_values, new_col_name = 'case_rate_3d_av' @@ -121,7 +121,7 @@ test_that("quosure passing issue in epix_slide is resolved + other potential iss expect_identical( epix_slide(x = ea, f = ~ mean(.x$case_rate_7d_av), - n = 3, + before = 2, group_by = tidyselect::all_of(my_group_by), ref_time_values = time_values, new_col_name = 'case_rate_3d_av'), @@ -130,7 +130,7 @@ test_that("quosure passing issue in epix_slide is resolved + other potential iss expect_identical( ea$slide( f = ~ mean(.x$case_rate_7d_av), - n = 3, + before = 2, group_by = tidyselect::all_of(my_group_by), ref_time_values = time_values, new_col_name = 'case_rate_3d_av' @@ -141,7 +141,7 @@ test_that("quosure passing issue in epix_slide is resolved + other potential iss expect_identical( epix_slide(x = ea, f = ~ mean(.x$case_rate_7d_av), - n = 3, + before = 2, ref_time_values = time_values, new_col_name = 'case_rate_3d_av'), reference_by_both @@ -149,7 +149,7 @@ test_that("quosure passing issue in epix_slide is resolved + other potential iss expect_identical( ea$slide( f = ~ mean(.x$case_rate_7d_av), - n = 3, + before = 2, ref_time_values = time_values, new_col_name = 'case_rate_3d_av' ), diff --git a/vignettes/advanced.Rmd b/vignettes/advanced.Rmd index 5514eaaa..8b2eb16f 100644 --- a/vignettes/advanced.Rmd +++ b/vignettes/advanced.Rmd @@ -61,12 +61,12 @@ df %>% df %>% mutate(version = time_value) %>% as_epi_archive() %>% - epix_slide(x_2dav = mean(x), n = 2, ref_time_values = as.Date("2020-06-02")) + epix_slide(x_2dav = mean(x), before = 1, ref_time_values = as.Date("2020-06-02")) df %>% mutate(version = time_value) %>% as_epi_archive() %>% - epix_slide(~ mean(.x$x), n = 2, ref_time_values = as.Date("2020-06-02")) + epix_slide(~ mean(.x$x), before = 1, ref_time_values = as.Date("2020-06-02")) ``` When the slide computation returns an atomic vector (rather than a single value) @@ -153,7 +153,7 @@ df %>% as_epi_archive() %>% epix_slide(a = data.frame(x_2dav = mean(x), x_2dma = mad(x)), ref_time_values = as.Date("2020-06-02"), - n = 2, as_list_col = FALSE, names_sep = NULL) + before = 1, as_list_col = FALSE, names_sep = NULL) ``` ## Multi-row outputs @@ -354,7 +354,7 @@ k_week_ahead <- function(x, ahead = 7, as_of = TRUE) { x %>% epix_slide(fc = prob_arx(percent_cli, case_rate_7d_av, geo_value, time_value, args = prob_arx_args(ahead = ahead)), - n = 120, ref_time_values = fc_time_values) %>% + before = 119, ref_time_values = fc_time_values) %>% mutate(target_date = time_value + ahead, as_of = TRUE, geo_value = fc_geo_value) } diff --git a/vignettes/archive.Rmd b/vignettes/archive.Rmd index b78644ad..588b0c88 100644 --- a/vignettes/archive.Rmd +++ b/vignettes/archive.Rmd @@ -357,8 +357,9 @@ fc_time_values <- seq(as.Date("2020-08-01"), as.Date("2021-12-01"), by = "1 month") -z <- epix_slide(x, fc = prob_arx(x = percent_cli, y = case_rate_7d_av), n = 120, - ref_time_values = fc_time_values, group_by = geo_value) +z <- epix_slide(x, fc = prob_arx(x = percent_cli, y = case_rate_7d_av), + before = 119, ref_time_values = fc_time_values, + group_by = geo_value) head(z, 10) ``` @@ -387,7 +388,7 @@ x_latest <- epix_as_of(x, max_version = max(x$DT$version)) k_week_ahead <- function(x, ahead = 7, as_of = TRUE) { if (as_of) { x %>% - epix_slide(fc = prob_arx(percent_cli, case_rate_7d_av, ahead = ahead), n = 120, + epix_slide(fc = prob_arx(percent_cli, case_rate_7d_av, ahead = ahead), before = 119, ref_time_values = fc_time_values, group_by = geo_value) %>% mutate(target_date = time_value + ahead, as_of = TRUE) } diff --git a/vignettes/compactify.Rmd b/vignettes/compactify.Rmd index 034235b3..ecb227ca 100644 --- a/vignettes/compactify.Rmd +++ b/vignettes/compactify.Rmd @@ -102,7 +102,7 @@ speeds <- rbind(speeds, speed_test(iterate_as_of,"as_of_1000x")) # Performance of slide slide_median <- function(my_ea) { - my_ea$slide(median = median(case_rate_7d_av), n = 7) + my_ea$slide(median = median(case_rate_7d_av), before = 7) } speeds <- rbind(speeds, speed_test(slide_median,"slide_median"))