diff --git a/DESCRIPTION b/DESCRIPTION index f78076c3..81f1871e 100755 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -31,7 +31,6 @@ Imports: data.table, dplyr (>= 1.0.8), genlasso, - glue, ggplot2, glue, lifecycle (>= 1.0.1), diff --git a/R/autoplot.R b/R/autoplot.R index 7443628b..23f480fe 100644 --- a/R/autoplot.R +++ b/R/autoplot.R @@ -47,8 +47,8 @@ autoplot.epi_df <- function( .facet_by = c(".response", "other_keys", "all_keys", "geo_value", "all", "none"), .base_color = "#3A448F", .max_facets = Inf) { - .color_by <- match.arg(.color_by) - .facet_by <- match.arg(.facet_by) + .color_by <- rlang::arg_match(.color_by) + .facet_by <- rlang::arg_match(.facet_by) assert(anyInfinite(.max_facets), checkInt(.max_facets), combine = "or") assert_character(.base_color, len = 1) diff --git a/R/correlation.R b/R/correlation.R index 5e9694c4..e86ad373 100644 --- a/R/correlation.R +++ b/R/correlation.R @@ -99,7 +99,7 @@ epi_cor <- function(x, var1, var2, dt1 = 0, dt2 = 0, shift_by = geo_value, # nol shift_by <- syms(names(eval_select(enquo(shift_by), x))) # Which method? - method <- match.arg(method) + method <- rlang::arg_match(method) # Perform time shifts, then compute appropriate correlations and return return(x %>% diff --git a/R/growth_rate.R b/R/growth_rate.R index 4537375d..d8264fd2 100644 --- a/R/growth_rate.R +++ b/R/growth_rate.R @@ -120,7 +120,7 @@ growth_rate <- function(x = seq_along(y), y, x0 = x, # Check x, y, x0 if (length(x) != length(y)) cli_abort("`x` and `y` must have the same length.") if (!all(x0 %in% x)) cli_abort("`x0` must be a subset of `x`.") - method <- match.arg(method) + method <- rlang::arg_match(method) # Arrange in increasing order of x o <- order(x) diff --git a/R/outliers.R b/R/outliers.R index 3d0ff5e5..8be492dd 100644 --- a/R/outliers.R +++ b/R/outliers.R @@ -89,7 +89,7 @@ detect_outlr <- function(x = seq_along(y), y, ), combiner = c("median", "mean", "none")) { # Validate combiner - combiner <- match.arg(combiner) + combiner <- rlang::arg_match(combiner) # Validate that x contains all distinct values if (any(duplicated(x))) { @@ -189,7 +189,7 @@ detect_outlr_rm <- function(x = seq_along(y), y, n = 21, # Calculate lower and upper thresholds and replacement value z <- z %>% - epi_slide(fitted = median(y), before = floor((n - 1) / 2), after = ceiling((n - 1) / 2)) %>% + epi_slide(fitted = median(y), .window_size = n, .align = "center") %>% dplyr::mutate(resid = y - fitted) %>% roll_iqr( n = n, @@ -360,8 +360,7 @@ roll_iqr <- function(z, n, detection_multiplier, min_radius, z %>% epi_slide( roll_iqr = stats::IQR(resid), - before = floor((n - 1) / 2), - after = ceiling((n - 1) / 2) + .window_size = n, .align = "center" ) %>% dplyr::mutate( lower = pmax( diff --git a/R/slide.R b/R/slide.R index c27d7cea..91cebd2b 100644 --- a/R/slide.R +++ b/R/slide.R @@ -5,31 +5,29 @@ #' for examples. #' #' @template basic-slide-params -#' @param f Function, formula, or missing; together with `...` specifies the +#' @param .f Function, formula, or missing; together with `...` specifies the #' computation to slide. To "slide" means to apply a computation within a #' sliding (a.k.a. "rolling") time window for each data group. The window is #' determined by the `before` and `after` parameters described below. One time #' step is typically one day or one week; see details for more explanation. If -#' a function, `f` must take a data frame with the same column names as -#' the original object, minus any grouping variables, containing the time -#' window data for one group-`ref_time_value` combination; followed by a -#' one-row tibble containing the values of the grouping variables for the -#' associated group; followed by any number of named arguments. If a formula, -#' `f` can operate directly on columns accessed via `.x$var` or `.$var`, as -#' in `~mean(.x$var)` to compute a mean of a column `var` for each +#' a function, `.f` must take a data frame with the same column names as the +#' original object, minus any grouping variables, containing the time window +#' data for one group-`.ref_time_value` combination; followed by a one-row +#' tibble containing the values of the grouping variables for the associated +#' group; followed by any number of named arguments. If a formula, `.f` can +#' operate directly on columns accessed via `.x$var` or `.$var`, as in +#' `~mean(.x$var)` to compute a mean of a column `var` for each #' `ref_time_value`-group combination. The group key can be accessed via `.y`. -#' If `f` is missing, then `...` will specify the computation. +#' If `.f` is missing, then `...` will specify the computation. #' @param ... Additional arguments to pass to the function or formula specified -#' via `f`. Alternatively, if `f` is missing, then the `...` is interpreted as -#' a ["data-masking"][rlang::args_data_masking] expression or expressions for -#' tidy evaluation; in addition to referring columns directly by name, the +#' via `.f`. Alternatively, if `.f` is missing, then the `...` is interpreted +#' as a ["data-masking"][rlang::args_data_masking] expression or expressions +#' for tidy evaluation; in addition to referring columns directly by name, the #' expressions have access to `.data` and `.env` pronouns as in `dplyr` verbs, #' and can also refer to `.x`, `.group_key`, and `.ref_time_value`. See #' details. -#' @param new_col_name String indicating the name of the new column that will -#' contain the derivative values. The default is "slide_value" unless your -#' slide computations output data frames, in which case they will be unpacked -#' into the constituent columns and those names used. Note that setting +#' @param .new_col_name String indicating the name of the new column that will +#' contain the derivative values. Default is "slide_value"; note that setting #' `new_col_name` equal to an existing column name will overwrite this column. #' #' @template basic-slide-details @@ -45,32 +43,28 @@ #' # the `epi_slide_mean` and `epi_slide_sum` functions instead. #' jhu_csse_daily_subset %>% #' group_by(geo_value) %>% -#' epi_slide(cases_7dav = mean(cases), before = 6) %>% -#' # Remove a nonessential var. to ensure new col is printed +#' epi_slide(cases_7dav = mean(cases), .window_size = 7) %>% #' dplyr::select(geo_value, time_value, cases, cases_7dav) %>% #' ungroup() #' #' # slide a 7-day leading average #' jhu_csse_daily_subset %>% #' group_by(geo_value) %>% -#' epi_slide(cases_7dav = mean(cases), after = 6) %>% -#' # Remove a nonessential var. to ensure new col is printed +#' epi_slide(cases_7dav = mean(cases), .window_size = 7, .align = "left") %>% #' dplyr::select(geo_value, time_value, cases, cases_7dav) %>% #' ungroup() #' #' # slide a 7-day centre-aligned average #' jhu_csse_daily_subset %>% #' group_by(geo_value) %>% -#' epi_slide(cases_7dav = mean(cases), before = 3, after = 3) %>% -#' # Remove a nonessential var. to ensure new col is printed +#' epi_slide(cases_7dav = mean(cases), .window_size = 7, .align = "center") %>% #' dplyr::select(geo_value, time_value, cases, cases_7dav) %>% #' ungroup() #' #' # slide a 14-day centre-aligned average #' jhu_csse_daily_subset %>% #' group_by(geo_value) %>% -#' epi_slide(cases_14dav = mean(cases), before = 6, after = 7) %>% -#' # Remove a nonessential var. to ensure new col is printed +#' epi_slide(cases_14dav = mean(cases), .window_size = 14, .align = "center") %>% #' dplyr::select(geo_value, time_value, cases, cases_14dav) %>% #' ungroup() #' @@ -78,80 +72,119 @@ #' jhu_csse_daily_subset %>% #' group_by(geo_value) %>% #' epi_slide( -#' a = data.frame( +#' cases_2d = list(data.frame( #' cases_2dav = mean(cases), #' cases_2dma = mad(cases) -#' ), -#' before = 1, as_list_col = TRUE +#' )), +#' .window_size = 2 #' ) %>% #' ungroup() -epi_slide <- function(x, f, ..., before = NULL, after = NULL, ref_time_values = NULL, - new_col_name = NULL, all_rows = FALSE, - as_list_col = deprecated(), names_sep = deprecated()) { - assert_class(x, "epi_df") +epi_slide <- function( + .x, .f, ..., + .window_size = 1, .align = c("right", "center", "left"), + .ref_time_values = NULL, .new_col_name = NULL, .all_rows = FALSE) { + # Argument deprecation handling + provided_args <- rlang::call_args_names(rlang::call_match()) + if (any(purrr::map_lgl(provided_args, ~ .x %in% c("x", "f", "ref_time_values", "new_col_name", "all_rows")))) { + cli::cli_abort( + "epi_slide: you are using one of the following old argument names: `x`, `f`, `ref_time_values`, + `new_col_name`, or `all_rows`. Please use the new dot-prefixed names: `.x`, `.f`, `.ref_time_values`, + `.new_col_name`, `.all_rows`." + ) + } + if ("as_list_col" %in% provided_args) { + cli::cli_abort( + "epi_slide: the argument `as_list_col` is deprecated. If FALSE, you can just remove it. + If TRUE, have your given computation wrap its result using `list(result)` instead." + ) + } + if ("names_sep" %in% provided_args) { + cli::cli_abort( + "epi_slide: the argument `names_sep` is deprecated. If NULL, you can remove it, it is now default. + If a string, please manually prefix your column names instead." + ) + } + if ("before" %in% provided_args || "after" %in% provided_args) { + cli::cli_abort( + "epi_slide: `before` and `after` are deprecated for `epi_slide`. Use `.window_size` and `.align` instead. + See the slide documentation for more details." + ) + } + + # Function body starts + assert_class(.x, "epi_df") - if (nrow(x) == 0L) { - return(x) + if (nrow(.x) == 0L) { + return(.x) } - if (is.null(ref_time_values)) { - ref_time_values <- unique(x$time_value) + if (is.null(.ref_time_values)) { + .ref_time_values <- unique(.x$time_value) } else { - assert_numeric(ref_time_values, min.len = 1L, null.ok = FALSE, any.missing = FALSE) - if (!test_subset(ref_time_values, unique(x$time_value))) { + assert_numeric(.ref_time_values, min.len = 1L, null.ok = FALSE, any.missing = FALSE) + if (!test_subset(.ref_time_values, unique(.x$time_value))) { cli_abort( "`ref_time_values` must be a unique subset of the time values in `x`.", class = "epi_slide__invalid_ref_time_values" ) } - if (anyDuplicated(ref_time_values) != 0L) { + + if (anyDuplicated(.ref_time_values) != 0L) { cli_abort( - "`ref_time_values` must not contain any duplicates; use `unique` if appropriate.", + "`.ref_time_values` must not contain any duplicates; use `unique` if appropriate.", class = "epi_slide__invalid_ref_time_values" ) } } - ref_time_values <- sort(ref_time_values) - - # Handle defaults for before/after - time_type <- attr(x, "metadata")$time_type - if (is.null(before) && !is.null(after)) { - if (inherits(after, "difftime")) { - before <- as.difftime(0, units = units(after)) - } else { - before <- 0 - } - } - if (is.null(after) && !is.null(before)) { - if (inherits(before, "difftime")) { - after <- as.difftime(0, units = units(before)) - } else { - if (identical(before, Inf) && time_type %in% c("day", "week")) { + .ref_time_values <- sort(.ref_time_values) + + # Handle window arguments + align <- rlang::arg_match(.align) + time_type <- attr(.x, "metadata")$time_type + validate_slide_window_arg(.window_size, time_type) + if (identical(.window_size, Inf)) { + if (align == "right") { + before <- Inf + if (time_type %in% c("day", "week")) { after <- as.difftime(0, units = glue::glue("{time_type}s")) } else { after <- 0 } + } else { + cli_abort( + "`epi_slide`: center and left alignment are not supported with an infinite window size." + ) + } + } else { + if (align == "right") { + before <- .window_size - 1 + after <- 0 + } else if (align == "center") { + # For .window_size = 5, before = 2, after = 2. For .window_size = 4, before = 2, after = 1. + before <- floor(.window_size / 2) + after <- .window_size - before - 1 + } else if (align == "left") { + before <- 0 + after <- .window_size - 1 } } - validate_slide_window_arg(before, time_type) - validate_slide_window_arg(after, time_type, allow_inf = FALSE) # Arrange by increasing time_value - x <- arrange(x, .data$time_value) + x <- arrange(.x, .data$time_value) # Now set up starts and stops for sliding/hopping - starts <- ref_time_values - before - stops <- ref_time_values + after + starts <- .ref_time_values - before + stops <- .ref_time_values + after # If `f` is missing, interpret ... as an expression for tidy evaluation - if (missing(f)) { + if (missing(.f)) { used_data_masking <- TRUE quosures <- enquos(...) if (length(quosures) == 0) { cli_abort("If `f` is missing then a computation must be specified via `...`.") } - f <- quosures + .f <- quosures # Magic value that passes zero args as dots in calls below. Equivalent to # `... <- missing_arg()`, but use `assign` to avoid warning about # improper use of dots. @@ -160,51 +193,7 @@ epi_slide <- function(x, f, ..., before = NULL, after = NULL, ref_time_values = used_data_masking <- FALSE } - f <- as_slide_computation(f, ...) - - if (lifecycle::is_present(as_list_col)) { - if (!as_list_col) { - lifecycle::deprecate_warn("0.8.1", "epi_slide(as_list_col =)", details = "You can simply remove as_list_col = FALSE.") # nolint: line_length_linter - } else { - lifecycle::deprecate_warn("0.8.1", "epi_slide(as_list_col =)", details = "Have your computation wrap its result using `list(result)` instead, unless the `epi_slide()` row-recycling behavior would be inappropriate. Attempting to mimic the effects of such a rewrite, but you may see changes in behavior...") # nolint: line_length_linter - f_orig <- f - if (!used_data_masking) { - f <- function(...) { - list(f_orig(...)) - } - } else { - f <- function(...) { - # tidyeval pre-as_list_col-deprecation only supported a single, named, - # data-masking expr. So we should have a single column which is a packed - # data.frame, or a non-data.frame. - wrapped_result_orig <- f_orig(...) - if (length(wrapped_result_orig) != 1L) { - cli_abort("Failed to rewrite `as_list_col = TRUE`, which is deprecated: an internal bug was encountered. Please remove `as_list_col = TRUE` and update your slide computation instead.") # nolint: line_length_linter - } - name_orig <- names(wrapped_result_orig)[[1L]] - result_orig <- wrapped_result_orig[[1L]] - if (is.data.frame(result_orig)) { - # to list of rows: - result_col <- lapply(seq_len(nrow(result_orig)), function(subresult_i) { - result_orig[subresult_i, ] - }) - results_lst <- list(result_col) - } else { - results_lst <- as.list(result_orig) - } - validate_tibble(new_tibble(`names<-`(results_lst, name_orig))) - } - } - } - } - - if (lifecycle::is_present(names_sep)) { - if (is.null(names_sep)) { - lifecycle::deprecate_warn("0.8.1", "epi_slide(names_sep =)", details = "You can simply remove `names_sep = NULL`; that's now the defualt.") # nolint: line_length_linter - } else { - lifecycle::deprecate_stop("0.8.1", "epi_slide(names_sep =)", details = "Manually prefix your column names instead, or wrap the results in (return `list(result)` instead of `result` in your slide computation) and pipe into tidyr::unnest(names_sep = )") # nolint: line_length_linter - } - } + f <- as_slide_computation(.f, ...) # Create a wrapper that calculates and passes `.ref_time_value` to the # computation. `i` is contained in the `f_wrapper_factory` environment such @@ -282,6 +271,7 @@ epi_slide <- function(x, f, ..., before = NULL, after = NULL, ref_time_values = slide_values <- vctrs::list_unchop(slide_values_list) + if ( all(purrr::map_int(slide_values_list, vctrs::vec_size) == 1L) && length(slide_values_list) != 0L @@ -333,16 +323,18 @@ epi_slide <- function(x, f, ..., before = NULL, after = NULL, ref_time_values = f_factory = f_wrapper_factory, starts = starts, stops = stops, - ref_time_values = ref_time_values, - all_rows = all_rows, - new_col_name = new_col_name, + ref_time_values = .ref_time_values, + all_rows = .all_rows, + new_col_name = .new_col_name, .keep = FALSE ) + return(x) } -#' Optimized slide function for performing common rolling computations on an `epi_df` object +#' Optimized slide function for performing common rolling computations on an +#' `epi_df` object #' #' Slides an n-timestep [data.table::froll] or [slider::summary-slide] function #' over variables in an `epi_df` object. See the @@ -351,30 +343,22 @@ epi_slide <- function(x, f, ..., before = NULL, after = NULL, ref_time_values = #' #' @template basic-slide-params #' @template opt-slide-params -#' @param f Function; together with `...` specifies the computation to slide. -#' `f` must be one of `data.table`'s rolling functions +#' @param .f Function; together with `...` specifies the computation to slide. +#' `.f` must be one of `data.table`'s rolling functions #' (`frollmean`, `frollsum`, `frollapply`. See [data.table::roll]) or one #' of `slider`'s specialized sliding functions (`slide_mean`, `slide_sum`, -#' etc. See [slider::summary-slide]). To "slide" means to apply a -#' computation within a sliding (a.k.a. "rolling") time window for each data -#' group. The window is determined by the `before` and `after` parameters -#' described below. One time step is typically one day or one week; see -#' details for more explanation. +#' etc. See [slider::summary-slide]). #' #' The optimized `data.table` and `slider` functions can't be directly passed -#' as the computation function in `epi_slide` without careful handling to -#' make sure each computation group is made up of the `n` dates rather than -#' `n` points. `epi_slide_opt` (and wrapper functions `epi_slide_mean` and -#' `epi_slide_sum`) take care of window completion automatically to prevent -#' associated errors. -#' @param ... Additional arguments to pass to the slide computation `f`, for -#' example, `na.rm` and `algo` if `f` is a `data.table` function. If `f` is -#' a `data.table` function, it is automatically passed the data `x` to -#' operate on, the window size `n`, and the alignment `align`. Providing -#' these args via `...` will cause an error. If `f` is a `slider` function, -#' it is automatically passed the data `x` to operate on, and number of -#' points `before` and `after` to use in the computation. -#' +#' as the computation function in `epi_slide` without careful handling to make +#' sure each computation group is made up of the `.window_size` dates rather +#' than `.window_size` points. `epi_slide_opt` (and wrapper functions +#' `epi_slide_mean` and `epi_slide_sum`) take care of window completion +#' automatically to prevent associated errors. +#' @param ... Additional arguments to pass to the slide computation `.f`, for +#' example, `algo` or `na.rm` in data.table functions. You don't need to +#' specify `.x`, `.window_size`, or `.align` (or `before`/`after` for slider +#' functions). #' @template opt-slide-details #' #' @importFrom dplyr bind_rows mutate %>% arrange tibble select all_of @@ -393,7 +377,7 @@ epi_slide <- function(x, f, ..., before = NULL, after = NULL, ref_time_values = #' group_by(geo_value) %>% #' epi_slide_opt( #' cases, -#' f = data.table::frollmean, before = 6 +#' .f = data.table::frollmean, .window_size = 7 #' ) %>% #' # Remove a nonessential var. to ensure new col is printed, and rename new col #' dplyr::select(geo_value, time_value, cases, cases_7dav = slide_value_cases) %>% @@ -405,9 +389,9 @@ epi_slide <- function(x, f, ..., before = NULL, after = NULL, ref_time_values = #' group_by(geo_value) %>% #' epi_slide_opt( #' cases, -#' f = data.table::frollmean, before = 6, +#' .f = data.table::frollmean, .window_size = 7, #' # `frollmean` options -#' na.rm = TRUE, algo = "exact", hasNA = TRUE +#' algo = "exact", hasNA = TRUE, na.rm = TRUE #' ) %>% #' dplyr::select(geo_value, time_value, cases, cases_7dav = slide_value_cases) %>% #' ungroup() @@ -417,60 +401,73 @@ epi_slide <- function(x, f, ..., before = NULL, after = NULL, ref_time_values = #' group_by(geo_value) %>% #' epi_slide_opt( #' cases, -#' f = slider::slide_mean, after = 6 +#' .f = slider::slide_mean, .window_size = 7, .align = "left" #' ) %>% #' # Remove a nonessential var. to ensure new col is printed #' dplyr::select(geo_value, time_value, cases, cases_7dav = slide_value_cases) %>% #' ungroup() #' -#' # slide a 7-day centre-aligned sum. This can also be done with `epi_slide_sum` +#' # slide a 7-day center-aligned sum. This can also be done with `epi_slide_sum` #' jhu_csse_daily_subset %>% #' group_by(geo_value) %>% #' epi_slide_opt( #' cases, -#' f = data.table::frollsum, before = 3, after = 3 +#' .f = data.table::frollsum, .window_size = 6, .align = "center" #' ) %>% #' # Remove a nonessential var. to ensure new col is printed #' dplyr::select(geo_value, time_value, cases, cases_7dav = slide_value_cases) %>% #' ungroup() -epi_slide_opt <- function(x, col_names, f, ..., before = NULL, after = NULL, ref_time_values = NULL, - new_col_name = NULL, all_rows = FALSE, - as_list_col = deprecated(), names_sep = NULL) { - assert_class(x, "epi_df") - - if (nrow(x) == 0L) { - cli_abort( - c( - "input data `x` unexpectedly has 0 rows", - "i" = "If this computation is occuring within an `epix_slide` call, - check that `epix_slide` `ref_time_values` argument was set appropriately" - ), - class = "epiprocess__epi_slide_opt__0_row_input", - epiprocess__x = x +epi_slide_opt <- function( + .x, .col_names, .f, ..., + .window_size = 0, .align = c("right", "center", "left"), + .ref_time_values = NULL, .all_rows = FALSE) { + assert_class(.x, "epi_df") + + # Argument deprecation handling + provided_args <- rlang::call_args_names(rlang::call_match()) + if (any(purrr::map_lgl(provided_args, ~ .x %in% c("x", "col_names", "f", "ref_time_values", "all_rows")))) { + cli::cli_abort( + "epi_slide_opt: you are using one of the following old argument names: `x`, `col_names`, `f`, `ref_time_values`, + or `all_rows`. Please use the new dot-prefixed names: `.x`, `.col_names`, `.f`, + `.ref_time_values`, `.all_rows`." ) } - - if (!is.null(new_col_name)) { - cli_abort( - c( - "`new_col_name` is not supported for `epi_slide_[opt/mean/sum]`", - "i" = "If you want to customize the output column names, use [`dplyr::rename`] after the slide." - ), + if ("as_list_col" %in% provided_args) { + cli::cli_abort( + "epi_slide_opt: the argument `as_list_col` is deprecated. If FALSE, you can just remove it. + If TRUE, have your given computation wrap its result using `list(result)` instead." + ) + } + if ("before" %in% provided_args || "after" %in% provided_args) { + cli::cli_abort( + "epi_slide_opt: `before` and `after` are deprecated for `epi_slide`. Use `.window_size` and `.align` instead. + See the slide documentation for more details." + ) + } + if ("new_col_name" %in% provided_args || ".new_col_name" %in% provided_args) { + cli::cli_abort( + "epi_slide_opt: the argument `new_col_name` is not supported for `epi_slide_opt`. If you want to customize + the output column names, use `dplyr::rename` after the slide.", class = "epiprocess__epi_slide_opt__new_name_not_supported" ) } - - if (lifecycle::is_present(as_list_col)) { - lifecycle::deprecate_stop("0.8.1", "epi_slide_opt(as_list_col =)") + if ("names_sep" %in% provided_args || ".names_sep" %in% provided_args) { + cli::cli_abort( + "epi_slide_opt: the argument `names_sep` is not supported for `epi_slide_opt`. If you want to customize + the output column names, use `dplyr::rename` after the slide.", + class = "epiprocess__epi_slide_opt__name_sep_not_supported" + ) } - if (!is.null(names_sep)) { + if (nrow(.x) == 0L) { cli_abort( c( - "`names_sep` is not supported for `epi_slide_[opt/mean/sum]`", - "i" = "If you want to customize the output column names, use [`dplyr::rename`] after the slide." + "input data `x` unexpectedly has 0 rows", + "i" = "If this computation is occuring within an `epix_slide` call, + check that `epix_slide` `ref_time_values` argument was set appropriately" ), - class = "epiprocess__epi_slide_opt__name_sep_not_supported" + class = "epiprocess__epi_slide_opt__0_row_input", + epiprocess__x = .x ) } @@ -480,16 +477,12 @@ epi_slide_opt <- function(x, col_names, f, ..., before = NULL, after = NULL, ref # locally). if (any(map_lgl( list(frollmean, frollsum, frollapply), - function(roll_fn) { - identical(f, roll_fn) - } + ~ identical(.f, .x) ))) { f_from_package <- "data.table" } else if (any(map_lgl( list(slide_sum, slide_prod, slide_mean, slide_min, slide_max, slide_all, slide_any), - function(roll_fn) { - identical(f, roll_fn) - } + ~ identical(.f, .x) ))) { f_from_package <- "slider" } else { @@ -503,55 +496,71 @@ epi_slide_opt <- function(x, col_names, f, ..., before = NULL, after = NULL, ref etc. See `?slider::\`summary-slide\`` for more options)." ), class = "epiprocess__epi_slide_opt__unsupported_slide_function", - epiprocess__f = f + epiprocess__f = .f ) } - user_provided_rtvs <- !is.null(ref_time_values) + user_provided_rtvs <- !is.null(.ref_time_values) if (!user_provided_rtvs) { - ref_time_values <- unique(x$time_value) + .ref_time_values <- unique(.x$time_value) } else { - assert_numeric(ref_time_values, min.len = 1L, null.ok = FALSE, any.missing = FALSE) - if (!test_subset(ref_time_values, unique(x$time_value))) { + assert_numeric(.ref_time_values, min.len = 1L, null.ok = FALSE, any.missing = FALSE) + if (!test_subset(.ref_time_values, unique(.x$time_value))) { cli_abort( "`ref_time_values` must be a unique subset of the time values in `x`.", class = "epi_slide_opt__invalid_ref_time_values" ) } - if (anyDuplicated(ref_time_values) != 0L) { + if (anyDuplicated(.ref_time_values) != 0L) { cli_abort( "`ref_time_values` must not contain any duplicates; use `unique` if appropriate.", class = "epi_slide_opt__invalid_ref_time_values" ) } } - ref_time_values <- sort(ref_time_values) - - # Handle defaults for before/after - time_type <- attr(x, "metadata")$time_type - if (is.null(before) && !is.null(after)) { - if (inherits(after, "difftime")) { - before <- as.difftime(0, units = units(after)) + ref_time_values <- sort(.ref_time_values) + + # Handle window arguments + align <- rlang::arg_match(.align) + time_type <- attr(.x, "metadata")$time_type + validate_slide_window_arg(.window_size, time_type) + if (identical(.window_size, Inf)) { + if (align == "right") { + before <- Inf + if (time_type %in% c("day", "week")) { + after <- as.difftime(0, units = glue::glue("{time_type}s")) + } else { + after <- 0 + } } else { - before <- 0 + cli_abort( + "`epi_slide`: center and left alignment are not supported with an infinite window size." + ) } - } - if (is.null(after) && !is.null(before)) { - if (inherits(before, "difftime")) { - after <- as.difftime(0, units = units(before)) - } else { - if (identical(before, Inf) && time_type %in% c("day", "week")) { + } else { + if (align == "right") { + before <- .window_size - 1 + if (time_type %in% c("day", "week")) { after <- as.difftime(0, units = glue::glue("{time_type}s")) } else { after <- 0 } + } else if (align == "center") { + # For .window_size = 5, before = 2, after = 2. For .window_size = 4, before = 2, after = 1. + before <- floor(.window_size / 2) + after <- .window_size - before - 1 + } else if (align == "left") { + if (time_type %in% c("day", "week")) { + before <- as.difftime(0, units = glue::glue("{time_type}s")) + } else { + before <- 0 + } + after <- .window_size - 1 } } - validate_slide_window_arg(before, time_type) - validate_slide_window_arg(after, time_type, allow_inf = FALSE) # Make a complete date sequence between min(x$time_value) and max(x$time_value). - date_seq_list <- full_date_seq(x, before, after, time_type) + date_seq_list <- full_date_seq(.x, before, after, time_type) all_dates <- date_seq_list$all_dates pad_early_dates <- date_seq_list$pad_early_dates pad_late_dates <- date_seq_list$pad_late_dates @@ -562,12 +571,12 @@ epi_slide_opt <- function(x, col_names, f, ..., before = NULL, after = NULL, ref # positions of user-provided `col_names` into string column names. We avoid # using `names(pos)` directly for robustness and in case we later want to # allow users to rename fields via tidyselection. - if (class(quo_get_expr(enquo(col_names))) == "character") { - pos <- eval_select(all_of(col_names), data = x, allow_rename = FALSE) + if (class(quo_get_expr(enquo(.col_names))) == "character") { + pos <- eval_select(dplyr::all_of(.col_names), data = .x, allow_rename = FALSE) } else { - pos <- eval_select(enquo(col_names), data = x, allow_rename = FALSE) + pos <- eval_select(enquo(.col_names), data = .x, allow_rename = FALSE) } - col_names_chr <- names(x)[pos] + col_names_chr <- names(.x)[pos] # Always rename results to "slide_value_". result_col_names <- paste0("slide_value_", col_names_chr) slide_one_grp <- function(.data_group, .group_key, ...) { @@ -622,10 +631,10 @@ epi_slide_opt <- function(x, col_names, f, ..., before = NULL, after = NULL, ref # be; shift results to the left by `after` timesteps. if (before != Inf) { window_size <- before + after + 1L - roll_output <- f(x = .data_group[, col_names_chr], n = window_size, ...) + roll_output <- .f(x = .data_group[, col_names_chr], n = window_size, ...) } else { window_size <- list(seq_along(.data_group$time_value)) - roll_output <- f(x = .data_group[, col_names_chr], n = window_size, adaptive = TRUE, ...) + roll_output <- .f(x = .data_group[, col_names_chr], n = window_size, adaptive = TRUE, ...) } if (after >= 1) { .data_group[, result_col_names] <- purrr::map(roll_output, function(.x) { @@ -637,7 +646,7 @@ epi_slide_opt <- function(x, col_names, f, ..., before = NULL, after = NULL, ref } if (f_from_package == "slider") { for (i in seq_along(col_names_chr)) { - .data_group[, result_col_names[i]] <- f( + .data_group[, result_col_names[i]] <- .f( x = .data_group[[col_names_chr[i]]], before = as.numeric(before), after = as.numeric(after), @@ -649,13 +658,13 @@ epi_slide_opt <- function(x, col_names, f, ..., before = NULL, after = NULL, ref return(.data_group) } - result <- mutate(x, .real = TRUE) %>% + result <- mutate(.x, .real = TRUE) %>% group_modify(slide_one_grp, ..., .keep = FALSE) result <- result[result$.real, ] result$.real <- NULL - if (all_rows) { + if (.all_rows) { result[!(result$time_value %in% ref_time_values), result_col_names] <- NA } else if (user_provided_rtvs) { result <- result[result$time_value %in% ref_time_values, ] @@ -664,7 +673,7 @@ epi_slide_opt <- function(x, col_names, f, ..., before = NULL, after = NULL, ref if (!is_epi_df(result)) { # `all_rows`handling strips epi_df format and metadata. # Restore them. - result <- reclass(result, attributes(x)$metadata) + result <- reclass(result, attributes(.x)$metadata) } return(result) @@ -676,14 +685,14 @@ epi_slide_opt <- function(x, col_names, f, ..., before = NULL, after = NULL, ref #' vignette](https://cmu-delphi.github.io/epiprocess/articles/slide.html) for #' examples. #' -#' Wrapper around `epi_slide_opt` with `f = datatable::frollmean`. +#' Wrapper around `epi_slide_opt` with `.f = datatable::frollmean`. #' #' @template basic-slide-params #' @template opt-slide-params -#' @param ... Additional arguments to pass to `data.table::frollmean`, for -#' example, `na.rm` and `algo`. `data.table::frollmean` is automatically -#' passed the data `x` to operate on, the window size `n`, and the alignment -#' `align`. Providing these args via `...` will cause an error. +#' @param ... Additional arguments to pass to the slide computation `.f`, for +#' example, `algo` or `na.rm` in data.table functions. You don't need to +#' specify `.x`, `.window_size`, or `.align` (or `before`/`after` for slider +#' functions). #' #' @template opt-slide-details #' @@ -693,7 +702,7 @@ epi_slide_opt <- function(x, col_names, f, ..., before = NULL, after = NULL, ref #' # slide a 7-day trailing average formula on cases #' jhu_csse_daily_subset %>% #' group_by(geo_value) %>% -#' epi_slide_mean(cases, before = 6) %>% +#' epi_slide_mean(cases, .window_size = 7) %>% #' # Remove a nonessential var. to ensure new col is printed #' dplyr::select(geo_value, time_value, cases, cases_7dav = slide_value_cases) %>% #' ungroup() @@ -704,7 +713,7 @@ epi_slide_opt <- function(x, col_names, f, ..., before = NULL, after = NULL, ref #' group_by(geo_value) %>% #' epi_slide_mean( #' cases, -#' before = 6, +#' .window_size = 7, #' # `frollmean` options #' na.rm = TRUE, algo = "exact", hasNA = TRUE #' ) %>% @@ -714,41 +723,79 @@ epi_slide_opt <- function(x, col_names, f, ..., before = NULL, after = NULL, ref #' # slide a 7-day leading average #' jhu_csse_daily_subset %>% #' group_by(geo_value) %>% -#' epi_slide_mean(cases, after = 6) %>% +#' epi_slide_mean(cases, .window_size = 7, .align = "right") %>% #' # Remove a nonessential var. to ensure new col is printed #' dplyr::select(geo_value, time_value, cases, cases_7dav = slide_value_cases) %>% #' ungroup() #' -#' # slide a 7-day centre-aligned average +#' # slide a 7-day center-aligned average #' jhu_csse_daily_subset %>% #' group_by(geo_value) %>% -#' epi_slide_mean(cases, before = 3, after = 3) %>% +#' epi_slide_mean(cases, .window_size = 7, .align = "center") %>% #' # Remove a nonessential var. to ensure new col is printed #' dplyr::select(geo_value, time_value, cases, cases_7dav = slide_value_cases) %>% #' ungroup() #' -#' # slide a 14-day centre-aligned average +#' # slide a 14-day center-aligned average #' jhu_csse_daily_subset %>% #' group_by(geo_value) %>% -#' epi_slide_mean(cases, before = 6, after = 7) %>% +#' epi_slide_mean(cases, .window_size = 14, .align = "center") %>% #' # Remove a nonessential var. to ensure new col is printed #' dplyr::select(geo_value, time_value, cases, cases_14dav = slide_value_cases) %>% #' ungroup() -epi_slide_mean <- function(x, col_names, ..., before = NULL, after = NULL, ref_time_values = NULL, - new_col_name = NULL, all_rows = FALSE, - as_list_col = deprecated(), names_sep = NULL) { +epi_slide_mean <- function( + .x, .col_names, ..., + .window_size = 0, .align = c("right", "center", "left"), + .ref_time_values = NULL, .all_rows = FALSE) { + # Argument deprecation handling + provided_args <- rlang::call_args_names(rlang::call_match()) + if (any(purrr::map_lgl(provided_args, ~ .x %in% c("x", "col_names", "f", "ref_time_values", "all_rows")))) { + cli::cli_abort( + "epi_slide_mean: you are using one of the following old argument names: `x`, `col_names`, `f`, `ref_time_values`, + or `all_rows`. Please use the new dot-prefixed names: `.x`, `.col_names`, `.f`, + `.ref_time_values`, `.all_rows`." + ) + } + if ("as_list_col" %in% provided_args) { + cli::cli_abort( + "epi_slide_mean: the argument `as_list_col` is deprecated. If FALSE, you can just remove it. + If TRUE, have your given computation wrap its result using `list(result)` instead." + ) + } + if ("names_sep" %in% provided_args) { + cli::cli_abort( + "epi_slide_mean: the argument `names_sep` is deprecated. If NULL, you can remove it, it is now default. + If a string, please manually prefix your column names instead." + ) + } + if ("before" %in% provided_args || "after" %in% provided_args) { + cli::cli_abort( + "epi_slide_mean: `before` and `after` are deprecated for `epi_slide`. Use `.window_size` and `.align` instead. + See the slide documentation for more details." + ) + } + if ("new_col_name" %in% provided_args || ".new_col_name" %in% provided_args) { + cli::cli_abort( + "epi_slide_mean: the argument `new_col_name` is not supported. If you want to customize + the output column names, use `dplyr::rename` after the slide." + ) + } + if ("names_sep" %in% provided_args || ".names_sep" %in% provided_args) { + cli::cli_abort( + "epi_slide_mean: the argument `names_sep` is not supported. If you want to customize + the output column names, use `dplyr::rename` after the slide." + ) + } + epi_slide_opt( - x = x, - col_names = {{ col_names }}, - f = data.table::frollmean, + .x = .x, + .col_names = {{ .col_names }}, + .f = data.table::frollmean, ..., - before = before, - after = after, - ref_time_values = ref_time_values, - new_col_name = new_col_name, - as_list_col = as_list_col, - names_sep = names_sep, - all_rows = all_rows + .window_size = .window_size, + .align = .align, + .ref_time_values = .ref_time_values, + .all_rows = .all_rows ) } @@ -758,14 +805,14 @@ epi_slide_mean <- function(x, col_names, ..., before = NULL, after = NULL, ref_t #' vignette](https://cmu-delphi.github.io/epiprocess/articles/slide.html) for #' examples. #' -#' Wrapper around `epi_slide_opt` with `f = datatable::frollsum`. +#' Wrapper around `epi_slide_opt` with `.f = datatable::frollsum`. #' #' @template basic-slide-params #' @template opt-slide-params -#' @param ... Additional arguments to pass to `data.table::frollsum`, for -#' example, `na.rm` and `algo`. `data.table::frollsum` is automatically -#' passed the data `x` to operate on, the window size `n`, and the alignment -#' `align`. Providing these args via `...` will cause an error. +#' @param ... Additional arguments to pass to the slide computation `.f`, for +#' example, `algo` or `na.rm` in data.table functions. You don't need to +#' specify `.x`, `.window_size`, or `.align` (or `before`/`after` for slider +#' functions). #' #' @template opt-slide-details #' @@ -775,27 +822,62 @@ epi_slide_mean <- function(x, col_names, ..., before = NULL, after = NULL, ref_t #' # slide a 7-day trailing sum formula on cases #' jhu_csse_daily_subset %>% #' group_by(geo_value) %>% -#' epi_slide_sum(cases, before = 6) %>% +#' epi_slide_sum(cases, .window_size = 7) %>% #' # Remove a nonessential var. to ensure new col is printed #' dplyr::select(geo_value, time_value, cases, cases_7dsum = slide_value_cases) %>% #' ungroup() -epi_slide_sum <- function(x, col_names, ..., before = NULL, after = NULL, ref_time_values = NULL, - new_col_name = NULL, - all_rows = FALSE, - as_list_col = deprecated(), - names_sep = NULL) { +epi_slide_sum <- function( + .x, .col_names, ..., + .window_size = 0, .align = c("right", "center", "left"), + .ref_time_values = NULL, .all_rows = FALSE) { + # Argument deprecation handling + provided_args <- rlang::call_args_names(rlang::call_match()) + if (any(purrr::map_lgl(provided_args, ~ .x %in% c("x", "col_names", "f", "ref_time_values", "all_rows")))) { + cli::cli_abort( + "epi_slide_sum: you are using one of the following old argument names: `x`, `col_names`, `f`, `ref_time_values`, + or `all_rows`. Please use the new dot-prefixed names: `.x`, `.col_names`, `.f`, + `.ref_time_values`, `.all_rows`." + ) + } + if ("as_list_col" %in% provided_args) { + cli::cli_abort( + "epi_slide_sum: the argument `as_list_col` is deprecated. If FALSE, you can just remove it. + If TRUE, have your given computation wrap its result using `list(result)` instead." + ) + } + if ("names_sep" %in% provided_args) { + cli::cli_abort( + "epi_slide_sum: the argument `names_sep` is deprecated. If NULL, you can remove it, it is now default. + If a string, please manually prefix your column names instead." + ) + } + if ("before" %in% provided_args || "after" %in% provided_args) { + cli::cli_abort( + "epi_slide_sum: `before` and `after` are deprecated for `epi_slide`. Use `.window_size` and `.align` instead. + See the slide documentation for more details." + ) + } + if ("new_col_name" %in% provided_args || ".new_col_name" %in% provided_args) { + cli::cli_abort( + "epi_slide_sum: the argument `new_col_name` is not supported. If you want to customize + the output column names, use `dplyr::rename` after the slide." + ) + } + if ("names_sep" %in% provided_args || ".names_sep" %in% provided_args) { + cli::cli_abort( + "epi_slide_sum: the argument `names_sep` is not supported. If you want to customize + the output column names, use `dplyr::rename` after the slide." + ) + } epi_slide_opt( - x = x, - col_names = {{ col_names }}, - f = data.table::frollsum, + .x = .x, + .col_names = {{ .col_names }}, + .f = data.table::frollsum, ..., - before = before, - after = after, - ref_time_values = ref_time_values, - new_col_name = new_col_name, - as_list_col = as_list_col, - names_sep = names_sep, - all_rows = all_rows + .window_size = .window_size, + .align = .align, + .ref_time_values = .ref_time_values, + .all_rows = .all_rows ) } diff --git a/man-roxygen/basic-slide-details.R b/man-roxygen/basic-slide-details.R index 4f606311..64570976 100644 --- a/man-roxygen/basic-slide-details.R +++ b/man-roxygen/basic-slide-details.R @@ -1,39 +1,53 @@ #' @details To "slide" means to apply a function or formula over a rolling -#' window of time steps for each data group, where the window is centered at a -#' reference time and left and right endpoints are given by the `before` and -#' `after` arguments. -#' -#' If there are not enough time steps available to complete the window at any -#' given reference time, then `epi_slide()` still attempts to perform the -#' computation anyway (it does not require a complete window). The issue of -#' what to do with partial computations (those run on incomplete windows) is -#' therefore left up to the user, either through the specified function or -#' formula `f`, or through post-processing. For a centrally-aligned slide of -#' `n` `time_value`s in a sliding window, set `before = (n-1)/2` and `after = -#' (n-1)/2` when the number of `time_value`s in a sliding window is odd and -#' `before = n/2-1` and `after = n/2` when `n` is even. -#' -#' Sometimes, we want to experiment with various trailing or leading window -#' widths and compare the slide outputs. In the (uncommon) case where -#' zero-width windows are considered, manually pass both the `before` and -#' `after` arguments. -#' -#' If `f` is missing, then ["data-masking"][rlang::args_data_masking] +#' window. The `.window_size` arg determines the width of the window +#' (including the reference time) and the `.align` arg governs how the window +#' is aligned (see below for examples). The `.ref_time_values` arg controls +#' which time values to consider for the slide and `.all_rows` allows you to +#' keep NAs around. +#' +#' `epi_slide()` does not require a complete window (such as on the left +#' boundary of the dataset) and will attempt to perform the computation +#' anyway. The issue of what to do with partial computations (those run on +#' incomplete windows) is therefore left up to the user, either through the +#' specified function or formula `f`, or through post-processing. +#' +#' Let's look at some window examples, assuming that the reference time value +#' is "tv". With .align = "right" and .window_size = 3, the window will be: +#' +#' time_values: tv - 3, tv - 2, tv - 1, tv, tv + 1, tv + 2, tv + 3 +#' window: tv - 2, tv - 1, tv +#' +#' With .align = "center" and .window_size = 3, the window will be: +#' +#' time_values: tv - 3, tv - 2, tv - 1, tv, tv + 1, tv + 2, tv + 3 +#' window: tv - 1, tv, tv + 1 +#' +#' With .align = "center" and .window_size = 4, the window will be: +#' +#' time_values: tv - 3, tv - 2, tv - 1, tv, tv + 1, tv + 2, tv + 3 +#' window: tv - 2, tv - 1, tv, tv + 1 +#' +#' With .align = "left" and .window_size = 3, the window will be: +#' +#' time_values: ttv - 3, tv - 2, tv - 1, tv, tv + 1, tv + 2, tv + 3 +#' window: tv, tv + 1, tv + 2 +#' +#' If `.f` is missing, then ["data-masking"][rlang::args_data_masking] #' expression(s) for tidy evaluation can be specified, for example, as in: #' ``` -#' epi_slide(x, cases_7dav = mean(cases), before = 6) +#' epi_slide(x, cases_7dav = mean(cases), .window_size = 7) #' ``` #' which would be equivalent to: #' ``` -#' epi_slide(x, function(x, g, t) mean(x$cases), before = 6, -#' new_col_name = "cases_7dav") +#' epi_slide(x, function(x, g, t) mean(x$cases), .window_size = 7, +#' .new_col_name = "cases_7dav") #' ``` #' In a manner similar to [`dplyr::mutate`]: #' * Expressions evaluating to length-1 vectors will be recycled to #' appropriate lengths. #' * `, name_var := value` can be used to set the output column name based on #' a variable `name_var` rather than requiring you to use a hard-coded -#' name. (The leading comma is needed to make sure that `f` is treated as +#' name. (The leading comma is needed to make sure that `.f` is treated as #' missing.) #' * `= NULL` can be used to remove results from previous expressions (though #' we don't allow it to remove pre-existing columns). @@ -51,5 +65,5 @@ #' won't let you refer to the output of the earlier expressions, but `.data` #' will. #' * .group_key, which is like `.y` in [`dplyr::group_modify`]. -#' * .ref_time_value, which is the element of `ref_time_values` that +#' * .ref_time_value, which is the element of `.ref_time_values` that #' determined the time window for the current computation. diff --git a/man-roxygen/basic-slide-params.R b/man-roxygen/basic-slide-params.R index f556f540..8a63a817 100644 --- a/man-roxygen/basic-slide-params.R +++ b/man-roxygen/basic-slide-params.R @@ -1,52 +1,35 @@ -#' @param x The `epi_df` object under consideration, [grouped][dplyr::group_by] -#' or ungrouped. If ungrouped, all data in `x` will be treated as part of a +#' @param .x The `epi_df` object under consideration, [grouped][dplyr::group_by] +#' or ungrouped. If ungrouped, all data in `.x` will be treated as part of a #' single data group. -#' @param before,after How far `before` and `after` each `ref_time_value` should -#' the sliding window extend? At least one of these two arguments must be -#' provided; the other's default will be 0. The accepted values for these -#' depend on the type of the `time_value` column: +#' @param .window_size The size of the sliding window. By default, this is 1, +#' meaning that only the current ref_time_value is included. The accepted values +#' here depend on the `time_value` column: #' -#' - if it is a Date and the cadence is daily, then they can be integers -#' (which will be interpreted in units of days) or difftimes with units -#' "days" -#' - if it is a Date and the cadence is weekly, then they must be difftimes -#' with units "weeks" -#' - if it is an integer, then they must be integers +#' - if time_type is Date and the cadence is daily, then `.window_size` can be +#' an integer (which will be interpreted in units of days) or a difftime +#' with units "days" +#' - if time_type is Date and the cadence is weekly, then `.window_size` must +#' be a difftime with units "weeks" +#' - if time_type is an integer, then `.window_size` must be an integer #' -#' Endpoints of the window are inclusive. Common settings: -#' -#' - For trailing/right-aligned windows from `ref_time_value - k` to -#' `ref_time_value`: either pass `before=k` by itself, or pass `before=k, -#' after=0`. -#' - For center-aligned windows from `ref_time_value - k` to -#' `ref_time_value + k`: pass `before=k, after=k`. -#' - For leading/left-aligned windows from `ref_time_value` to -#' `ref_time_value + k`: either pass pass `after=k` by itself, -#' or pass `before=0, after=k`. -#' -#' See "Details:" on how missing rows are handled within the window. -#' @param ref_time_values Time values for sliding computations, meaning, each +#' @param .align The alignment of the sliding window. If `right` (default), then +#' the window has its end at the reference time; if `center`, then the window is +#' centered at the reference time; if `left`, then the window has its start at +#' the reference time. If the alignment is `center` and the window size is odd, +#' then the window will have floor(window_size/2) points before and after the +#' reference time. If the window size is even, then the window will be +#' asymmetric and have one less value on the right side of the reference time +#' (assuming time increases from left to right). +#' @param .ref_time_values Time values for sliding computations, meaning, each #' element of this vector serves as the reference time point for one sliding #' window. If missing, then this will be set to all unique time values in the #' underlying data table, by default. -#' @param all_rows If `all_rows = TRUE`, then all rows of `x` will be kept in -#' the output even with `ref_time_values` provided, with some type of missing +#' @param .all_rows If `.all_rows = TRUE`, then all rows of `.x` will be kept in +#' the output even with `.ref_time_values` provided, with some type of missing #' value marker for the slide computation output column(s) for `time_value`s -#' outside `ref_time_values`; otherwise, there will be one row for each row in -#' `x` that had a `time_value` in `ref_time_values`. Default is `FALSE`. The +#' outside `.ref_time_values`; otherwise, there will be one row for each row in +#' `.x` that had a `time_value` in `.ref_time_values`. Default is `FALSE`. The #' missing value marker is the result of `vctrs::vec_cast`ing `NA` to the type -#' of the slide computation output. If using `as_list_col = TRUE`, note that -#' the missing marker is a `NULL` entry in the list column; for certain -#' operations, you might want to replace these `NULL` entries with a different -#' `NA` marker. -#' @param as_list_col `r lifecycle::badge("deprecated")` if you want a list -#' column as output, you can now just directly output a list from your slide -#' computations. Usually this just means wrapping your output in a length-1 -#' list (outputting `list(result)` instead of `result`). -#' @param names_sep `r lifecycle::badge("deprecated")` if you were specifying -#' `names_sep = NULL`, that's no longer needed. If you were using a non-NULL -#' value, you can either directly prefix your slide computation names, or -#' output a list and then later call `tidyr::unnest(slide_output, -#' , names_sep = )`. -#' @return An `epi_df` object given by appending one or more new columns to `x`, -#' named according to the `new_col_name` argument. +#' of the slide computation output. +#' @return An `epi_df` object given by appending one or more new columns to `.x`, +#' named according to the `.new_col_name` argument. diff --git a/man-roxygen/opt-slide-details.R b/man-roxygen/opt-slide-details.R index 5e8876d2..f78a33db 100644 --- a/man-roxygen/opt-slide-details.R +++ b/man-roxygen/opt-slide-details.R @@ -1,16 +1,33 @@ -#' @details To "slide" means to apply a function over a rolling window of time -#' steps for each data group, where the window is centered at a reference time -#' and left and right endpoints are given by the `before` and `after` -#' arguments. - -#' If there are not enough time steps available to complete the window at any -#' given reference time, then `epi_slide_*()` will fail; it requires a -#' complete window to perform the computation. For a centrally-aligned slide -#' of `n` `time_value`s in a sliding window, set `before = (n-1)/2` and `after -#' = (n-1)/2` when the number of `time_value`s in a sliding window is odd and -#' `before = n/2-1` and `after = n/2` when `n` is even. -#' -#' Sometimes, we want to experiment with various trailing or leading window -#' widths and compare the slide outputs. In the (uncommon) case where -#' zero-width windows are considered, manually pass both the `before` and -#' `after` arguments. +#' @details To "slide" means to apply a function or formula over a rolling +#' window. The `.window_size` arg determines the width of the window +#' (including the reference time) and the `.align` arg governs how the window +#' is aligned (see below for examples). The `.ref_time_values` arg controls +#' which time values to consider for the slide and `.all_rows` allows you to +#' keep NAs around. +#' +#' `epi_slide_*()` does not require a complete window (such as on the left +#' boundary of the dataset) and will attempt to perform the computation +#' anyway. The issue of what to do with partial computations (those run on +#' incomplete windows) is therefore left up to the user, either through the +#' specified function or formula `f`, or through post-processing. +#' +#' Let's look at some window examples, assuming that the reference time value +#' is `tv`. With .align = "right" and .window_size = 3, the window will be: +#' +#' time_values: tv - 4, tv - 3, tv - 2, tv - 1, tv, tv + 1, tv + 2, tv + 3 +#' window: [tv - 2, tv - 1, tv] +#' +#' With .align = "center" and .window_size = 3, the window will be: +#' +#' time_values: tv - 4, tv - 3, tv - 2, tv - 1, tv, tv + 1, tv + 2, tv + 3 +#' window: [tv - 1, tv, tv + 1] +#' +#' With .align = "center" and .window_size = 4, the window will be: +#' +#' time_values: tv - 4, tv - 3, tv - 2, tv - 1, tv, tv + 1, tv + 2, tv + 3 +#' window: [tv - 2, tv - 1, tv, tv + 1] +#' +#' With .align = "left" and .window_size = 3, the window will be: +#' +#' time_values: tv - 4, tv - 3, tv - 2, tv - 1, tv, tv + 1, tv + 2, tv + 3 +#' window: [tv, tv + 1, tv + 2] diff --git a/man-roxygen/opt-slide-params.R b/man-roxygen/opt-slide-params.R index 151b4f86..ba4b4877 100644 --- a/man-roxygen/opt-slide-params.R +++ b/man-roxygen/opt-slide-params.R @@ -1,4 +1,4 @@ -#' @param col_names <[`tidy-select`][dplyr_tidy_select]> An unquoted column +#' @param .col_names <[`tidy-select`][dplyr_tidy_select]> An unquoted column #' name(e.g., `cases`), multiple column names (e.g., `c(cases, deaths)`), #' [other tidy-select expression][tidyselect::language], or a vector of #' characters (e.g. `c("cases", "deaths")`). Variable names can be used as if @@ -8,6 +8,3 @@ #' The tidy-selection renaming interface is not supported, and cannot be used #' to provide output column names; if you want to customize the output column #' names, use [`dplyr::rename`] after the slide. -#' @param as_list_col Not supported. Included to match `epi_slide` interface. -#' @param new_col_name Not supported. Included to match `epi_slide` interface. -#' @param names_sep Not supported. Included to match `epi_slide` interface. diff --git a/man/epi_slide.Rd b/man/epi_slide.Rd index 9eea2442..fc675071 100644 --- a/man/epi_slide.Rd +++ b/man/epi_slide.Rd @@ -5,107 +5,85 @@ \title{Slide a function over variables in an \code{epi_df} object} \usage{ epi_slide( - x, - f, + .x, + .f, ..., - before = NULL, - after = NULL, - ref_time_values = NULL, - new_col_name = NULL, - all_rows = FALSE, - as_list_col = deprecated(), - names_sep = deprecated() + .window_size = 1, + .align = c("right", "center", "left"), + .ref_time_values = NULL, + .new_col_name = NULL, + .all_rows = FALSE ) } \arguments{ -\item{x}{The \code{epi_df} object under consideration, \link[dplyr:group_by]{grouped} -or ungrouped. If ungrouped, all data in \code{x} will be treated as part of a +\item{.x}{The \code{epi_df} object under consideration, \link[dplyr:group_by]{grouped} +or ungrouped. If ungrouped, all data in \code{.x} will be treated as part of a single data group.} -\item{f}{Function, formula, or missing; together with \code{...} specifies the +\item{.f}{Function, formula, or missing; together with \code{...} specifies the computation to slide. To "slide" means to apply a computation within a sliding (a.k.a. "rolling") time window for each data group. The window is determined by the \code{before} and \code{after} parameters described below. One time step is typically one day or one week; see details for more explanation. If -a function, \code{f} must take a data frame with the same column names as -the original object, minus any grouping variables, containing the time -window data for one group-\code{ref_time_value} combination; followed by a -one-row tibble containing the values of the grouping variables for the -associated group; followed by any number of named arguments. If a formula, -\code{f} can operate directly on columns accessed via \code{.x$var} or \code{.$var}, as -in \code{~mean(.x$var)} to compute a mean of a column \code{var} for each +a function, \code{.f} must take a data frame with the same column names as the +original object, minus any grouping variables, containing the time window +data for one group-\code{.ref_time_value} combination; followed by a one-row +tibble containing the values of the grouping variables for the associated +group; followed by any number of named arguments. If a formula, \code{.f} can +operate directly on columns accessed via \code{.x$var} or \code{.$var}, as in +\code{~mean(.x$var)} to compute a mean of a column \code{var} for each \code{ref_time_value}-group combination. The group key can be accessed via \code{.y}. -If \code{f} is missing, then \code{...} will specify the computation.} +If \code{.f} is missing, then \code{...} will specify the computation.} \item{...}{Additional arguments to pass to the function or formula specified -via \code{f}. Alternatively, if \code{f} is missing, then the \code{...} is interpreted as -a \link[rlang:args_data_masking]{"data-masking"} expression or expressions for -tidy evaluation; in addition to referring columns directly by name, the +via \code{.f}. Alternatively, if \code{.f} is missing, then the \code{...} is interpreted +as a \link[rlang:args_data_masking]{"data-masking"} expression or expressions +for tidy evaluation; in addition to referring columns directly by name, the expressions have access to \code{.data} and \code{.env} pronouns as in \code{dplyr} verbs, and can also refer to \code{.x}, \code{.group_key}, and \code{.ref_time_value}. See details.} -\item{before, after}{How far \code{before} and \code{after} each \code{ref_time_value} should -the sliding window extend? At least one of these two arguments must be -provided; the other's default will be 0. The accepted values for these -depend on the type of the \code{time_value} column: +\item{.window_size}{The size of the sliding window. By default, this is 1, +meaning that only the current ref_time_value is included. The accepted values +here depend on the \code{time_value} column: \itemize{ -\item if it is a Date and the cadence is daily, then they can be integers -(which will be interpreted in units of days) or difftimes with units -"days" -\item if it is a Date and the cadence is weekly, then they must be difftimes -with units "weeks" -\item if it is an integer, then they must be integers -} - -Endpoints of the window are inclusive. Common settings: -\itemize{ -\item For trailing/right-aligned windows from \code{ref_time_value - k} to -\code{ref_time_value}: either pass \code{before=k} by itself, or pass \verb{before=k, after=0}. -\item For center-aligned windows from \code{ref_time_value - k} to -\code{ref_time_value + k}: pass \verb{before=k, after=k}. -\item For leading/left-aligned windows from \code{ref_time_value} to -\code{ref_time_value + k}: either pass pass \code{after=k} by itself, -or pass \verb{before=0, after=k}. -} - -See "Details:" on how missing rows are handled within the window.} - -\item{ref_time_values}{Time values for sliding computations, meaning, each +\item if time_type is Date and the cadence is daily, then \code{.window_size} can be +an integer (which will be interpreted in units of days) or a difftime +with units "days" +\item if time_type is Date and the cadence is weekly, then \code{.window_size} must +be a difftime with units "weeks" +\item if time_type is an integer, then \code{.window_size} must be an integer +}} + +\item{.align}{The alignment of the sliding window. If \code{right} (default), then +the window has its end at the reference time; if \code{center}, then the window is +centered at the reference time; if \code{left}, then the window has its start at +the reference time. If the alignment is \code{center} and the window size is odd, +then the window will have floor(window_size/2) points before and after the +reference time. If the window size is even, then the window will be +asymmetric and have one less value on the right side of the reference time +(assuming time increases from left to right).} + +\item{.ref_time_values}{Time values for sliding computations, meaning, each element of this vector serves as the reference time point for one sliding window. If missing, then this will be set to all unique time values in the underlying data table, by default.} -\item{new_col_name}{String indicating the name of the new column that will -contain the derivative values. The default is "slide_value" unless your -slide computations output data frames, in which case they will be unpacked -into the constituent columns and those names used. Note that setting +\item{.new_col_name}{String indicating the name of the new column that will +contain the derivative values. Default is "slide_value"; note that setting \code{new_col_name} equal to an existing column name will overwrite this column.} -\item{all_rows}{If \code{all_rows = TRUE}, then all rows of \code{x} will be kept in -the output even with \code{ref_time_values} provided, with some type of missing +\item{.all_rows}{If \code{.all_rows = TRUE}, then all rows of \code{.x} will be kept in +the output even with \code{.ref_time_values} provided, with some type of missing value marker for the slide computation output column(s) for \code{time_value}s -outside \code{ref_time_values}; otherwise, there will be one row for each row in -\code{x} that had a \code{time_value} in \code{ref_time_values}. Default is \code{FALSE}. The +outside \code{.ref_time_values}; otherwise, there will be one row for each row in +\code{.x} that had a \code{time_value} in \code{.ref_time_values}. Default is \code{FALSE}. The missing value marker is the result of \code{vctrs::vec_cast}ing \code{NA} to the type -of the slide computation output. If using \code{as_list_col = TRUE}, note that -the missing marker is a \code{NULL} entry in the list column; for certain -operations, you might want to replace these \code{NULL} entries with a different -\code{NA} marker.} - -\item{as_list_col}{\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#deprecated}{\figure{lifecycle-deprecated.svg}{options: alt='[Deprecated]'}}}{\strong{[Deprecated]}} if you want a list -column as output, you can now just directly output a list from your slide -computations. Usually this just means wrapping your output in a length-1 -list (outputting \code{list(result)} instead of \code{result}).} - -\item{names_sep}{\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#deprecated}{\figure{lifecycle-deprecated.svg}{options: alt='[Deprecated]'}}}{\strong{[Deprecated]}} if you were specifying -\code{names_sep = NULL}, that's no longer needed. If you were using a non-NULL -value, you can either directly prefix your slide computation names, or -output a list and then later call \verb{tidyr::unnest(slide_output, , names_sep = )}.} +of the slide computation output.} } \value{ -An \code{epi_df} object given by appending one or more new columns to \code{x}, -named according to the \code{new_col_name} argument. +An \code{epi_df} object given by appending one or more new columns to \code{.x}, +named according to the \code{.new_col_name} argument. } \description{ Slides a given function over variables in an \code{epi_df} object. See the @@ -114,34 +92,49 @@ for examples. } \details{ To "slide" means to apply a function or formula over a rolling -window of time steps for each data group, where the window is centered at a -reference time and left and right endpoints are given by the \code{before} and -\code{after} arguments. - -If there are not enough time steps available to complete the window at any -given reference time, then \code{epi_slide()} still attempts to perform the -computation anyway (it does not require a complete window). The issue of -what to do with partial computations (those run on incomplete windows) is -therefore left up to the user, either through the specified function or -formula \code{f}, or through post-processing. For a centrally-aligned slide of -\code{n} \code{time_value}s in a sliding window, set \code{before = (n-1)/2} and \code{after = (n-1)/2} when the number of \code{time_value}s in a sliding window is odd and -\code{before = n/2-1} and \code{after = n/2} when \code{n} is even. - -Sometimes, we want to experiment with various trailing or leading window -widths and compare the slide outputs. In the (uncommon) case where -zero-width windows are considered, manually pass both the \code{before} and -\code{after} arguments. - -If \code{f} is missing, then \link[rlang:args_data_masking]{"data-masking"} +window. The \code{.window_size} arg determines the width of the window +(including the reference time) and the \code{.align} arg governs how the window +is aligned (see below for examples). The \code{.ref_time_values} arg controls +which time values to consider for the slide and \code{.all_rows} allows you to +keep NAs around. + +\code{epi_slide()} does not require a complete window (such as on the left +boundary of the dataset) and will attempt to perform the computation +anyway. The issue of what to do with partial computations (those run on +incomplete windows) is therefore left up to the user, either through the +specified function or formula \code{f}, or through post-processing. + +Let's look at some window examples, assuming that the reference time value +is "tv". With .align = "right" and .window_size = 3, the window will be: + +time_values: tv - 3, tv - 2, tv - 1, tv, tv + 1, tv + 2, tv + 3 +window: tv - 2, tv - 1, tv + +With .align = "center" and .window_size = 3, the window will be: + +time_values: tv - 3, tv - 2, tv - 1, tv, tv + 1, tv + 2, tv + 3 +window: tv - 1, tv, tv + 1 + +With .align = "center" and .window_size = 4, the window will be: + +time_values: tv - 3, tv - 2, tv - 1, tv, tv + 1, tv + 2, tv + 3 +window: tv - 2, tv - 1, tv, tv + 1 + +With .align = "left" and .window_size = 3, the window will be: + +time_values: ttv - 3, tv - 2, tv - 1, tv, tv + 1, tv + 2, tv + 3 +window: tv, tv + 1, tv + 2 + +If \code{.f} is missing, then \link[rlang:args_data_masking]{"data-masking"} expression(s) for tidy evaluation can be specified, for example, as in: -\if{html}{\out{
}}\preformatted{epi_slide(x, cases_7dav = mean(cases), before = 6) +\if{html}{\out{
}}\preformatted{epi_slide(x, cases_7dav = mean(cases), .window_size = 7) }\if{html}{\out{
}} which would be equivalent to: -\if{html}{\out{
}}\preformatted{epi_slide(x, function(x, g, t) mean(x$cases), before = 6, - new_col_name = "cases_7dav") +\if{html}{\out{
}}\preformatted{epi_slide(x, function(x, g, t) mean(x$cases), .window_size = 7, + .new_col_name = "cases_7dav") }\if{html}{\out{
}} In a manner similar to \code{\link[dplyr:mutate]{dplyr::mutate}}: @@ -150,7 +143,7 @@ In a manner similar to \code{\link[dplyr:mutate]{dplyr::mutate}}: appropriate lengths. \item \verb{, name_var := value} can be used to set the output column name based on a variable \code{name_var} rather than requiring you to use a hard-coded -name. (The leading comma is needed to make sure that \code{f} is treated as +name. (The leading comma is needed to make sure that \code{.f} is treated as missing.) \item \verb{= NULL} can be used to remove results from previous expressions (though we don't allow it to remove pre-existing columns). @@ -170,7 +163,7 @@ like \code{\link{.data}}; this allows you to use additional {dplyr}, {tidyr}, an won't let you refer to the output of the earlier expressions, but \code{.data} will. \item .group_key, which is like \code{.y} in \code{\link[dplyr:group_map]{dplyr::group_modify}}. -\item .ref_time_value, which is the element of \code{ref_time_values} that +\item .ref_time_value, which is the element of \code{.ref_time_values} that determined the time window for the current computation. } } @@ -180,32 +173,28 @@ determined the time window for the current computation. # the `epi_slide_mean` and `epi_slide_sum` functions instead. jhu_csse_daily_subset \%>\% group_by(geo_value) \%>\% - epi_slide(cases_7dav = mean(cases), before = 6) \%>\% - # Remove a nonessential var. to ensure new col is printed + epi_slide(cases_7dav = mean(cases), .window_size = 7) \%>\% dplyr::select(geo_value, time_value, cases, cases_7dav) \%>\% ungroup() # slide a 7-day leading average jhu_csse_daily_subset \%>\% group_by(geo_value) \%>\% - epi_slide(cases_7dav = mean(cases), after = 6) \%>\% - # Remove a nonessential var. to ensure new col is printed + epi_slide(cases_7dav = mean(cases), .window_size = 7, .align = "left") \%>\% dplyr::select(geo_value, time_value, cases, cases_7dav) \%>\% ungroup() # slide a 7-day centre-aligned average jhu_csse_daily_subset \%>\% group_by(geo_value) \%>\% - epi_slide(cases_7dav = mean(cases), before = 3, after = 3) \%>\% - # Remove a nonessential var. to ensure new col is printed + epi_slide(cases_7dav = mean(cases), .window_size = 7, .align = "center") \%>\% dplyr::select(geo_value, time_value, cases, cases_7dav) \%>\% ungroup() # slide a 14-day centre-aligned average jhu_csse_daily_subset \%>\% group_by(geo_value) \%>\% - epi_slide(cases_14dav = mean(cases), before = 6, after = 7) \%>\% - # Remove a nonessential var. to ensure new col is printed + epi_slide(cases_14dav = mean(cases), .window_size = 14, .align = "center") \%>\% dplyr::select(geo_value, time_value, cases, cases_14dav) \%>\% ungroup() @@ -213,11 +202,11 @@ jhu_csse_daily_subset \%>\% jhu_csse_daily_subset \%>\% group_by(geo_value) \%>\% epi_slide( - a = data.frame( + cases_2d = list(data.frame( cases_2dav = mean(cases), cases_2dma = mad(cases) - ), - before = 1, as_list_col = TRUE + )), + .window_size = 2 ) \%>\% ungroup() } diff --git a/man/epi_slide_mean.Rd b/man/epi_slide_mean.Rd index 55acad3c..3412f5a3 100644 --- a/man/epi_slide_mean.Rd +++ b/man/epi_slide_mean.Rd @@ -5,24 +5,21 @@ \title{Optimized slide function for performing rolling averages on an \code{epi_df} object} \usage{ epi_slide_mean( - x, - col_names, + .x, + .col_names, ..., - before = NULL, - after = NULL, - ref_time_values = NULL, - new_col_name = NULL, - all_rows = FALSE, - as_list_col = deprecated(), - names_sep = NULL + .window_size = 0, + .align = c("right", "center", "left"), + .ref_time_values = NULL, + .all_rows = FALSE ) } \arguments{ -\item{x}{The \code{epi_df} object under consideration, \link[dplyr:group_by]{grouped} -or ungrouped. If ungrouped, all data in \code{x} will be treated as part of a +\item{.x}{The \code{epi_df} object under consideration, \link[dplyr:group_by]{grouped} +or ungrouped. If ungrouped, all data in \code{.x} will be treated as part of a single data group.} -\item{col_names}{<\code{\link[=dplyr_tidy_select]{tidy-select}}> An unquoted column +\item{.col_names}{<\code{\link[=dplyr_tidy_select]{tidy-select}}> An unquoted column name(e.g., \code{cases}), multiple column names (e.g., \code{c(cases, deaths)}), \link[tidyselect:language]{other tidy-select expression}, or a vector of characters (e.g. \code{c("cases", "deaths")}). Variable names can be used as if @@ -33,90 +30,95 @@ The tidy-selection renaming interface is not supported, and cannot be used to provide output column names; if you want to customize the output column names, use \code{\link[dplyr:rename]{dplyr::rename}} after the slide.} -\item{...}{Additional arguments to pass to \code{data.table::frollmean}, for -example, \code{na.rm} and \code{algo}. \code{data.table::frollmean} is automatically -passed the data \code{x} to operate on, the window size \code{n}, and the alignment -\code{align}. Providing these args via \code{...} will cause an error.} +\item{...}{Additional arguments to pass to the slide computation \code{.f}, for +example, \code{algo} or \code{na.rm} in data.table functions. You don't need to +specify \code{.x}, \code{.window_size}, or \code{.align} (or \code{before}/\code{after} for slider +functions).} -\item{before, after}{How far \code{before} and \code{after} each \code{ref_time_value} should -the sliding window extend? At least one of these two arguments must be -provided; the other's default will be 0. The accepted values for these -depend on the type of the \code{time_value} column: +\item{.window_size}{The size of the sliding window. By default, this is 1, +meaning that only the current ref_time_value is included. The accepted values +here depend on the \code{time_value} column: \itemize{ -\item if it is a Date and the cadence is daily, then they can be integers -(which will be interpreted in units of days) or difftimes with units -"days" -\item if it is a Date and the cadence is weekly, then they must be difftimes -with units "weeks" -\item if it is an integer, then they must be integers -} - -Endpoints of the window are inclusive. Common settings: -\itemize{ -\item For trailing/right-aligned windows from \code{ref_time_value - k} to -\code{ref_time_value}: either pass \code{before=k} by itself, or pass \verb{before=k, after=0}. -\item For center-aligned windows from \code{ref_time_value - k} to -\code{ref_time_value + k}: pass \verb{before=k, after=k}. -\item For leading/left-aligned windows from \code{ref_time_value} to -\code{ref_time_value + k}: either pass pass \code{after=k} by itself, -or pass \verb{before=0, after=k}. -} - -See "Details:" on how missing rows are handled within the window.} - -\item{ref_time_values}{Time values for sliding computations, meaning, each +\item if time_type is Date and the cadence is daily, then \code{.window_size} can be +an integer (which will be interpreted in units of days) or a difftime +with units "days" +\item if time_type is Date and the cadence is weekly, then \code{.window_size} must +be a difftime with units "weeks" +\item if time_type is an integer, then \code{.window_size} must be an integer +}} + +\item{.align}{The alignment of the sliding window. If \code{right} (default), then +the window has its end at the reference time; if \code{center}, then the window is +centered at the reference time; if \code{left}, then the window has its start at +the reference time. If the alignment is \code{center} and the window size is odd, +then the window will have floor(window_size/2) points before and after the +reference time. If the window size is even, then the window will be +asymmetric and have one less value on the right side of the reference time +(assuming time increases from left to right).} + +\item{.ref_time_values}{Time values for sliding computations, meaning, each element of this vector serves as the reference time point for one sliding window. If missing, then this will be set to all unique time values in the underlying data table, by default.} -\item{new_col_name}{Not supported. Included to match \code{epi_slide} interface.} - -\item{all_rows}{If \code{all_rows = TRUE}, then all rows of \code{x} will be kept in -the output even with \code{ref_time_values} provided, with some type of missing +\item{.all_rows}{If \code{.all_rows = TRUE}, then all rows of \code{.x} will be kept in +the output even with \code{.ref_time_values} provided, with some type of missing value marker for the slide computation output column(s) for \code{time_value}s -outside \code{ref_time_values}; otherwise, there will be one row for each row in -\code{x} that had a \code{time_value} in \code{ref_time_values}. Default is \code{FALSE}. The +outside \code{.ref_time_values}; otherwise, there will be one row for each row in +\code{.x} that had a \code{time_value} in \code{.ref_time_values}. Default is \code{FALSE}. The missing value marker is the result of \code{vctrs::vec_cast}ing \code{NA} to the type -of the slide computation output. If using \code{as_list_col = TRUE}, note that -the missing marker is a \code{NULL} entry in the list column; for certain -operations, you might want to replace these \code{NULL} entries with a different -\code{NA} marker.} - -\item{as_list_col}{Not supported. Included to match \code{epi_slide} interface.} - -\item{names_sep}{Not supported. Included to match \code{epi_slide} interface.} +of the slide computation output.} } \value{ -An \code{epi_df} object given by appending one or more new columns to \code{x}, -named according to the \code{new_col_name} argument. +An \code{epi_df} object given by appending one or more new columns to \code{.x}, +named according to the \code{.new_col_name} argument. } \description{ Slides an n-timestep mean over variables in an \code{epi_df} object. See the \href{https://cmu-delphi.github.io/epiprocess/articles/slide.html}{slide vignette} for examples. } \details{ -Wrapper around \code{epi_slide_opt} with \code{f = datatable::frollmean}. - -To "slide" means to apply a function over a rolling window of time -steps for each data group, where the window is centered at a reference time -and left and right endpoints are given by the \code{before} and \code{after} -arguments. -If there are not enough time steps available to complete the window at any -given reference time, then \verb{epi_slide_*()} will fail; it requires a -complete window to perform the computation. For a centrally-aligned slide -of \code{n} \code{time_value}s in a sliding window, set \code{before = (n-1)/2} and \code{after = (n-1)/2} when the number of \code{time_value}s in a sliding window is odd and -\code{before = n/2-1} and \code{after = n/2} when \code{n} is even. - -Sometimes, we want to experiment with various trailing or leading window -widths and compare the slide outputs. In the (uncommon) case where -zero-width windows are considered, manually pass both the \code{before} and -\code{after} arguments. +Wrapper around \code{epi_slide_opt} with \code{.f = datatable::frollmean}. + +To "slide" means to apply a function or formula over a rolling +window. The \code{.window_size} arg determines the width of the window +(including the reference time) and the \code{.align} arg governs how the window +is aligned (see below for examples). The \code{.ref_time_values} arg controls +which time values to consider for the slide and \code{.all_rows} allows you to +keep NAs around. + +\verb{epi_slide_*()} does not require a complete window (such as on the left +boundary of the dataset) and will attempt to perform the computation +anyway. The issue of what to do with partial computations (those run on +incomplete windows) is therefore left up to the user, either through the +specified function or formula \code{f}, or through post-processing. + +Let's look at some window examples, assuming that the reference time value +is \code{tv}. With .align = "right" and .window_size = 3, the window will be: + +time_values: tv - 4, tv - 3, tv - 2, tv - 1, tv, tv + 1, tv + 2, tv + 3 +window: \link{tv - 2, tv - 1, tv} + +With .align = "center" and .window_size = 3, the window will be: + +time_values: tv - 4, tv - 3, tv - 2, tv - 1, tv, tv + 1, tv + 2, tv + 3 +window: \link{tv - 1, tv, tv + 1} + +With .align = "center" and .window_size = 4, the window will be: + +time_values: tv - 4, tv - 3, tv - 2, tv - 1, tv, tv + 1, tv + 2, tv + 3 +window: \link{tv - 2, tv - 1, tv, tv + 1} + +With .align = "left" and .window_size = 3, the window will be: + +time_values: tv - 4, tv - 3, tv - 2, tv - 1, tv, tv + 1, tv + 2, tv + 3 +window: \link{tv, tv + 1, tv + 2} } \examples{ # slide a 7-day trailing average formula on cases jhu_csse_daily_subset \%>\% group_by(geo_value) \%>\% - epi_slide_mean(cases, before = 6) \%>\% + epi_slide_mean(cases, .window_size = 7) \%>\% # Remove a nonessential var. to ensure new col is printed dplyr::select(geo_value, time_value, cases, cases_7dav = slide_value_cases) \%>\% ungroup() @@ -127,7 +129,7 @@ jhu_csse_daily_subset \%>\% group_by(geo_value) \%>\% epi_slide_mean( cases, - before = 6, + .window_size = 7, # `frollmean` options na.rm = TRUE, algo = "exact", hasNA = TRUE ) \%>\% @@ -137,23 +139,23 @@ jhu_csse_daily_subset \%>\% # slide a 7-day leading average jhu_csse_daily_subset \%>\% group_by(geo_value) \%>\% - epi_slide_mean(cases, after = 6) \%>\% + epi_slide_mean(cases, .window_size = 7, .align = "right") \%>\% # Remove a nonessential var. to ensure new col is printed dplyr::select(geo_value, time_value, cases, cases_7dav = slide_value_cases) \%>\% ungroup() -# slide a 7-day centre-aligned average +# slide a 7-day center-aligned average jhu_csse_daily_subset \%>\% group_by(geo_value) \%>\% - epi_slide_mean(cases, before = 3, after = 3) \%>\% + epi_slide_mean(cases, .window_size = 7, .align = "center") \%>\% # Remove a nonessential var. to ensure new col is printed dplyr::select(geo_value, time_value, cases, cases_7dav = slide_value_cases) \%>\% ungroup() -# slide a 14-day centre-aligned average +# slide a 14-day center-aligned average jhu_csse_daily_subset \%>\% group_by(geo_value) \%>\% - epi_slide_mean(cases, before = 6, after = 7) \%>\% + epi_slide_mean(cases, .window_size = 14, .align = "center") \%>\% # Remove a nonessential var. to ensure new col is printed dplyr::select(geo_value, time_value, cases, cases_14dav = slide_value_cases) \%>\% ungroup() diff --git a/man/epi_slide_opt.Rd b/man/epi_slide_opt.Rd index f0442d1e..e9ea1a8c 100644 --- a/man/epi_slide_opt.Rd +++ b/man/epi_slide_opt.Rd @@ -2,28 +2,26 @@ % Please edit documentation in R/slide.R \name{epi_slide_opt} \alias{epi_slide_opt} -\title{Optimized slide function for performing common rolling computations on an \code{epi_df} object} +\title{Optimized slide function for performing common rolling computations on an +\code{epi_df} object} \usage{ epi_slide_opt( - x, - col_names, - f, + .x, + .col_names, + .f, ..., - before = NULL, - after = NULL, - ref_time_values = NULL, - new_col_name = NULL, - all_rows = FALSE, - as_list_col = deprecated(), - names_sep = NULL + .window_size = 0, + .align = c("right", "center", "left"), + .ref_time_values = NULL, + .all_rows = FALSE ) } \arguments{ -\item{x}{The \code{epi_df} object under consideration, \link[dplyr:group_by]{grouped} -or ungrouped. If ungrouped, all data in \code{x} will be treated as part of a +\item{.x}{The \code{epi_df} object under consideration, \link[dplyr:group_by]{grouped} +or ungrouped. If ungrouped, all data in \code{.x} will be treated as part of a single data group.} -\item{col_names}{<\code{\link[=dplyr_tidy_select]{tidy-select}}> An unquoted column +\item{.col_names}{<\code{\link[=dplyr_tidy_select]{tidy-select}}> An unquoted column name(e.g., \code{cases}), multiple column names (e.g., \code{c(cases, deaths)}), \link[tidyselect:language]{other tidy-select expression}, or a vector of characters (e.g. \code{c("cases", "deaths")}). Variable names can be used as if @@ -34,82 +32,61 @@ The tidy-selection renaming interface is not supported, and cannot be used to provide output column names; if you want to customize the output column names, use \code{\link[dplyr:rename]{dplyr::rename}} after the slide.} -\item{f}{Function; together with \code{...} specifies the computation to slide. -\code{f} must be one of \code{data.table}'s rolling functions +\item{.f}{Function; together with \code{...} specifies the computation to slide. +\code{.f} must be one of \code{data.table}'s rolling functions (\code{frollmean}, \code{frollsum}, \code{frollapply}. See \link[data.table:froll]{data.table::roll}) or one of \code{slider}'s specialized sliding functions (\code{slide_mean}, \code{slide_sum}, -etc. See \link[slider:summary-slide]{slider::summary-slide}). To "slide" means to apply a -computation within a sliding (a.k.a. "rolling") time window for each data -group. The window is determined by the \code{before} and \code{after} parameters -described below. One time step is typically one day or one week; see -details for more explanation. +etc. See \link[slider:summary-slide]{slider::summary-slide}). The optimized \code{data.table} and \code{slider} functions can't be directly passed -as the computation function in \code{epi_slide} without careful handling to -make sure each computation group is made up of the \code{n} dates rather than -\code{n} points. \code{epi_slide_opt} (and wrapper functions \code{epi_slide_mean} and -\code{epi_slide_sum}) take care of window completion automatically to prevent -associated errors.} - -\item{...}{Additional arguments to pass to the slide computation \code{f}, for -example, \code{na.rm} and \code{algo} if \code{f} is a \code{data.table} function. If \code{f} is -a \code{data.table} function, it is automatically passed the data \code{x} to -operate on, the window size \code{n}, and the alignment \code{align}. Providing -these args via \code{...} will cause an error. If \code{f} is a \code{slider} function, -it is automatically passed the data \code{x} to operate on, and number of -points \code{before} and \code{after} to use in the computation.} - -\item{before, after}{How far \code{before} and \code{after} each \code{ref_time_value} should -the sliding window extend? At least one of these two arguments must be -provided; the other's default will be 0. The accepted values for these -depend on the type of the \code{time_value} column: +as the computation function in \code{epi_slide} without careful handling to make +sure each computation group is made up of the \code{.window_size} dates rather +than \code{.window_size} points. \code{epi_slide_opt} (and wrapper functions +\code{epi_slide_mean} and \code{epi_slide_sum}) take care of window completion +automatically to prevent associated errors.} + +\item{...}{Additional arguments to pass to the slide computation \code{.f}, for +example, \code{algo} or \code{na.rm} in data.table functions. You don't need to +specify \code{.x}, \code{.window_size}, or \code{.align} (or \code{before}/\code{after} for slider +functions).} + +\item{.window_size}{The size of the sliding window. By default, this is 1, +meaning that only the current ref_time_value is included. The accepted values +here depend on the \code{time_value} column: \itemize{ -\item if it is a Date and the cadence is daily, then they can be integers -(which will be interpreted in units of days) or difftimes with units -"days" -\item if it is a Date and the cadence is weekly, then they must be difftimes -with units "weeks" -\item if it is an integer, then they must be integers -} - -Endpoints of the window are inclusive. Common settings: -\itemize{ -\item For trailing/right-aligned windows from \code{ref_time_value - k} to -\code{ref_time_value}: either pass \code{before=k} by itself, or pass \verb{before=k, after=0}. -\item For center-aligned windows from \code{ref_time_value - k} to -\code{ref_time_value + k}: pass \verb{before=k, after=k}. -\item For leading/left-aligned windows from \code{ref_time_value} to -\code{ref_time_value + k}: either pass pass \code{after=k} by itself, -or pass \verb{before=0, after=k}. -} - -See "Details:" on how missing rows are handled within the window.} - -\item{ref_time_values}{Time values for sliding computations, meaning, each +\item if time_type is Date and the cadence is daily, then \code{.window_size} can be +an integer (which will be interpreted in units of days) or a difftime +with units "days" +\item if time_type is Date and the cadence is weekly, then \code{.window_size} must +be a difftime with units "weeks" +\item if time_type is an integer, then \code{.window_size} must be an integer +}} + +\item{.align}{The alignment of the sliding window. If \code{right} (default), then +the window has its end at the reference time; if \code{center}, then the window is +centered at the reference time; if \code{left}, then the window has its start at +the reference time. If the alignment is \code{center} and the window size is odd, +then the window will have floor(window_size/2) points before and after the +reference time. If the window size is even, then the window will be +asymmetric and have one less value on the right side of the reference time +(assuming time increases from left to right).} + +\item{.ref_time_values}{Time values for sliding computations, meaning, each element of this vector serves as the reference time point for one sliding window. If missing, then this will be set to all unique time values in the underlying data table, by default.} -\item{new_col_name}{Not supported. Included to match \code{epi_slide} interface.} - -\item{all_rows}{If \code{all_rows = TRUE}, then all rows of \code{x} will be kept in -the output even with \code{ref_time_values} provided, with some type of missing +\item{.all_rows}{If \code{.all_rows = TRUE}, then all rows of \code{.x} will be kept in +the output even with \code{.ref_time_values} provided, with some type of missing value marker for the slide computation output column(s) for \code{time_value}s -outside \code{ref_time_values}; otherwise, there will be one row for each row in -\code{x} that had a \code{time_value} in \code{ref_time_values}. Default is \code{FALSE}. The +outside \code{.ref_time_values}; otherwise, there will be one row for each row in +\code{.x} that had a \code{time_value} in \code{.ref_time_values}. Default is \code{FALSE}. The missing value marker is the result of \code{vctrs::vec_cast}ing \code{NA} to the type -of the slide computation output. If using \code{as_list_col = TRUE}, note that -the missing marker is a \code{NULL} entry in the list column; for certain -operations, you might want to replace these \code{NULL} entries with a different -\code{NA} marker.} - -\item{as_list_col}{Not supported. Included to match \code{epi_slide} interface.} - -\item{names_sep}{Not supported. Included to match \code{epi_slide} interface.} +of the slide computation output.} } \value{ -An \code{epi_df} object given by appending one or more new columns to \code{x}, -named according to the \code{new_col_name} argument. +An \code{epi_df} object given by appending one or more new columns to \code{.x}, +named according to the \code{.new_col_name} argument. } \description{ Slides an n-timestep \link[data.table:froll]{data.table::froll} or \link[slider:summary-slide]{slider::summary-slide} function @@ -118,20 +95,39 @@ over variables in an \code{epi_df} object. See the for examples. } \details{ -To "slide" means to apply a function over a rolling window of time -steps for each data group, where the window is centered at a reference time -and left and right endpoints are given by the \code{before} and \code{after} -arguments. -If there are not enough time steps available to complete the window at any -given reference time, then \verb{epi_slide_*()} will fail; it requires a -complete window to perform the computation. For a centrally-aligned slide -of \code{n} \code{time_value}s in a sliding window, set \code{before = (n-1)/2} and \code{after = (n-1)/2} when the number of \code{time_value}s in a sliding window is odd and -\code{before = n/2-1} and \code{after = n/2} when \code{n} is even. - -Sometimes, we want to experiment with various trailing or leading window -widths and compare the slide outputs. In the (uncommon) case where -zero-width windows are considered, manually pass both the \code{before} and -\code{after} arguments. +To "slide" means to apply a function or formula over a rolling +window. The \code{.window_size} arg determines the width of the window +(including the reference time) and the \code{.align} arg governs how the window +is aligned (see below for examples). The \code{.ref_time_values} arg controls +which time values to consider for the slide and \code{.all_rows} allows you to +keep NAs around. + +\verb{epi_slide_*()} does not require a complete window (such as on the left +boundary of the dataset) and will attempt to perform the computation +anyway. The issue of what to do with partial computations (those run on +incomplete windows) is therefore left up to the user, either through the +specified function or formula \code{f}, or through post-processing. + +Let's look at some window examples, assuming that the reference time value +is \code{tv}. With .align = "right" and .window_size = 3, the window will be: + +time_values: tv - 4, tv - 3, tv - 2, tv - 1, tv, tv + 1, tv + 2, tv + 3 +window: \link{tv - 2, tv - 1, tv} + +With .align = "center" and .window_size = 3, the window will be: + +time_values: tv - 4, tv - 3, tv - 2, tv - 1, tv, tv + 1, tv + 2, tv + 3 +window: \link{tv - 1, tv, tv + 1} + +With .align = "center" and .window_size = 4, the window will be: + +time_values: tv - 4, tv - 3, tv - 2, tv - 1, tv, tv + 1, tv + 2, tv + 3 +window: \link{tv - 2, tv - 1, tv, tv + 1} + +With .align = "left" and .window_size = 3, the window will be: + +time_values: tv - 4, tv - 3, tv - 2, tv - 1, tv, tv + 1, tv + 2, tv + 3 +window: \link{tv, tv + 1, tv + 2} } \examples{ # slide a 7-day trailing average formula on cases. This can also be done with `epi_slide_mean` @@ -139,7 +135,7 @@ jhu_csse_daily_subset \%>\% group_by(geo_value) \%>\% epi_slide_opt( cases, - f = data.table::frollmean, before = 6 + .f = data.table::frollmean, .window_size = 7 ) \%>\% # Remove a nonessential var. to ensure new col is printed, and rename new col dplyr::select(geo_value, time_value, cases, cases_7dav = slide_value_cases) \%>\% @@ -151,9 +147,9 @@ jhu_csse_daily_subset \%>\% group_by(geo_value) \%>\% epi_slide_opt( cases, - f = data.table::frollmean, before = 6, + .f = data.table::frollmean, .window_size = 7, # `frollmean` options - na.rm = TRUE, algo = "exact", hasNA = TRUE + algo = "exact", hasNA = TRUE, na.rm = TRUE ) \%>\% dplyr::select(geo_value, time_value, cases, cases_7dav = slide_value_cases) \%>\% ungroup() @@ -163,18 +159,18 @@ jhu_csse_daily_subset \%>\% group_by(geo_value) \%>\% epi_slide_opt( cases, - f = slider::slide_mean, after = 6 + .f = slider::slide_mean, .window_size = 7, .align = "left" ) \%>\% # Remove a nonessential var. to ensure new col is printed dplyr::select(geo_value, time_value, cases, cases_7dav = slide_value_cases) \%>\% ungroup() -# slide a 7-day centre-aligned sum. This can also be done with `epi_slide_sum` +# slide a 7-day center-aligned sum. This can also be done with `epi_slide_sum` jhu_csse_daily_subset \%>\% group_by(geo_value) \%>\% epi_slide_opt( cases, - f = data.table::frollsum, before = 3, after = 3 + .f = data.table::frollsum, .window_size = 6, .align = "center" ) \%>\% # Remove a nonessential var. to ensure new col is printed dplyr::select(geo_value, time_value, cases, cases_7dav = slide_value_cases) \%>\% diff --git a/man/epi_slide_sum.Rd b/man/epi_slide_sum.Rd index 8e79474a..20b6abc2 100644 --- a/man/epi_slide_sum.Rd +++ b/man/epi_slide_sum.Rd @@ -5,24 +5,21 @@ \title{Optimized slide function for performing rolling sums on an \code{epi_df} object} \usage{ epi_slide_sum( - x, - col_names, + .x, + .col_names, ..., - before = NULL, - after = NULL, - ref_time_values = NULL, - new_col_name = NULL, - all_rows = FALSE, - as_list_col = deprecated(), - names_sep = NULL + .window_size = 0, + .align = c("right", "center", "left"), + .ref_time_values = NULL, + .all_rows = FALSE ) } \arguments{ -\item{x}{The \code{epi_df} object under consideration, \link[dplyr:group_by]{grouped} -or ungrouped. If ungrouped, all data in \code{x} will be treated as part of a +\item{.x}{The \code{epi_df} object under consideration, \link[dplyr:group_by]{grouped} +or ungrouped. If ungrouped, all data in \code{.x} will be treated as part of a single data group.} -\item{col_names}{<\code{\link[=dplyr_tidy_select]{tidy-select}}> An unquoted column +\item{.col_names}{<\code{\link[=dplyr_tidy_select]{tidy-select}}> An unquoted column name(e.g., \code{cases}), multiple column names (e.g., \code{c(cases, deaths)}), \link[tidyselect:language]{other tidy-select expression}, or a vector of characters (e.g. \code{c("cases", "deaths")}). Variable names can be used as if @@ -33,90 +30,95 @@ The tidy-selection renaming interface is not supported, and cannot be used to provide output column names; if you want to customize the output column names, use \code{\link[dplyr:rename]{dplyr::rename}} after the slide.} -\item{...}{Additional arguments to pass to \code{data.table::frollsum}, for -example, \code{na.rm} and \code{algo}. \code{data.table::frollsum} is automatically -passed the data \code{x} to operate on, the window size \code{n}, and the alignment -\code{align}. Providing these args via \code{...} will cause an error.} +\item{...}{Additional arguments to pass to the slide computation \code{.f}, for +example, \code{algo} or \code{na.rm} in data.table functions. You don't need to +specify \code{.x}, \code{.window_size}, or \code{.align} (or \code{before}/\code{after} for slider +functions).} -\item{before, after}{How far \code{before} and \code{after} each \code{ref_time_value} should -the sliding window extend? At least one of these two arguments must be -provided; the other's default will be 0. The accepted values for these -depend on the type of the \code{time_value} column: +\item{.window_size}{The size of the sliding window. By default, this is 1, +meaning that only the current ref_time_value is included. The accepted values +here depend on the \code{time_value} column: \itemize{ -\item if it is a Date and the cadence is daily, then they can be integers -(which will be interpreted in units of days) or difftimes with units -"days" -\item if it is a Date and the cadence is weekly, then they must be difftimes -with units "weeks" -\item if it is an integer, then they must be integers -} +\item if time_type is Date and the cadence is daily, then \code{.window_size} can be +an integer (which will be interpreted in units of days) or a difftime +with units "days" +\item if time_type is Date and the cadence is weekly, then \code{.window_size} must +be a difftime with units "weeks" +\item if time_type is an integer, then \code{.window_size} must be an integer +}} -Endpoints of the window are inclusive. Common settings: -\itemize{ -\item For trailing/right-aligned windows from \code{ref_time_value - k} to -\code{ref_time_value}: either pass \code{before=k} by itself, or pass \verb{before=k, after=0}. -\item For center-aligned windows from \code{ref_time_value - k} to -\code{ref_time_value + k}: pass \verb{before=k, after=k}. -\item For leading/left-aligned windows from \code{ref_time_value} to -\code{ref_time_value + k}: either pass pass \code{after=k} by itself, -or pass \verb{before=0, after=k}. -} +\item{.align}{The alignment of the sliding window. If \code{right} (default), then +the window has its end at the reference time; if \code{center}, then the window is +centered at the reference time; if \code{left}, then the window has its start at +the reference time. If the alignment is \code{center} and the window size is odd, +then the window will have floor(window_size/2) points before and after the +reference time. If the window size is even, then the window will be +asymmetric and have one less value on the right side of the reference time +(assuming time increases from left to right).} -See "Details:" on how missing rows are handled within the window.} - -\item{ref_time_values}{Time values for sliding computations, meaning, each +\item{.ref_time_values}{Time values for sliding computations, meaning, each element of this vector serves as the reference time point for one sliding window. If missing, then this will be set to all unique time values in the underlying data table, by default.} -\item{new_col_name}{Not supported. Included to match \code{epi_slide} interface.} - -\item{all_rows}{If \code{all_rows = TRUE}, then all rows of \code{x} will be kept in -the output even with \code{ref_time_values} provided, with some type of missing +\item{.all_rows}{If \code{.all_rows = TRUE}, then all rows of \code{.x} will be kept in +the output even with \code{.ref_time_values} provided, with some type of missing value marker for the slide computation output column(s) for \code{time_value}s -outside \code{ref_time_values}; otherwise, there will be one row for each row in -\code{x} that had a \code{time_value} in \code{ref_time_values}. Default is \code{FALSE}. The +outside \code{.ref_time_values}; otherwise, there will be one row for each row in +\code{.x} that had a \code{time_value} in \code{.ref_time_values}. Default is \code{FALSE}. The missing value marker is the result of \code{vctrs::vec_cast}ing \code{NA} to the type -of the slide computation output. If using \code{as_list_col = TRUE}, note that -the missing marker is a \code{NULL} entry in the list column; for certain -operations, you might want to replace these \code{NULL} entries with a different -\code{NA} marker.} - -\item{as_list_col}{Not supported. Included to match \code{epi_slide} interface.} - -\item{names_sep}{Not supported. Included to match \code{epi_slide} interface.} +of the slide computation output.} } \value{ -An \code{epi_df} object given by appending one or more new columns to \code{x}, -named according to the \code{new_col_name} argument. +An \code{epi_df} object given by appending one or more new columns to \code{.x}, +named according to the \code{.new_col_name} argument. } \description{ Slides an n-timestep sum over variables in an \code{epi_df} object. See the \href{https://cmu-delphi.github.io/epiprocess/articles/slide.html}{slide vignette} for examples. } \details{ -Wrapper around \code{epi_slide_opt} with \code{f = datatable::frollsum}. - -To "slide" means to apply a function over a rolling window of time -steps for each data group, where the window is centered at a reference time -and left and right endpoints are given by the \code{before} and \code{after} -arguments. -If there are not enough time steps available to complete the window at any -given reference time, then \verb{epi_slide_*()} will fail; it requires a -complete window to perform the computation. For a centrally-aligned slide -of \code{n} \code{time_value}s in a sliding window, set \code{before = (n-1)/2} and \code{after = (n-1)/2} when the number of \code{time_value}s in a sliding window is odd and -\code{before = n/2-1} and \code{after = n/2} when \code{n} is even. - -Sometimes, we want to experiment with various trailing or leading window -widths and compare the slide outputs. In the (uncommon) case where -zero-width windows are considered, manually pass both the \code{before} and -\code{after} arguments. +Wrapper around \code{epi_slide_opt} with \code{.f = datatable::frollsum}. + +To "slide" means to apply a function or formula over a rolling +window. The \code{.window_size} arg determines the width of the window +(including the reference time) and the \code{.align} arg governs how the window +is aligned (see below for examples). The \code{.ref_time_values} arg controls +which time values to consider for the slide and \code{.all_rows} allows you to +keep NAs around. + +\verb{epi_slide_*()} does not require a complete window (such as on the left +boundary of the dataset) and will attempt to perform the computation +anyway. The issue of what to do with partial computations (those run on +incomplete windows) is therefore left up to the user, either through the +specified function or formula \code{f}, or through post-processing. + +Let's look at some window examples, assuming that the reference time value +is \code{tv}. With .align = "right" and .window_size = 3, the window will be: + +time_values: tv - 4, tv - 3, tv - 2, tv - 1, tv, tv + 1, tv + 2, tv + 3 +window: \link{tv - 2, tv - 1, tv} + +With .align = "center" and .window_size = 3, the window will be: + +time_values: tv - 4, tv - 3, tv - 2, tv - 1, tv, tv + 1, tv + 2, tv + 3 +window: \link{tv - 1, tv, tv + 1} + +With .align = "center" and .window_size = 4, the window will be: + +time_values: tv - 4, tv - 3, tv - 2, tv - 1, tv, tv + 1, tv + 2, tv + 3 +window: \link{tv - 2, tv - 1, tv, tv + 1} + +With .align = "left" and .window_size = 3, the window will be: + +time_values: tv - 4, tv - 3, tv - 2, tv - 1, tv, tv + 1, tv + 2, tv + 3 +window: \link{tv, tv + 1, tv + 2} } \examples{ # slide a 7-day trailing sum formula on cases jhu_csse_daily_subset \%>\% group_by(geo_value) \%>\% - epi_slide_sum(cases, before = 6) \%>\% + epi_slide_sum(cases, .window_size = 7) \%>\% # Remove a nonessential var. to ensure new col is printed dplyr::select(geo_value, time_value, cases, cases_7dsum = slide_value_cases) \%>\% ungroup() diff --git a/tests/testthat/test-epi_slide.R b/tests/testthat/test-epi_slide.R index dcf121f6..08738252 100644 --- a/tests/testthat/test-epi_slide.R +++ b/tests/testthat/test-epi_slide.R @@ -1,4 +1,5 @@ library(cli) +library(dplyr) test_date <- as.Date("2020-01-01") days_dt <- as.difftime(1, units = "days") @@ -24,7 +25,7 @@ overlap_index <- toy_edf %>% as.Date() # Utility functions for computing expected slide_sum output -compute_slide_external <- function(before, overlap = FALSE) { +compute_slide_external <- function(.window_size, overlap = FALSE) { if (overlap) { toy_edf <- toy_edf %>% filter(time_value %in% overlap_index) @@ -33,16 +34,16 @@ compute_slide_external <- function(before, overlap = FALSE) { } slide_value <- toy_edf %>% group_by(time_value) %>% - summarize(value = sum(value)) %>% - pull(value) %>% - slider::slide_sum(before = before) + summarize(value = sum(.data$value)) %>% + pull(.data$value) %>% + slider::slide_sum(before = .window_size - 1) toy_edf_g %>% mutate(slide_value = slide_value) %>% ungroup() } -compute_slide_external_g <- function(before) { +compute_slide_external_g <- function(.window_size) { toy_edf_g %>% - mutate(slide_value = slider::slide_sum(value, before = before)) %>% + mutate(slide_value = slider::slide_sum(.data$value, before = .window_size - 1)) %>% dplyr::arrange(geo_value, time_value) %>% as_epi_df(as_of = test_date + 100) } @@ -56,27 +57,19 @@ bad_values <- list( ) purrr::walk(bad_values, function(bad_value) { test_that( - format_inline("`before` and `after` in epi_slide fail on {bad_value}"), + format_inline("`.window_size` fails on {bad_value}"), { expect_error( - epi_slide(toy_edf_g, before = bad_value, ref_time_values = test_date + 2), - class = "epiprocess__validate_slide_window_arg" - ) - expect_error( - epi_slide(toy_edf_g, after = bad_value, ref_time_values = test_date + 2), + epi_slide(toy_edf_g, .window_size = bad_value, .ref_time_values = test_date + 2), class = "epiprocess__validate_slide_window_arg" ) } ) }) purrr::walk(bad_values, function(bad_value) { - test_that(format_inline("`before` and `after` in epi_slide_mean fail on {bad_value}"), { + test_that(format_inline("`.window_size` in epi_slide_mean fails on {bad_value}"), { expect_error( - epi_slide_mean(toy_edf_g, col_names = value, before = bad_value, ref_time_values = test_date + 2), - class = "epiprocess__validate_slide_window_arg" - ) - expect_error( - epi_slide_mean(toy_edf_g, col_names = value, after = bad_value, ref_time_values = test_date + 2), + epi_slide_mean(toy_edf_g, .col_names = value, .window_size = bad_value, .ref_time_values = test_date + 2), class = "epiprocess__validate_slide_window_arg" ) }) @@ -84,58 +77,44 @@ purrr::walk(bad_values, function(bad_value) { bad_values <- c(min(toy_edf_g$time_value) - 1, max(toy_edf_g$time_value) + 1) purrr::walk(bad_values, function(bad_value) { - test_that(format_inline("epi_slide[_mean]: `ref_time_values` out of range for all groups {bad_value}"), { + test_that(format_inline("epi_slide[_mean]: `.ref_time_values` out of range for all groups {bad_value}"), { expect_error( - epi_slide(toy_edf_g, f_tib_avg_count, before = 2 * days_dt, ref_time_values = bad_value), + epi_slide(toy_edf_g, f_tib_avg_count, .window_size = 2 * days_dt, .ref_time_values = bad_value), class = "epi_slide__invalid_ref_time_values" ) expect_error( - epi_slide_mean(toy_edf_g, col_names = value, before = 2 * days_dt, ref_time_values = bad_value), + epi_slide_mean(toy_edf_g, .col_names = value, .window_size = 2 * days_dt, .ref_time_values = bad_value), class = "epi_slide_opt__invalid_ref_time_values" ) }) }) test_that( - "epi_slide or epi_slide_mean: `ref_time_values` in range for at least one group generate no error", + "epi_slide or epi_slide_mean: `.ref_time_values` in range for at least one group generate no error", { expect_equal( - epi_slide(toy_edf_g, ~ sum(.x$value), before = 2 * days_dt, ref_time_values = test_date + 5) %>% ungroup(), - compute_slide_external_g(before = 2) %>% ungroup() %>% filter(time_value == test_date + 5) + epi_slide(toy_edf_g, ~ sum(.x$value), .window_size = 2 * days_dt, .ref_time_values = test_date + 5) %>% ungroup(), + compute_slide_external_g(.window_size = 2) %>% ungroup() %>% filter(time_value == test_date + 5) ) expect_equal( - epi_slide_sum(toy_edf_g, value, before = 2 * days_dt, ref_time_values = test_date + 5, na.rm = TRUE) %>% + epi_slide_sum(toy_edf_g, value, .window_size = 2 * days_dt, .ref_time_values = test_date + 5, na.rm = TRUE) %>% ungroup() %>% rename(slide_value = slide_value_value), - compute_slide_external_g(before = 2) %>% ungroup() %>% filter(time_value == test_date + 5) + compute_slide_external_g(.window_size = 2) %>% ungroup() %>% filter(time_value == test_date + 5) ) } ) -test_that("epi_slide_mean errors when `as_list_col` non-NULL", { - expect_error( - toy_edf %>% - filter( - geo_value == "a" - ) %>% - epi_slide_mean( - value, - before = 6 * days_dt, as_list_col = TRUE, na.rm = TRUE - ), - class = "lifecycle_error_deprecated" - ) -}) - test_that("epi_slide alerts if the provided f doesn't take enough args", { expect_no_error( - epi_slide(toy_edf_g, f_tib_avg_count, before = days_dt, ref_time_values = test_date + 1), + epi_slide(toy_edf_g, f_tib_avg_count, .window_size = days_dt, .ref_time_values = test_date + 1), ) expect_no_warning( - epi_slide(toy_edf_g, f_tib_avg_count, before = days_dt, ref_time_values = test_date + 1), + epi_slide(toy_edf_g, f_tib_avg_count, .window_size = days_dt, .ref_time_values = test_date + 1), ) f_x_dots <- function(x, ...) dplyr::tibble(value = mean(x$value), count = length(x$value)) - expect_warning(epi_slide(toy_edf_g, f_x_dots, before = days_dt, ref_time_values = test_date + 1), + expect_warning(epi_slide(toy_edf_g, f_x_dots, .window_size = days_dt, .ref_time_values = test_date + 1), class = "epiprocess__assert_sufficient_f_args__mandatory_f_args_passed_to_f_dots" ) }) @@ -153,10 +132,7 @@ for (all_rows in list(FALSE, TRUE)) { { simpler_slide_call <- function(f) { toy_edf_g %>% - epi_slide( - before = 6 * days_dt, f, - ref_time_values = rtv, all_rows = all_rows - ) + epi_slide(f, .window_size = 6 * days_dt, .ref_time_values = rtv, .all_rows = all_rows) } filter_expected <- function(x) { if (all_rows && !is.null(rtv)) { @@ -170,43 +146,45 @@ for (all_rows in list(FALSE, TRUE)) { expect_equal( simpler_slide_call(~ sum(.x$value)), - compute_slide_external_g(before = 6) %>% filter_expected() + compute_slide_external_g(.window_size = 6) %>% filter_expected() ) expect_equal( simpler_slide_call(~ list(rep(sum(.x$value), 2L))), - compute_slide_external_g(before = 6) %>% + compute_slide_external_g(.window_size = 6) %>% mutate(slide_value = lapply(slide_value, rep, 2L)) %>% filter_expected() ) expect_equal( simpler_slide_call(~ data.frame(slide_value = sum(.x$value))), - compute_slide_external_g(before = 6) %>% filter_expected() + compute_slide_external_g(.window_size = 6) %>% filter_expected() ) expect_equal( simpler_slide_call(~ list(data.frame(slide_value = sum(.x$value)))), - compute_slide_external_g(before = 6) %>% + compute_slide_external_g(.window_size = 6) %>% mutate(slide_value = purrr::map(slide_value, ~ data.frame(slide_value = .x))) %>% filter_expected() ) expect_identical( simpler_slide_call(~ tibble(slide_value = list(sum(.x$value)))), - compute_slide_external_g(before = 6) %>% mutate(slide_value = as.list(slide_value)) %>% filter_expected() + compute_slide_external_g(.window_size = 6) %>% + mutate(slide_value = as.list(slide_value)) %>% + filter_expected() ) # unnamed data-masking expression producing data frame: # unfortunately, we can't pass this directly as `f` and need an extra comma slide_unnamed_df <- toy_edf_g %>% epi_slide( - before = 6L, , data.frame(slide_value = sum(.x$value)), - ref_time_values = rtv, all_rows = all_rows + .window_size = 6L, , data.frame(slide_value = sum(.x$value)), + .ref_time_values = rtv, .all_rows = all_rows ) expect_identical( slide_unnamed_df, - compute_slide_external_g(before = 6) %>% filter_expected() + compute_slide_external_g(.window_size = 6) %>% filter_expected() ) } ) @@ -219,8 +197,8 @@ for (all_rows in list(FALSE, TRUE)) { for (rtv in list(NULL, overlap_index)) { test_that( format_inline( - "epi_slide_sum works with formulas, lists, and data.frame outputs with ref_time_value={rtv} - and all_rows={all_rows}" + "epi_slide_sum works with formulas, lists, and data.frame outputs with .ref_time_value={rtv} + and .all_rows={all_rows}" ), { filter_expected <- function(x) { @@ -237,11 +215,10 @@ for (all_rows in list(FALSE, TRUE)) { toy_edf_g %>% epi_slide_sum( value, - before = 6 * days_dt, - ref_time_values = rtv, all_rows = all_rows, na.rm = TRUE + .window_size = 6 * days_dt, .ref_time_values = rtv, .all_rows = all_rows, na.rm = TRUE ) %>% rename(slide_value = slide_value_value), - compute_slide_external_g(before = 6) %>% filter_expected() + compute_slide_external_g(.window_size = 6) %>% filter_expected() ) } ) @@ -253,13 +230,13 @@ purrr::walk(possible_f, function(f) { test_that("epi_slide computation can use ref_time_value", { # Grouped with multiple geos expect_equal( - toy_edf_g %>% epi_slide(f = f, before = 50 * days_dt), + toy_edf_g %>% epi_slide(f, .window_size = 50 * days_dt), toy_edf_g %>% mutate(slide_value = time_value) ) # Ungrouped with multiple geos expect_equal( - toy_edf %>% epi_slide(f = f, before = 50 * days_dt), + toy_edf %>% epi_slide(f, .window_size = 50 * days_dt), toy_edf %>% mutate(slide_value = time_value) %>% arrange(time_value) ) }) @@ -268,7 +245,7 @@ purrr::walk(possible_f, function(f) { test_that("epi_slide computation via dots can use ref_time_value and group", { # Use ref_time_value expect_equal( - toy_edf_g %>% epi_slide(before = 50 * days_dt, slide_value = .ref_time_value), + toy_edf_g %>% epi_slide(slide_value = .ref_time_value, .window_size = 50 * days_dt), toy_edf_g %>% mutate(slide_value = time_value) ) @@ -276,25 +253,25 @@ test_that("epi_slide computation via dots can use ref_time_value and group", { # `.env`. expect_error(toy_edf_g %>% epi_slide( - before = 50 * days_dt, - slide_value = .env$.ref_time_value + slide_value = .env$.ref_time_value, + .window_size = 50 * days_dt )) # Grouped and use group key as value expect_equal( - toy_edf_g %>% epi_slide(before = 2 * days_dt, slide_value = .group_key$geo_value), + toy_edf_g %>% epi_slide(slide_value = .group_key$geo_value, .window_size = 2 * days_dt), toy_edf_g %>% mutate(slide_value = geo_value) ) # Use entire group_key object expect_equal( - toy_edf_g %>% epi_slide(before = 2 * days_dt, slide_value = nrow(.group_key)), + toy_edf_g %>% epi_slide(.window_size = 2 * days_dt, slide_value = nrow(.group_key)), toy_edf_g %>% mutate(slide_value = 1L) ) # Ungrouped with multiple geos expect_equal( - toy_edf %>% epi_slide(before = 50 * days_dt, slide_value = .ref_time_value), + toy_edf %>% epi_slide(.window_size = 50 * days_dt, slide_value = .ref_time_value), toy_edf %>% mutate(slide_value = time_value) %>% arrange(time_value) ) }) @@ -302,14 +279,14 @@ test_that("epi_slide computation via dots can use ref_time_value and group", { test_that("epi_slide computation via dots outputs the same result using col names and the data var", { expected_output <- toy_edf %>% epi_slide( - before = 2 * days_dt, + .window_size = 2 * days_dt, slide_value = max(time_value) ) %>% as_epi_df(as_of = test_date + 6) result1 <- toy_edf %>% epi_slide( - before = 2 * days_dt, + .window_size = 2 * days_dt, slide_value = max(.x$time_value) ) @@ -317,7 +294,7 @@ test_that("epi_slide computation via dots outputs the same result using col name result2 <- toy_edf %>% epi_slide( - before = 2 * days_dt, + .window_size = 2 * days_dt, slide_value = max(.data$time_value) ) @@ -327,7 +304,7 @@ test_that("epi_slide computation via dots outputs the same result using col name test_that("`epi_slide` can access objects inside of helper functions", { helper <- function(archive_haystack, time_value_needle) { archive_haystack %>% epi_slide( - has_needle = time_value_needle %in% time_value, before = 365000L * days_dt + has_needle = time_value_needle %in% time_value, .window_size = 365000L * days_dt ) } expect_error( @@ -340,13 +317,17 @@ test_that("`epi_slide` can access objects inside of helper functions", { test_that("basic ungrouped epi_slide computation produces expected output", { # Single geo expect_equal( - toy_edf %>% filter(geo_value == "a") %>% epi_slide(before = 50 * days_dt, slide_value = sum(.x$value)), - compute_slide_external_g(before = 50) %>% ungroup() %>% filter(geo_value == "a") %>% arrange(time_value) + toy_edf %>% + filter(geo_value == "a") %>% + epi_slide(.window_size = 50 * days_dt, slide_value = sum(.x$value)), + compute_slide_external_g(.window_size = 50) %>% ungroup() %>% filter(geo_value == "a") %>% arrange(time_value) ) # Multiple geos expect_equal( - toy_edf %>% filter(time_value %in% overlap_index) %>% epi_slide(before = 50 * days_dt, slide_value = sum(.x$value)), - compute_slide_external(before = 50, overlap = TRUE) %>% arrange(time_value) + toy_edf %>% + filter(time_value %in% overlap_index) %>% + epi_slide(.window_size = 50 * days_dt, slide_value = sum(.x$value)), + compute_slide_external(.window_size = 50, overlap = TRUE) %>% arrange(time_value) ) }) @@ -355,16 +336,16 @@ test_that("basic ungrouped epi_slide_mean computation produces expected output", expect_equal( toy_edf %>% filter(geo_value == "a") %>% - epi_slide_sum(value, before = 50 * days_dt, na.rm = TRUE) %>% + epi_slide_sum(value, .window_size = 50 * days_dt, na.rm = TRUE) %>% rename(slide_value = slide_value_value), - compute_slide_external_g(before = 50) %>% ungroup() %>% filter(geo_value == "a") %>% arrange(time_value) + compute_slide_external_g(.window_size = 50) %>% ungroup() %>% filter(geo_value == "a") %>% arrange(time_value) ) # Multiple geos # epi_slide_sum fails when input data groups contain duplicate time_values, # e.g. aggregating across geos expect_error( - toy_edf %>% epi_slide_sum(value, before = 6 * days_dt), + toy_edf %>% epi_slide_sum(value, .window_size = 6 * days_dt), class = "epiprocess__epi_slide_opt__duplicate_time_values" ) }) @@ -375,17 +356,17 @@ test_that("epi_slide can use sequential data masking expressions including NULL" edf_a <- tibble::tibble( geo_value = 1, time_value = 1:10, - value = 1:10 + value = 1:10 * 1.0 ) %>% as_epi_df(as_of = 12L) noisiness_a1 <- edf_a %>% group_by(geo_value) %>% epi_slide( - before = 1L, after = 2L, - valid = nrow(.x) == 4L, # not the best approach... - m = mean(.x$value[1:2]), - noisiness = sqrt(mean((value[3:4] - m)^2)), + .window_size = 5L, .align = "center", + valid = nrow(.x) == 5L, + m = .x$value[1], + noisiness = m + .x$value[5], m = NULL ) %>% ungroup() %>% @@ -394,10 +375,10 @@ test_that("epi_slide can use sequential data masking expressions including NULL" noisiness_a0 <- edf_a %>% filter( - time_value >= min(time_value) + 1L, + time_value >= min(time_value) + 2L, time_value <= max(time_value) - 2L ) %>% - mutate(noisiness = sqrt((3 - 1.5)^2 + (4 - 1.5)^2) / sqrt(2)) + mutate(noisiness = 2 * 3:8) expect_identical(noisiness_a1, noisiness_a0) @@ -411,8 +392,8 @@ test_that("epi_slide can use sequential data masking expressions including NULL" noisiness_b1 <- edf_b %>% group_by(geo_value) %>% epi_slide( - before = 1L, after = 2L, - valid = nrow(.x) == 4L, # not the best approach... + .window_size = 5L, .align = "center", + valid = nrow(.x) == 5L, model = list(lm(value ~ time_value, .x[1:2, ])), pred = list(predict(model[[1L]], newdata = .x[3:4, "time_value"])), model = NULL, @@ -425,7 +406,7 @@ test_that("epi_slide can use sequential data masking expressions including NULL" noisiness_b0 <- edf_b %>% filter( - time_value >= min(time_value) + 1L, + time_value >= min(time_value) + 2L, time_value <= max(time_value) - 2L ) %>% mutate(noisiness = sqrt((1 - 3)^2 + (2 - 4)^2) / sqrt(2)) @@ -435,19 +416,19 @@ test_that("epi_slide can use sequential data masking expressions including NULL" test_that("epi_slide complains on invalid computation outputs", { expect_error( - toy_edf %>% epi_slide(before = 6L, ~ lm(value ~ time_value, .x)), + toy_edf %>% epi_slide(.window_size = 6L, ~ lm(value ~ time_value, .x)), class = "epiprocess__invalid_slide_comp_value" ) expect_no_error( - toy_edf %>% epi_slide(before = 6L, ~ list(lm(value ~ time_value, .x))), + toy_edf %>% epi_slide(.window_size = 6L, ~ list(lm(value ~ time_value, .x))), class = "epiprocess__invalid_slide_comp_value" ) expect_error( - toy_edf %>% epi_slide(before = 6L, model = lm(value ~ time_value, .x)), + toy_edf %>% epi_slide(.window_size = 6L, model = lm(value ~ time_value, .x)), class = "epiprocess__invalid_slide_comp_tidyeval_output" ) expect_no_error( - toy_edf %>% epi_slide(before = 6L, model = list(lm(value ~ time_value, .x))), + toy_edf %>% epi_slide(.window_size = 6L, model = list(lm(value ~ time_value, .x))), class = "epiprocess__invalid_slide_comp_tidyeval_output" ) }) @@ -456,66 +437,71 @@ test_that("epi_slide can use {nm} :=", { nm <- "slide_value" expect_identical( # unfortunately, we can't pass this directly as `f` and need an extra comma - toy_edf_g %>% epi_slide(before = 6L, , !!nm := sum(value)), - compute_slide_external_g(before = 6) + toy_edf_g %>% epi_slide(.window_size = 6L, , !!nm := sum(value)), + compute_slide_external_g(.window_size = 6) ) }) test_that("epi_slide can produce packed outputs", { - packed_basic_result <- compute_slide_external_g(before = 6) %>% + packed_basic_result <- compute_slide_external_g(.window_size = 6) %>% tidyr::pack(container = c(slide_value)) %>% - dplyr_reconstruct(compute_slide_external_g(before = 6)) + dplyr_reconstruct(compute_slide_external_g(.window_size = 6)) expect_identical( - toy_edf_g %>% epi_slide(before = 6L, ~ tibble::tibble(slide_value = sum(.x$value)), new_col_name = "container"), + toy_edf_g %>% + epi_slide(.window_size = 6L, ~ tibble::tibble(slide_value = sum(.x$value)), .new_col_name = "container"), packed_basic_result ) expect_identical( - toy_edf_g %>% epi_slide(before = 6L, container = tibble::tibble(slide_value = sum(.x$value))), + toy_edf_g %>% + epi_slide(.window_size = 6L, container = tibble::tibble(slide_value = sum(.x$value))), packed_basic_result ) expect_identical( - toy_edf_g %>% epi_slide(before = 6L, , tibble::tibble(slide_value = sum(.x$value)), new_col_name = "container"), + toy_edf_g %>% + epi_slide(.window_size = 6L, , tibble::tibble(slide_value = sum(.x$value)), .new_col_name = "container"), packed_basic_result ) }) test_that("nested dataframe output names are controllable", { expect_equal( - toy_edf_g %>% epi_slide(before = 6 * days_dt, ~ data.frame(result = sum(.x$value))), - compute_slide_external_g(before = 6) %>% rename(result = slide_value) + toy_edf_g %>% epi_slide(.window_size = 6 * days_dt, ~ data.frame(result = sum(.x$value))), + compute_slide_external_g(.window_size = 6) %>% rename(result = slide_value) ) expect_equal( - toy_edf_g %>% epi_slide(before = 6 * days_dt, ~ data.frame(value_sum = sum(.x$value))), - compute_slide_external_g(before = 6) %>% rename(value_sum = slide_value) + toy_edf_g %>% epi_slide(.window_size = 6 * days_dt, ~ data.frame(value_sum = sum(.x$value))), + compute_slide_external_g(.window_size = 6) %>% rename(value_sum = slide_value) ) }) # TODO: This seems really strange and counter-intuitive. Deprecate?4 test_that("non-size-1 f outputs are no-op recycled", { expect_equal( - toy_edf %>% filter(time_value %in% overlap_index) %>% epi_slide(before = 6 * days_dt, ~ sum(.x$value) + c(0, 0, 0)), - compute_slide_external(before = 6, overlap = TRUE) %>% arrange(time_value) + toy_edf %>% + filter(time_value %in% overlap_index) %>% + epi_slide(.window_size = 6 * days_dt, ~ sum(.x$value) + c(0, 0, 0)), + compute_slide_external(.window_size = 6, overlap = TRUE) %>% arrange(time_value) ) expect_equal( toy_edf %>% filter(time_value %in% overlap_index) %>% - epi_slide(before = 6 * days_dt, ~ as.list(sum(.x$value) + c(0, 0, 0))), - compute_slide_external(before = 6, overlap = TRUE) %>% + epi_slide(.window_size = 6 * days_dt, ~ as.list(sum(.x$value) + c(0, 0, 0))), + compute_slide_external(.window_size = 6, overlap = TRUE) %>% dplyr::mutate(slide_value = as.list(slide_value)) %>% arrange(time_value) ) expect_equal( toy_edf %>% filter(time_value %in% overlap_index) %>% - epi_slide(before = 6 * days_dt, ~ data.frame(slide_value = sum(.x$value) + c(0, 0, 0))), - compute_slide_external(before = 6, overlap = TRUE) %>% arrange(time_value) + epi_slide(.window_size = 6 * days_dt, ~ data.frame(slide_value = sum(.x$value) + c(0, 0, 0))), + compute_slide_external(.window_size = 6, overlap = TRUE) %>% arrange(time_value) ) # size-1 list is recycled: expect_equal( toy_edf %>% filter(time_value %in% overlap_index) %>% - epi_slide(before = 6 * days_dt, ~ list(tibble(value = sum(.x$value) + c(0, 0, 0)))), - compute_slide_external(before = 6, overlap = TRUE) %>% + epi_slide(.window_size = 6 * days_dt, ~ list(tibble(value = sum(.x$value) + c(0, 0, 0)))), + compute_slide_external(.window_size = 6, overlap = TRUE) %>% group_by(time_value) %>% mutate(slide_value = rep(list(tibble(value = slide_value)), 3L)) %>% ungroup() %>% @@ -526,69 +512,68 @@ test_that("non-size-1 f outputs are no-op recycled", { test_that("`epi_slide` doesn't lose Date class output", { expect_true( toy_edf %>% - epi_slide(before = 5 * days_dt, ~ as.Date("2020-01-01")) %>% + epi_slide(.window_size = 5 * days_dt, ~ as.Date("2020-01-01")) %>% `[[`("slide_value") %>% inherits("Date") ) }) -time_types <- c("days", "weeks", "yearmonths", "integers") -for (time_type in time_types) { - test_that(format_inline("epi_slide and epi_slide_mean: different before/after match for {time_type}"), { - set.seed(0) - n <- 16 - epi_data_no_missing <- rbind( - tibble(geo_value = "al", a = 1:n, b = rnorm(n)), - tibble(geo_value = "ca", a = n:1, b = rnorm(n) + 10), - tibble(geo_value = "fl", a = n:1, b = rnorm(n) * 2) - ) %>% - mutate( - time_value = rep( - switch(time_type, - days = as.Date("2022-01-01") + 1:n, - weeks = as.Date("2022-01-01") + 7L * 1:n, - yearmonths = tsibble::yearmonth(10L + 1:n), - integers = 2000L + 1:n, - ), 3 +for (time_type in c("days", "weeks", "yearmonths", "integers")) { + for (align in c("right", "center", "left")) { + for (window_size in c(1, 6)) { + test_that(format_inline( + "epi_slide and epi_slide_mean: equivalent for + .window_size={window_size}, time_type={time_type}, and .align={align}" + ), { + set.seed(0) + n <- 16 + epi_data_no_missing <- rbind( + tibble(geo_value = "al", a = 1:n, b = rnorm(n)), + tibble(geo_value = "ca", a = n:1, b = rnorm(n) + 10), + tibble(geo_value = "fl", a = n:1, b = rnorm(n) * 2) + ) %>% + mutate( + time_value = rep( + switch(time_type, + days = as.Date("2022-01-01") + 1:n, + weeks = as.Date("2022-01-01") + 7L * 1:n, + yearmonths = tsibble::yearmonth(10L + 1:n), + integers = 2000L + 1:n, + ), 3 + ) + ) %>% + as_epi_df() %>% + group_by(geo_value) + # Remove rows 12, 13, and 14 from every group + epi_data_missing <- epi_data_no_missing %>% slice(1:11, 15:16) + units <- switch(time_type, + days = days_dt, + weeks = weeks_dt, + yearmonths = 1, + integers = 1 ) - ) %>% - as_epi_df() %>% - group_by(geo_value) - # Remove rows 12, 13, and 14 from every group - epi_data_missing <- epi_data_no_missing %>% slice(1:11, 15:16) - - test_time_type_mean <- function(epi_data, before = NULL, after = NULL, ...) { - result1 <- epi_slide(epi_data, ~ data.frame( - slide_value_a = mean(.x$a, rm.na = TRUE), - slide_value_b = mean(.x$b, rm.na = TRUE) - ), - before = before, after = after, ... - ) - result2 <- epi_slide_mean(epi_data, col_names = c(a, b), na.rm = TRUE, before = before, after = after, ...) - expect_equal(result1, result2) - } - - units <- switch(time_type, - days = days_dt, - weeks = weeks_dt, - yearmonths = 1, - integers = 1 - ) + window_size <- window_size * units + + test_time_type_mean <- function(epi_data, ...) { + result1 <- epi_slide(epi_data, ~ data.frame( + slide_value_a = mean(.x$a, rm.na = TRUE), + slide_value_b = mean(.x$b, rm.na = TRUE) + ), + .window_size = window_size, .align = align, ... + ) + result2 <- epi_slide_mean( + epi_data, + .window_size = window_size, .align = align, + .col_names = c(a, b), na.rm = TRUE, ... + ) + expect_equal(result1, result2) + } - test_time_type_mean(epi_data_missing, before = 6 * units) - test_time_type_mean(epi_data_missing, before = 6 * units, after = 1 * units) - test_time_type_mean(epi_data_missing, before = 6 * units, after = 6 * units) - test_time_type_mean(epi_data_missing, before = 1 * units, after = 6 * units) - test_time_type_mean(epi_data_missing, after = 6 * units) - test_time_type_mean(epi_data_missing, after = 1 * units) - - test_time_type_mean(epi_data_no_missing, before = 6 * units) - test_time_type_mean(epi_data_no_missing, before = 6 * units, after = 1 * units) - test_time_type_mean(epi_data_no_missing, before = 6 * units, after = 6 * units) - test_time_type_mean(epi_data_no_missing, before = 1 * units, after = 6 * units) - test_time_type_mean(epi_data_no_missing, after = 6 * units) - test_time_type_mean(epi_data_no_missing, after = 1 * units) - }) + test_time_type_mean(epi_data_missing) + test_time_type_mean(epi_data_no_missing) + }) + } + } } test_that("helper `full_date_seq` returns expected date values", { @@ -724,20 +709,20 @@ test_that("helper `full_date_seq` returns expected date values", { test_that("epi_slide_mean/sum produces same output as epi_slide_opt grouped", { expect_equal( - epi_slide_mean(toy_edf_g, value, before = 50 * days_dt, na.rm = TRUE), - epi_slide_opt(toy_edf_g, value, f = data.table::frollmean, before = 50 * days_dt, na.rm = TRUE) + epi_slide_mean(toy_edf_g, value, .window_size = 50 * days_dt, na.rm = TRUE), + epi_slide_opt(toy_edf_g, value, .f = data.table::frollmean, .window_size = 50 * days_dt, na.rm = TRUE) ) expect_equal( - epi_slide_mean(toy_edf_g, value, before = 50 * days_dt, na.rm = TRUE), - epi_slide_opt(toy_edf_g, value, f = slider::slide_mean, before = 50 * days_dt, na_rm = TRUE) + epi_slide_mean(toy_edf_g, value, .window_size = 50 * days_dt, na.rm = TRUE), + epi_slide_opt(toy_edf_g, value, .f = slider::slide_mean, .window_size = 50 * days_dt, na_rm = TRUE) ) expect_equal( - epi_slide_sum(toy_edf_g, value, before = 50 * days_dt, na.rm = TRUE), - epi_slide_opt(toy_edf_g, value, f = data.table::frollsum, before = 50 * days_dt, na.rm = TRUE) + epi_slide_sum(toy_edf_g, value, .window_size = 50 * days_dt, na.rm = TRUE), + epi_slide_opt(toy_edf_g, value, .f = data.table::frollsum, .window_size = 50 * days_dt, na.rm = TRUE) ) expect_equal( - epi_slide_sum(toy_edf_g, value, before = 50 * days_dt, na.rm = TRUE), - epi_slide_opt(toy_edf_g, value, f = slider::slide_sum, before = 50 * days_dt, na_rm = TRUE) + epi_slide_sum(toy_edf_g, value, .window_size = 50 * days_dt, na.rm = TRUE), + epi_slide_opt(toy_edf_g, value, .f = slider::slide_sum, .window_size = 50 * days_dt, na_rm = TRUE) ) }) @@ -746,15 +731,15 @@ test_that("`epi_slide_opt` errors when passed non-`data.table`, non-`slider` fun expect_no_error( epi_slide_opt( toy_edf_g, - col_names = value, f = reexport_frollmean, - before = days_dt, ref_time_values = test_date + 1 + .col_names = value, .f = reexport_frollmean, + .window_size = days_dt, .ref_time_values = test_date + 1 ) ) expect_error( epi_slide_opt( toy_edf_g, - col_names = value, f = mean, - before = days_dt, ref_time_values = test_date + 1 + .col_names = value, .f = mean, + .window_size = days_dt, .ref_time_values = test_date + 1 ), class = "epiprocess__epi_slide_opt__unsupported_slide_function" ) @@ -769,23 +754,23 @@ multi_columns <- dplyr::bind_rows( test_that("no dplyr warnings from selecting multiple columns", { expect_no_warning( - multi_slid <- epi_slide_mean(multi_columns, col_names = c("value", "value2"), before = 3L) + multi_slid <- epi_slide_mean(multi_columns, .col_names = c("value", "value2"), .window_size = 3L) ) expect_equal( names(multi_slid), c("geo_value", "time_value", "value", "value2", "slide_value_value", "slide_value_value2") ) expect_no_warning( - multi_slid_select <- epi_slide_mean(multi_columns, c(value, value2), before = 3L) + multi_slid_select <- epi_slide_mean(multi_columns, c(value, value2), .window_size = 3L) ) expect_equal(multi_slid_select, multi_slid) expect_no_warning( - multi_slid_select <- epi_slide_mean(multi_columns, starts_with("value"), before = 3L) + multi_slid_select <- epi_slide_mean(multi_columns, starts_with("value"), .window_size = 3L) ) expect_equal(multi_slid_select, multi_slid) }) -test_that("Inf works in before/after in slide and slide_opt", { +test_that("Inf works in .window_size in slide and slide_opt", { # Daily data df <- dplyr::bind_rows( dplyr::tibble(geo_value = "ak", time_value = test_date + 1:200, value = 1:200), @@ -796,13 +781,13 @@ test_that("Inf works in before/after in slide and slide_opt", { df %>% group_by(geo_value) %>% epi_slide( - before = Inf, + .window_size = Inf, slide_value = sum(value) ), df %>% group_by(geo_value) %>% epi_slide( - before = 365000, + .window_size = 365000, slide_value = sum(value) ) ) @@ -810,14 +795,14 @@ test_that("Inf works in before/after in slide and slide_opt", { df %>% group_by(geo_value) %>% epi_slide_opt( - before = Inf, - f = data.table::frollsum, - col_names = value + .window_size = Inf, + .f = data.table::frollsum, + .col_names = value ), df %>% group_by(geo_value) %>% epi_slide( - before = 365000, + .window_size = 365000, slide_value_value = sum(value) ) ) @@ -825,14 +810,14 @@ test_that("Inf works in before/after in slide and slide_opt", { df %>% group_by(geo_value) %>% epi_slide_opt( - before = Inf, - f = slider::slide_sum, - col_names = value + .window_size = Inf, + .f = slider::slide_sum, + .col_names = value ), df %>% group_by(geo_value) %>% epi_slide( - before = 365000, + .window_size = 365000, slide_value_value = sum(value) ) ) @@ -848,13 +833,13 @@ test_that("Inf works in before/after in slide and slide_opt", { df %>% group_by(geo_value) %>% epi_slide( - before = Inf, + .window_size = Inf, slide_value = sum(value) ), df %>% group_by(geo_value) %>% epi_slide( - before = 365000 * weeks_dt, + .window_size = 365000 * weeks_dt, slide_value = sum(value) ) ) @@ -862,14 +847,14 @@ test_that("Inf works in before/after in slide and slide_opt", { df %>% group_by(geo_value) %>% epi_slide_opt( - col_names = value, - f = data.table::frollsum, - before = Inf + .col_names = value, + .f = data.table::frollsum, + .window_size = Inf ), df %>% group_by(geo_value) %>% epi_slide( - before = 365000 * weeks_dt, + .window_size = 365000 * weeks_dt, slide_value_value = sum(value) ) ) @@ -877,14 +862,14 @@ test_that("Inf works in before/after in slide and slide_opt", { df %>% group_by(geo_value) %>% epi_slide_opt( - before = Inf, - f = slider::slide_sum, - col_names = value + .window_size = Inf, + .f = slider::slide_sum, + .col_names = value ), df %>% group_by(geo_value) %>% epi_slide( - before = 365000 * weeks_dt, + .window_size = 365000 * weeks_dt, slide_value_value = sum(value) ) ) diff --git a/vignettes/advanced.Rmd b/vignettes/advanced.Rmd index 3eaafb8d..93dfb361 100644 --- a/vignettes/advanced.Rmd +++ b/vignettes/advanced.Rmd @@ -46,22 +46,16 @@ These differences in basic behavior make some common slide operations require le When using more advanced features, more complex rules apply: -* Generalization: `epi_slide(edf, ....., ref_time_values=my_ref_time_values)` +* Generalization: `epi_slide(edf, ....., .ref_time_values=my_ref_time_values)` will output one row for every row in `edf` with `time_value` appearing inside `my_ref_time_values`, and is analogous to a `dplyr::mutate`&`dplyr::arrange` - followed by `dplyr::filter` to those `ref_time_values`. We call this property + followed by `dplyr::filter` to those `.ref_time_values`. We call this property **size stability**, and describe how it is achieved in the following sections. The default behavior described above is a special case of this general rule - based on a default value of `ref_time_values`. -* Exception/feature: `epi_slide(edf, ....., ref_time_values=my_ref_time_values, - all_rows=TRUE)` will not just output rows for `my_ref_time_values`, but + based on a default value of `.ref_time_values`. +* Exception/feature: `epi_slide(edf, ....., .ref_time_values=my_ref_time_values, + .all_rows=TRUE)` will not just output rows for `my_ref_time_values`, but instead will output one row per row in `edf`. -* Exception/feature: `epi_slide(edf, ....., as_list_col=TRUE)` will format the - output to add a single list-class computed column. -* Exception/feature: `epix_slide(ea, ....., as_list_col=TRUE)` will format the - output to have one row per computation and a single list-class computed column - (in addition to the grouping variables and `time_value`), as if we had used - `tidyr::chop()` or `tidyr::nest()`. * Clarification: `ea %>% group_by(....., .drop=FALSE) %>% epix_slide(, .....)` will call the computation on any missing groups according to `dplyr`'s `.drop=FALSE` rules, resulting in additional @@ -94,18 +88,18 @@ edf <- tibble( # 2-day trailing average, per geo value edf %>% group_by(geo_value) %>% - epi_slide(x_2dav = mean(x), before = 1) %>% + epi_slide(x_2dav = mean(x), .window_size = 2) %>% ungroup() # 2-day trailing average, marginally edf %>% - epi_slide(x_2dav = mean(x), before = 1) + epi_slide(x_2dav = mean(x), .window_size = 2) ``` ```{r, include = FALSE} # More checks (not included) edf %>% - epi_slide(x_2dav = mean(x), before = 1, ref_time_values = as.Date("2020-06-02")) + epi_slide(x_2dav = mean(x), .window_size = 2, .ref_time_values = as.Date("2020-06-02")) edf %>% # pretend that observations about time_value t are reported in version t (nowcasts) @@ -131,7 +125,7 @@ same result as the last one. ```{r} edf %>% - epi_slide(y_2dav = rep(mean(x), 3), before = 1) + epi_slide(y_2dav = rep(mean(x), 3), .window_size = 2) ``` However, if the output is an atomic vector (rather than a single value) and it @@ -140,7 +134,7 @@ are trying to return 2 things for 3 states. ```{r, error = TRUE} edf %>% - epi_slide(x_2dav = rep(mean(x), 2), before = 1) + epi_slide(x_2dav = rep(mean(x), 2), .window_size = 2) ``` ## Multi-column outputs @@ -148,16 +142,14 @@ edf %>% Now we move on to outputs that are data frames with a single row but multiple columns. Working with this type of output structure has in fact has already been demonstrated in the [slide -vignette](https://cmu-delphi.github.io/epiprocess/articles/slide.html). When -we set `as_list_col = TRUE` in the call to `epi_slide()`, the resulting `epi_df` -object returned by `epi_slide()` has a list column containing the slide values. +vignette](https://cmu-delphi.github.io/epiprocess/articles/slide.html). ```{r} edf2 <- edf %>% group_by(geo_value) %>% epi_slide( - a = data.frame(x_2dav = mean(x), x_2dma = mad(x)), - before = 1, as_list_col = TRUE + a = list(data.frame(x_2dav = mean(x), x_2dma = mad(x))), + .window_size = 2 ) %>% ungroup() @@ -166,66 +158,32 @@ length(edf2$a) edf2$a[[2]] ``` -When we use `as_list_col = FALSE` (the default in `epi_slide()`), the function -unnests (in the sense of `tidyr::unnest()`) the list column `a`, so that the -resulting `epi_df` has multiple new columns containing the slide values. The -default is to name these unnested columns by prefixing the name assigned to the -list column (here `a`) onto the column names of the output data frame from the -slide computation (here `x_2dav` and `x_2dma`) separated by "_". - -```{r} -edf %>% - group_by(geo_value) %>% - epi_slide( - a = data.frame(x_2dav = mean(x), x_2dma = mad(x)), - before = 1, as_list_col = FALSE - ) %>% - ungroup() -``` - -We can use `names_sep = NULL` (which gets passed to `tidyr::unnest()`) to drop -the prefix associated with list column name, in naming the unnested columns. +If you do not wrap the data.frame in a list above, the resulting `epi_df` has +multiple new columns containing the slide values. The default is to name these +unnested columns by prefixing the name assigned to the list column (here `a`) +onto the column names of the output data frame from the slide computation (here +`x_2dav` and `x_2dma`) separated by "_". ```{r} edf %>% group_by(geo_value) %>% epi_slide( a = data.frame(x_2dav = mean(x), x_2dma = mad(x)), - before = 1, as_list_col = FALSE, names_sep = NULL + .window_size = 2 ) %>% ungroup() ``` Furthermore, `epi_slide()` will recycle the single row data frame as needed in -order to make the result size stable, just like the case for atomic values. +order to make the result size stable, just like the case for atomic values (note +that we are not grouping here by geo_value). ```{r} edf %>% epi_slide( a = data.frame(x_2dav = mean(x), x_2dma = mad(x)), - before = 1, as_list_col = FALSE, names_sep = NULL - ) -``` - -```{r, include = FALSE} -# More checks (not included) -edf %>% - epi_slide( - a = data.frame(x_2dav = mean(x), x_2dma = mad(x)), - ref_time_values = as.Date("2020-06-02"), - before = 1, as_list_col = FALSE, names_sep = NULL + .window_size = 2 ) - -edf %>% - mutate(version = time_value) %>% - as_epi_archive() %>% - group_by(geo_value) %>% - epix_slide( - a = data.frame(x_2dav = mean(x), x_2dma = mad(x)), - ref_time_values = as.Date("2020-06-02"), - before = 1, as_list_col = FALSE, names_sep = NULL - ) %>% - ungroup() ``` ## Multi-row outputs @@ -250,16 +208,14 @@ edf %>% epi_slide(function(d, group_key, ref_time_value) { obj <- lm(y ~ x, data = d) return( - as.data.frame( - predict(obj, - newdata = d %>% - group_by(geo_value) %>% - filter(time_value == max(time_value)), - interval = "prediction", level = 0.9 - ) - ) + predict( + obj, + newdata = d %>% group_by(geo_value) %>% filter(time_value == max(time_value)), + interval = "prediction", + level = 0.9 + ) %>% as.data.frame() %>% list() ) - }, before = 1, new_col_name = "fc", names_sep = NULL) + }, .window_size = 2) ``` The above example focused on simplicity to show how to work with multi-row @@ -471,7 +427,7 @@ k_week_ahead <- function(x, ahead = 7, as_of = TRUE) { fc = prob_arx(.data$percent_cli, .data$case_rate_7d_av, .data$geo_value, .data$time_value, args = prob_arx_args(ahead = ahead) ), - before = 119, ref_time_values = fc_time_values + before = 219, ref_time_values = fc_time_values ) %>% mutate( target_date = .data$time_value + ahead, as_of = TRUE, @@ -483,7 +439,7 @@ k_week_ahead <- function(x, ahead = 7, as_of = TRUE) { fc = prob_arx(.data$percent_cli, .data$case_rate_7d_av, .data$geo_value, .data$time_value, args = prob_arx_args(ahead = ahead) ), - before = 119, ref_time_values = fc_time_values + .window_size = 220, .ref_time_values = fc_time_values ) %>% mutate(target_date = .data$time_value + ahead, as_of = FALSE) } diff --git a/vignettes/aggregation.Rmd b/vignettes/aggregation.Rmd index ec5f36af..585b5b0a 100644 --- a/vignettes/aggregation.Rmd +++ b/vignettes/aggregation.Rmd @@ -182,7 +182,7 @@ Explicit imputation for missingness (zero-filling in our case) can be important for protecting against bugs in all sorts of downstream tasks. For example, even something as simple as a 7-day trailing average is complicated by missingness. The function `epi_slide()` looks for all rows within a window of 7 days anchored -on the right at the reference time point (when `before = 6`). +on the right at the reference time point (when `.window_size = 7`). But when some days in a given week are missing because they were censored because they had small case counts, taking an average of the observed case counts can be misleading and is unintentionally biased upwards. Meanwhile, @@ -194,7 +194,7 @@ running `epi_slide()` on the zero-filled data brings these trailing averages xt %>% as_epi_df(as_of = as.Date("2024-03-20")) %>% group_by(geo_value) %>% - epi_slide(cases_7dav = mean(cases), before = 6) %>% + epi_slide(cases_7dav = mean(cases), .window_size = 7) %>% ungroup() %>% filter( geo_value == "Plymouth, MA", @@ -205,7 +205,7 @@ xt %>% xt_filled %>% as_epi_df(as_of = as.Date("2024-03-20")) %>% group_by(geo_value) %>% - epi_slide(cases_7dav = mean(cases), before = 6) %>% + epi_slide(cases_7dav = mean(cases), .window_size = 7) %>% ungroup() %>% filter( geo_value == "Plymouth, MA", diff --git a/vignettes/archive.Rmd b/vignettes/archive.Rmd index ff37f7cc..3d616d92 100644 --- a/vignettes/archive.Rmd +++ b/vignettes/archive.Rmd @@ -128,7 +128,7 @@ object is instantiated, if they are not explicitly specified in the function call (as it did in the case above). ## Summarizing Revision Behavior -There are many ways to examine the ways that signals change across different revisions. +There are many ways to examine the ways that signals change across different revisions. The simplest that is included directly in epiprocess is `revision_summary()`, which computes simple summary statistics for each key (by default, `(geo_value,time_value)` pairs), such as the lag to the first value (latency). In addition to the per key summary, it also returns an overall summary: ```{r} revision_details <- revision_summary(x, print_inform = TRUE) @@ -402,8 +402,8 @@ k_week_ahead <- function(x, ahead = 7, as_of = TRUE) { x_latest %>% group_by(.data$geo_value) %>% epi_slide( - fc = prob_arx(.data$percent_cli, .data$case_rate_7d_av, ahead = ahead), before = 119, - ref_time_values = fc_time_values + fc = prob_arx(.data$percent_cli, .data$case_rate_7d_av, ahead = ahead), .window_size = 120, + .ref_time_values = fc_time_values ) %>% mutate(target_date = .data$time_value + ahead, as_of = FALSE) %>% ungroup() diff --git a/vignettes/slide.Rmd b/vignettes/slide.Rmd index 8264a963..892b6cca 100644 --- a/vignettes/slide.Rmd +++ b/vignettes/slide.Rmd @@ -20,8 +20,7 @@ understands addition and subtraction. For example, if the time values are coded as `Date` objects, then one time step is one day, since `as.Date("2022-01-01") + 1` equals `as.Date("2022-01-02")`. Alternatively, the time step can be specified manually in the call to `epi_slide()`; you can read the documentation for more -details. Furthermore, the alignment of the running window used in `epi_slide()` -is specified by `before` and `after`. +details. As in getting started guide, we'll fetch daily reported COVID-19 cases from CA, FL, NY, and TX (note: here we're using new, not cumulative cases) using the @@ -70,7 +69,7 @@ first call `group_by()`. ```{r} x %>% group_by(geo_value) %>% - epi_slide_mean("cases", before = 6) %>% + epi_slide_mean("cases", .window_size = 7) %>% ungroup() %>% head(10) ``` @@ -91,7 +90,7 @@ first call `group_by()`. ```{r} x %>% group_by(geo_value) %>% - epi_slide(~ mean(.x$cases), before = 6) %>% + epi_slide(~ mean(.x$cases), .window_size = 7) %>% ungroup() %>% head(10) ``` @@ -101,12 +100,12 @@ original `epi_df` object (and must refer to them with the prefix `.x$`). As we can see, the function `epi_slide()` returns an `epi_df` object with a new column appended that contains the results (from sliding), named `slide_value` as the default. We can of course change this post hoc, or we can instead specify a new -name up front using the `new_col_name` argument: +name up front using the `.new_col_name` argument: ```{r} x <- x %>% group_by(geo_value) %>% - epi_slide(~ mean(.x$cases), before = 6, new_col_name = "cases_7dav") %>% + epi_slide(~ mean(.x$cases), .window_size = 7, .new_col_name = "cases_7dav") %>% ungroup() head(x, 10) @@ -127,20 +126,20 @@ instead of `.ref_time_value`. We can also pass a function for the first argument in `epi_slide()`. In this case, the passed function must accept the following arguments: -In this case, the passed function `f` must accept the following arguments: a +In this case, the passed function `.f` must accept the following arguments: a data frame with the same column names as the original object, minus any grouping -variables, containing the time window data for one group-`ref_time_value` +variables, containing the time window data for one group-`.ref_time_value` combination; followed by a one-row tibble containing the values of the grouping -variables for the associated group; followed by the associated `ref_time_value`. +variables for the associated group; followed by the associated `.ref_time_value`. It can accept additional arguments; `epi_slide()` will forward any `...` args it -receives to `f`. +receives to `.f`. Recreating the last example of a 7-day trailing average: ```{r} x <- x %>% group_by(geo_value) %>% - epi_slide(function(x, gk, rtv) mean(x$cases), before = 6, new_col_name = "cases_7dav") %>% + epi_slide(function(x, gk, rtv) mean(x$cases), .window_size = 7, .new_col_name = "cases_7dav") %>% ungroup() head(x, 10) @@ -151,13 +150,13 @@ head(x, 10) Perhaps the most convenient way to setup a computation in `epi_slide()` is to pass in an expression for tidy evaluation. In this case, we can simply define the name of the new column directly as part of the expression, setting it equal -to a computation in which we can access any columns of `x` by name, just as we +to a computation in which we can access any columns of `.x` by name, just as we would in a call to `dplyr::mutate()`, or any of the `dplyr` verbs. For example: ```{r} x <- x %>% group_by(geo_value) %>% - epi_slide(cases_7dav = mean(cases), before = 6) %>% + epi_slide(cases_7dav = mean(cases), .window_size = 7) %>% ungroup() head(x, 10) @@ -254,14 +253,14 @@ fc_time_values <- seq(as.Date("2020-06-01"), x %>% group_by(geo_value) %>% epi_slide( - fc = prob_ar(cases_7dav), before = 119, - ref_time_values = fc_time_values + fc = prob_ar(cases_7dav), .window_size = 120, + .ref_time_values = fc_time_values ) %>% ungroup() %>% head(10) ``` -Note that here we have utilized an argument `ref_time_values` to perform the +Note that here we have utilized an argument `.ref_time_values` to perform the sliding computation (here, compute a forecast) at a specific subset of reference time values. We get out a ["packed"][tidyr::pack] data frame column `fc` containing `fc$point`, `fc$lower`, and `fc$upper` that correspond to the point @@ -282,8 +281,8 @@ k_week_ahead <- function(x, ahead = 7) { x %>% group_by(.data$geo_value) %>% epi_slide( - fc = prob_ar(.data$cases_7dav, ahead = ahead), before = 119, - ref_time_values = fc_time_values, all_rows = TRUE + fc = prob_ar(.data$cases_7dav, ahead = ahead), + .window_size = 120, .ref_time_values = fc_time_values, .all_rows = TRUE ) %>% ungroup() %>% mutate(target_date = .data$time_value + ahead)