From 9f33e1b1572e9354ca85527d71d19a80f535132f Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Thu, 1 Aug 2024 15:17:18 -0700 Subject: [PATCH 01/11] tests(assert_sufficient_f_args): test vs. mean, sum, slice; use expect_no* --- tests/testthat/test-utils.R | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/tests/testthat/test-utils.R b/tests/testthat/test-utils.R index d18f9f48..b16c8ebe 100644 --- a/tests/testthat/test-utils.R +++ b/tests/testthat/test-utils.R @@ -77,10 +77,10 @@ test_that("assert_sufficient_f_args alerts if the provided f doesn't take enough f_xgt_dots <- function(x, g, t, ...) dplyr::tibble(value = mean(x$binary), count = length(x$binary)) # If `regexp` is NA, asserts that there should be no errors/messages. - expect_error(assert_sufficient_f_args(f_xgt), regexp = NA) - expect_warning(assert_sufficient_f_args(f_xgt), regexp = NA) - expect_error(assert_sufficient_f_args(f_xgt_dots), regexp = NA) - expect_warning(assert_sufficient_f_args(f_xgt_dots), regexp = NA) + expect_no_error(assert_sufficient_f_args(f_xgt)) + expect_no_warning(assert_sufficient_f_args(f_xgt)) + expect_no_error(assert_sufficient_f_args(f_xgt_dots)) + expect_no_warning(assert_sufficient_f_args(f_xgt_dots)) f_x_dots <- function(x, ...) dplyr::tibble(value = mean(x$binary), count = length(x$binary)) f_dots <- function(...) dplyr::tibble(value = c(5), count = c(2)) @@ -102,6 +102,21 @@ test_that("assert_sufficient_f_args alerts if the provided f doesn't take enough class = "epiprocess__assert_sufficient_f_args__f_needs_min_args" ) + # Make sure we generate the same sort of conditions on some external functions + # that have caused surprises in the past: + expect_warning(assert_sufficient_f_args(mean), + regexp = ", the group key and reference time value will be included", + class = "epiprocess__assert_sufficient_f_args__mandatory_f_args_passed_to_f_dots" + ) + expect_warning(assert_sufficient_f_args(sum), + regexp = ", the window data, group key, and reference time value will be included", + class = "epiprocess__assert_sufficient_f_args__mandatory_f_args_passed_to_f_dots" + ) + expect_warning(assert_sufficient_f_args(dplyr::slice), + regexp = ", the group key and reference time value will be included", + class = "epiprocess__assert_sufficient_f_args__mandatory_f_args_passed_to_f_dots" + ) + f_xs_dots <- function(x, setting = "a", ...) dplyr::tibble(value = mean(x$binary), count = length(x$binary)) f_xs <- function(x, setting = "a") dplyr::tibble(value = mean(x$binary), count = length(x$binary)) expect_warning(assert_sufficient_f_args(f_xs_dots, setting = "b"), From 14cd7363f35ecf258eaebb3322293f89f132b071 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Thu, 1 Aug 2024 18:46:17 -0700 Subject: [PATCH 02/11] BREAKING CHANGE(epix_slide): output `version` column, other re/dual-names In `epix_slide()`: - warn-deprecate `.ref_time_values =` in favor of `.versions =` - allow tidyeval or formula comps to use `.ref_time_value` or `.version` to access the ref_time_value/version (currently, these two things are always the same) - output a `version` column, not a `time_value` column - rename `epix_slide_ref_time_values_default` -> `epix_slide_versions_default` - some other cleanup from a rebase combining with dot-prefixing and other slide changes --- NAMESPACE | 2 + R/grouped_epi_archive.R | 38 ++++---- R/methods-epi_archive.R | 8 +- R/slide.R | 2 +- R/utils.R | 82 +++++++++++++---- man-roxygen/ref-time-value-label.R | 2 + man/epix_slide.Rd | 22 ++--- tests/testthat/test-epix_slide.R | 102 +++++++++++----------- tests/testthat/test-grouped_epi_archive.R | 8 +- tests/testthat/test-utils.R | 70 +++++++-------- 10 files changed, 197 insertions(+), 139 deletions(-) create mode 100644 man-roxygen/ref-time-value-label.R diff --git a/NAMESPACE b/NAMESPACE index fc6aaf74..fa4f76df 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -48,8 +48,10 @@ export("%>%") export(archive_cases_dv_subset) export(arrange) export(arrange_canonical) +export(as_diagonal_slide_computation) export(as_epi_archive) export(as_epi_df) +export(as_time_slide_computation) export(as_tsibble) export(autoplot) export(clone) diff --git a/R/grouped_epi_archive.R b/R/grouped_epi_archive.R index 9e9279fc..d97d7307 100644 --- a/R/grouped_epi_archive.R +++ b/R/grouped_epi_archive.R @@ -209,14 +209,16 @@ epix_slide.grouped_epi_archive <- function( .f, ..., .before = Inf, - .ref_time_values = NULL, + .versions = NULL, .new_col_name = NULL, .all_versions = FALSE) { - # Deprecated argument handling + + # Perform some deprecated argument checks without using ` = + # deprecated()` in the function signature, because they are from + # early development versions and much more likely to be clutter than + # informative in the signature. provided_args <- rlang::call_args_names(rlang::call_match()) - if (any(purrr::map_lgl( - provided_args, ~ .x %in% c("x", "f", "before", "ref_time_values", "new_col_name", "all_versions") - ))) { + if (any(provided_args %in% c("x", "f", "before", "ref_time_values", "new_col_name", "all_versions", "group_by"))) { cli::cli_abort( "epix_slide: you are using one of the following old argument names: `x`, `f`, `before`, `ref_time_values`, `new_col_name`, `all_versions`. Please use the new names: `.x`, `.f`, `.before`, `.ref_time_values`, @@ -255,19 +257,21 @@ epix_slide.grouped_epi_archive <- function( } # Argument validation - if (is.null(.ref_time_values)) { - ref_time_values <- epix_slide_ref_time_values_default(.x$private$ungrouped) + if (is.null(.versions)) { + .versions <- epix_slide_versions_default(.x$private$ungrouped) } else { - assert_numeric(.ref_time_values, min.len = 1L, null.ok = FALSE, any.missing = FALSE) - if (any(.ref_time_values > .x$private$ungrouped$versions_end)) { - cli_abort("Some `ref_time_values` are greater than the latest version in the archive.") + assert_numeric(.versions, min.len = 1L, null.ok = FALSE, any.missing = FALSE) + if (any(.versions > .x$private$ungrouped$versions_end)) { + cli_abort("All `.versions` must be less than or equal to the latest version in the archive.") } - if (anyDuplicated(.ref_time_values) != 0L) { - cli_abort("Some `ref_time_values` are duplicated.") + if (anyDuplicated(.versions) != 0L) { + cli_abort("All `.versions` must be unique.") } # Sort, for consistency with `epi_slide`, although the current # implementation doesn't take advantage of it. - ref_time_values <- sort(.ref_time_values) + .versions <- sort(.versions) + ref_time_values <- sort(ref_time_values) + .versions <- sort(.versions) } validate_slide_window_arg(.before, .x$private$ungrouped$time_type) @@ -287,14 +291,14 @@ epix_slide.grouped_epi_archive <- function( cli_abort("If `f` is missing then a computation must be specified via `...`.") } - f <- as_slide_computation(quosures) + f <- as_diagonal_slide_computation(quosures) # Magic value that passes zero args as dots in calls below. Equivalent to # `... <- missing_arg()`, but use `assign` to avoid warning about # improper use of dots. assign("...", missing_arg()) } else { used_data_masking <- FALSE - f <- as_slide_computation(.f, ...) + f <- as_diagonal_slide_computation(.f, ...) } # Computation for one group, one time value @@ -326,7 +330,7 @@ epix_slide.grouped_epi_archive <- function( # redundant work. `group_modify()` provides the group key, we provide the # ref time value (appropriately recycled) and comp_value (appropriately # named / unpacked, for quick feedback) - res <- list(time_value = vctrs::vec_rep(ref_time_value, vctrs::vec_size(comp_value))) + res <- list(version = vctrs::vec_rep(ref_time_value, vctrs::vec_size(comp_value))) if (is.null(new_col_name)) { if (inherits(comp_value, "data.frame")) { @@ -350,7 +354,7 @@ epix_slide.grouped_epi_archive <- function( return(validate_tibble(new_tibble(res))) } - out <- lapply(ref_time_values, function(ref_time_value) { + out <- lapply(versions, function(ref_time_value) { # Ungrouped as-of data; `epi_df` if `all_versions` is `FALSE`, # `epi_archive` if `all_versions` is `TRUE`: as_of_raw <- .x$private$ungrouped %>% epix_as_of( diff --git a/R/methods-epi_archive.R b/R/methods-epi_archive.R index c89ed61c..169e9270 100644 --- a/R/methods-epi_archive.R +++ b/R/methods-epi_archive.R @@ -802,7 +802,7 @@ epix_slide <- function( .f, ..., .before = Inf, - .ref_time_values = NULL, + .versions = NULL, .new_col_name = NULL, .all_versions = FALSE) { UseMethod("epix_slide") @@ -816,7 +816,7 @@ epix_slide.epi_archive <- function( .f, ..., .before = Inf, - .ref_time_values = NULL, + .versions = NULL, .new_col_name = NULL, .all_versions = FALSE) { # For an "ungrouped" slide, treat all rows as belonging to one big @@ -826,7 +826,7 @@ epix_slide.epi_archive <- function( group_by(.x), .f, ..., - .before = .before, .ref_time_values = .ref_time_values, + .before = .before, .versions = .versions, .new_col_name = .new_col_name, .all_versions = .all_versions ) %>% # We want a slide on ungrouped archives to output something @@ -841,7 +841,7 @@ epix_slide.epi_archive <- function( #' Default value for `ref_time_values` in an `epix_slide` #' #' @noRd -epix_slide_ref_time_values_default <- function(ea) { +epix_slide_versions_default <- function(ea) { versions_with_updates <- c(ea$DT$version, ea$versions_end) ref_time_values <- tidyr::full_seq(versions_with_updates, guess_period(versions_with_updates)) return(ref_time_values) diff --git a/R/slide.R b/R/slide.R index 02e45329..a9f3a86c 100644 --- a/R/slide.R +++ b/R/slide.R @@ -193,7 +193,7 @@ epi_slide <- function( used_data_masking <- FALSE } - f <- as_slide_computation(.f, ...) + f <- as_time_slide_computation(.f, ...) # Create a wrapper that calculates and passes `.ref_time_value` to the # computation. `i` is contained in the `f_wrapper_factory` environment such diff --git a/R/utils.R b/R/utils.R index 9c47594d..c585abec 100644 --- a/R/utils.R +++ b/R/utils.R @@ -89,21 +89,21 @@ paste_lines <- function(lines) { paste(paste0(lines, "\n"), collapse = "") } - #' Assert that a sliding computation function takes enough args #' #' @param f Function; specifies a computation to slide over an `epi_df` or -#' `epi_archive` in `epi_slide` or `epix_slide`. +#' `epi_archive` in `epi_slide` or `epix_slide`. #' @param ... Dots that will be forwarded to `f` from the dots of `epi_slide` or #' `epix_slide`. +#' @template ref-time-value-label #' #' @importFrom rlang is_missing #' @importFrom purrr map_lgl #' @importFrom utils tail #' #' @noRd -assert_sufficient_f_args <- function(f, ...) { - mandatory_f_args_labels <- c("window data", "group key", "reference time value") +assert_sufficient_f_args <- function(f, ..., .ref_time_value_label) { + mandatory_f_args_labels <- c("window data", "group key", .ref_time_value_label) n_mandatory_f_args <- length(mandatory_f_args_labels) args <- formals(args(f)) args_names <- names(args) @@ -265,21 +265,42 @@ assert_sufficient_f_args <- function(f, ...) { #' @param ... Additional arguments to pass to the function or formula #' specified via `x`. If `x` is a quosure, any arguments passed via `...` #' will be ignored. +#' +#' @param .ref_time_value_long_varnames `r lifecycle::badge("experimental")` +#' Character vector. What variable names should we allow formulas and +#' data-masking tidy evaluation to use to refer to `ref_time_value` for the +#' computation (in addition to `.z` in formulas)? E.g., `".ref_time_value"` or +#' `c(".ref_time_value", ".version")`. +#' +#' @template ref-time-value-label +#' #' @examples -#' f <- as_slide_computation(~ .x + 1) -#' f(10) +#' f1 <- as_slide_computation(~ .z - .x$time_value, +#' .ref_time_value_long_varnames = character(0L), +#' .ref_time_value_label = "third argument" +#' ) +#' f1(tibble::tibble(time_value = 10), tibble::tibble(), 12) +#' +#' f2 <- as_time_slide_computation(~ .ref_time_value - .x$time_value) +#' f2(tibble::tibble(time_value = 10), tibble::tibble(), 12) #' -#' g <- as_slide_computation(~ -1 * .) +#' f3 <- as_diagonal_slide_computation(~ .version - .x$time_value) +#' f3(tibble::tibble(time_value = 10), tibble::tibble(), 12) +#' +#' f4 <- as_diagonal_slide_computation(~ .ref_time_value - .x$time_value) +#' f4(tibble::tibble(time_value = 10), tibble::tibble(), 12) +#' +#' g <- as_time_slide_computation(~ -1 * .) #' g(4) #' -#' h <- as_slide_computation(~ .x - .group_key) +#' h <- as_time_slide_computation(~ .x - .group_key) #' h(6, 3) #' #' @importFrom rlang is_function new_function f_env is_environment missing_arg #' f_rhs is_formula caller_arg caller_env #' #' @noRd -as_slide_computation <- function(f, ...) { +as_slide_computation <- function(f, ..., .ref_time_value_long_varnames, .ref_time_value_label) { arg <- caller_arg(f) call <- caller_env() @@ -301,7 +322,9 @@ as_slide_computation <- function(f, ...) { # through the quosures. data_mask$.x <- .x data_mask$.group_key <- .group_key - data_mask$.ref_time_value <- .ref_time_value + for (ref_time_value_long_varname in .ref_time_value_long_varnames) { + data_mask[[ref_time_value_long_varname]] <- .ref_time_value + } common_size <- NULL # The data mask is an environment; it doesn't track the binding order. # We'll track that separately. For efficiency, we'll use `c` to add to @@ -373,7 +396,7 @@ as_slide_computation <- function(f, ...) { if (is_function(f)) { # Check that `f` takes enough args - assert_sufficient_f_args(f, ...) + assert_sufficient_f_args(f, ..., .ref_time_value_label = .ref_time_value_label) return(f) } @@ -410,13 +433,19 @@ as_slide_computation <- function(f, ...) { ) } - args <- list( - ... = missing_arg(), - .x = quote(..1), .y = quote(..2), .z = quote(..3), - . = quote(..1), .group_key = quote(..2), .ref_time_value = quote(..3) + args <- c( + list( + ... = missing_arg(), + .x = quote(..1), .y = quote(..2), .z = quote(..3), + . = quote(..1), .group_key = quote(..2) + ), + `names<-`( + rep(list(quote(..3)), length(.ref_time_value_long_varnames)), + .ref_time_value_long_varnames + ) ) fn <- new_function(args, f_rhs(f), env) - fn <- structure(fn, class = c("epiprocess_slide_computation", "function")) + fn <- structure(fn, class = c("epiprocess_formula_slide_computation", "function")) return(fn) } @@ -432,6 +461,27 @@ as_slide_computation <- function(f, ...) { ) } +#' @rdname as_slide_computation +#' @export +#' @noRd +as_time_slide_computation <- function(f, ...) { + as_slide_computation( + f, ..., + .ref_time_value_long_varnames = ".ref_time_value", + .ref_time_value_label = "reference time value" + ) +} + +#' @rdname as_slide_computation +#' @export +#' @noRd +as_diagonal_slide_computation <- function(f, ...) { + as_slide_computation( + f, ..., + .ref_time_value_long_varnames = c(".version", ".ref_time_value"), + .ref_time_value_label = "version" + ) +} guess_geo_type <- function(geo_value) { if (is.character(geo_value)) { diff --git a/man-roxygen/ref-time-value-label.R b/man-roxygen/ref-time-value-label.R new file mode 100644 index 00000000..c81615b9 --- /dev/null +++ b/man-roxygen/ref-time-value-label.R @@ -0,0 +1,2 @@ +#' @param .ref_time_value_label String; how to describe/label the `ref_time_value` in +#' error messages; e.g., "reference time value" or "version". diff --git a/man/epix_slide.Rd b/man/epix_slide.Rd index d2f0c68f..75a99994 100644 --- a/man/epix_slide.Rd +++ b/man/epix_slide.Rd @@ -11,7 +11,7 @@ epix_slide( .f, ..., .before = Inf, - .ref_time_values = NULL, + .versions = NULL, .new_col_name = NULL, .all_versions = FALSE ) @@ -21,7 +21,7 @@ epix_slide( .f, ..., .before = Inf, - .ref_time_values = NULL, + .versions = NULL, .new_col_name = NULL, .all_versions = FALSE ) @@ -31,7 +31,7 @@ epix_slide( .f, ..., .before = Inf, - .ref_time_values = NULL, + .versions = NULL, .new_col_name = NULL, .all_versions = FALSE ) @@ -74,14 +74,6 @@ in the archive is "day", and the \code{.ref_time_value} is January 8, then the smallest time_value in the snapshot will be January 1. If missing, then the default is no limit on the time values, so the full snapshot is given.} -\item{.ref_time_values}{Reference time values / versions for sliding -computations; each element of this vector serves both as the anchor point -for the \code{time_value} window for the computation and the \code{max_version} -\code{epix_as_of} which we fetch data in this window. If missing, then this will -set to a regularly-spaced sequence of values set to cover the range of -\code{version}s in the \code{DT} plus the \code{versions_end}; the spacing of values will -be guessed (using the GCD of the skips between values).} - \item{.new_col_name}{String indicating the name of the new column that will contain the derivative values. The default is "slide_value" unless your slide computations output data frames, in which case they will be unpacked @@ -94,6 +86,14 @@ TRUE, then \code{.f} will be passed the version history (all \code{.ref_time_value - before} and \code{.ref_time_value}. Otherwise, \code{.f} will be passed only the most recent \code{version} for every unique \code{time_value}. Default is \code{FALSE}.} + +\item{.ref_time_values}{Reference time values / versions for sliding +computations; each element of this vector serves both as the anchor point +for the \code{time_value} window for the computation and the \code{max_version} +\code{epix_as_of} which we fetch data in this window. If missing, then this will +set to a regularly-spaced sequence of values set to cover the range of +\code{version}s in the \code{DT} plus the \code{versions_end}; the spacing of values will +be guessed (using the GCD of the skips between values).} } \value{ A tibble whose columns are: the grouping variables, \code{time_value}, diff --git a/tests/testthat/test-epix_slide.R b/tests/testthat/test-epix_slide.R index 87edfdb5..179d9427 100644 --- a/tests/testthat/test-epix_slide.R +++ b/tests/testthat/test-epix_slide.R @@ -13,7 +13,7 @@ x <- tibble::tribble( test_date + 6, test_date + c(1:2, 4:5), 2^(7:10), test_date + 7, test_date + 2:6, 2^(11:15) ) %>% - tidyr::unnest(c(time_value, binary)) + tidyr::unchop(c(time_value, binary)) xx <- bind_cols(geo_value = rep("ak", 15), x) %>% as_epi_archive() @@ -29,7 +29,7 @@ test_that("epix_slide works as intended", { xx2 <- tibble( geo_value = rep("ak", 4), - time_value = test_date + c(4, 5, 6, 7), + version = test_date + c(4, 5, 6, 7), sum_binary = c( 2^3 + 2^2, 2^6 + 2^3, @@ -80,7 +80,7 @@ test_that("epix_slide works as intended with list cols", { ) xx_dfrow2 <- tibble( geo_value = rep("ak", 4), - time_value = test_date + c(4, 5, 6, 7), + version = test_date + c(4, 5, 6, 7), slide_value = c( 2^3 + 2^2, @@ -108,7 +108,7 @@ test_that("epix_slide works as intended with list cols", { ) xx_df2 <- tibble( geo_value = rep("ak", 4), - time_value = test_date + c(4, 5, 6, 7), + version = test_date + c(4, 5, 6, 7), slide_value = list( c(2^3, 2^2), @@ -128,7 +128,7 @@ test_that("epix_slide works as intended with list cols", { ) xx_scalar2 <- tibble( geo_value = rep("ak", 4), - time_value = test_date + c(4, 5, 6, 7), + version = test_date + c(4, 5, 6, 7), slide_value = list( 2^3 + 2^2, @@ -148,7 +148,7 @@ test_that("epix_slide works as intended with list cols", { ) xx_vec2 <- tibble( geo_value = rep("ak", 4), - time_value = test_date + c(4, 5, 6, 7), + version = test_date + c(4, 5, 6, 7), slide_value = list( c(2^3, 2^2), @@ -182,7 +182,7 @@ test_that("epix_slide `.before` validation works", { test_that("quosure passing issue in epix_slide is resolved + other potential issues", { # (First part adapted from @examples) - time_values <- seq(as.Date("2020-06-01"), + versions <- seq(as.Date("2020-06-01"), as.Date("2020-06-02"), by = "1 day" ) @@ -200,7 +200,7 @@ test_that("quosure passing issue in epix_slide is resolved + other potential iss epix_slide( .f = ~ mean(.x$case_rate_7d_av), .before = 2, - .ref_time_values = time_values, + .versions = versions, .new_col_name = "case_rate_3d_av" ) reference_by_neither <- ea %>% @@ -208,7 +208,7 @@ test_that("quosure passing issue in epix_slide is resolved + other potential iss epix_slide( .f = ~ mean(.x$case_rate_7d_av), .before = 2, - .ref_time_values = time_values, + .versions = versions, .new_col_name = "case_rate_3d_av" ) # test the passing-something-that-must-be-enquosed behavior: @@ -220,7 +220,7 @@ test_that("quosure passing issue in epix_slide is resolved + other potential iss epix_slide( .f = ~ mean(.x$case_rate_7d_av), .before = 2, - .ref_time_values = time_values, + .versions = versions, .new_col_name = "case_rate_3d_av" ), reference_by_modulus @@ -231,7 +231,7 @@ test_that("quosure passing issue in epix_slide is resolved + other potential iss .x = ea %>% group_by(.data$modulus), .f = ~ mean(.x$case_rate_7d_av), .before = 2, - .ref_time_values = time_values, + .versions = versions, .new_col_name = "case_rate_3d_av" ), reference_by_modulus @@ -242,7 +242,7 @@ test_that("quosure passing issue in epix_slide is resolved + other potential iss epix_slide( .f = ~ mean(.x$case_rate_7d_av), .before = 2, - .ref_time_values = time_values, + .versions = versions, .new_col_name = "case_rate_3d_av" ), reference_by_modulus @@ -253,7 +253,7 @@ test_that("quosure passing issue in epix_slide is resolved + other potential iss .x = ea %>% group_by(dplyr::across(all_of("modulus"))), .f = ~ mean(.x$case_rate_7d_av), .before = 2, - .ref_time_values = time_values, + .versions = versions, .new_col_name = "case_rate_3d_av" ), reference_by_modulus @@ -264,7 +264,7 @@ test_that("quosure passing issue in epix_slide is resolved + other potential iss epix_slide( .f = ~ mean(.x$case_rate_7d_av), .before = 2, - .ref_time_values = time_values, + .versions = versions, .new_col_name = "case_rate_3d_av" ), reference_by_modulus @@ -276,7 +276,7 @@ test_that("quosure passing issue in epix_slide is resolved + other potential iss .x = ea %>% group_by(dplyr::across(tidyselect::all_of(my_group_by))), .f = ~ mean(.x$case_rate_7d_av), .before = 2, - .ref_time_values = time_values, + .versions = versions, .new_col_name = "case_rate_3d_av" ), reference_by_modulus @@ -287,7 +287,7 @@ test_that("quosure passing issue in epix_slide is resolved + other potential iss epix_slide( .f = ~ mean(.x$case_rate_7d_av), .before = 2, - .ref_time_values = time_values, + .versions = versions, .new_col_name = "case_rate_3d_av" ), reference_by_modulus @@ -298,7 +298,7 @@ test_that("quosure passing issue in epix_slide is resolved + other potential iss .x = ea, .f = ~ mean(.x$case_rate_7d_av), .before = 2, - .ref_time_values = time_values, + .versions = versions, .new_col_name = "case_rate_3d_av" ), reference_by_neither @@ -307,7 +307,7 @@ test_that("quosure passing issue in epix_slide is resolved + other potential iss ea %>% epix_slide( .f = ~ mean(.x$case_rate_7d_av), .before = 2, - .ref_time_values = time_values, + .versions = versions, .new_col_name = "case_rate_3d_av" ), reference_by_neither @@ -323,7 +323,7 @@ ea <- tibble::tribble( test_date + 6, test_date + 1:5, 2^(5:1), test_date + 7, test_date + 1:6, 2^(6:1) ) %>% - tidyr::unnest(c(time_value, binary)) %>% + tidyr::unchop(c(time_value, binary)) %>% mutate(geo_value = "ak") %>% as_epi_archive() @@ -350,7 +350,7 @@ test_that("epix_slide with .all_versions option has access to all older versions expect_true(inherits(result1, "tbl_df")) result2 <- tibble::tribble( - ~time_value, ~n_versions, ~n_row, ~dt_class1, ~dt_key, + ~version, ~n_versions, ~n_row, ~dt_class1, ~dt_key, test_date + 2, 1L, sum(1:1), "data.table", key(ea$DT), test_date + 3, 2L, sum(1:2), "data.table", key(ea$DT), test_date + 4, 3L, sum(1:3), "data.table", key(ea$DT), @@ -388,7 +388,7 @@ test_that("epix_slide with .all_versions option has access to all older versions epix_slide( # unfortunately, we can't pass this directly as `f` and need an extra comma , - slide_fn(.x, .group_key, .ref_time_value), + slide_fn(.x, .group_key, .version), .before = 10^3, .all_versions = TRUE ) @@ -404,14 +404,14 @@ test_that("epix_as_of and epix_slide with long enough window are compatible", { diff_mean = mean(diff(x$binary)) ) } - ref_time_value1 <- test_date + version1 <- test_date expect_identical( - ea %>% epix_as_of(ref_time_value1) %>% f1() %>% mutate(time_value = ref_time_value1, .before = 1L), + ea %>% epix_as_of(version1) %>% f1() %>% mutate(version = version1, .before = 1L), ea %>% epix_slide( f1, .before = 1000, - .ref_time_values = ref_time_value1 + .versions = version1 ) ) @@ -420,11 +420,11 @@ test_that("epix_as_of and epix_slide with long enough window are compatible", { x %>% # extract time&version-lag-1 data: epix_slide( - function(subx, subgk, rtv) { + function(subx, subgk, version) { tibble(data = list( subx %>% - filter(time_value == attr(subx, "metadata")$as_of - 1) %>% - rename(real_time_value = time_value, lag1 = binary) + filter(time_value == version - 1) %>% + rename(lag1 = binary) )) }, .before = 1 @@ -437,17 +437,17 @@ test_that("epix_as_of and epix_slide with long enough window are compatible", { ) %>% summarize(mean_abs_delta = mean(abs(binary - lag1))) } - ref_time_value2 <- test_date + 5 + version2 <- test_date + 5 expect_identical( ea %>% - epix_as_of(ref_time_value2, all_versions = TRUE) %>% + epix_as_of(version2, all_versions = TRUE) %>% f2() %>% - mutate(time_value = ref_time_value2, .before = 1L), + mutate(version = version2, .before = 1L), ea %>% epix_slide( f2, .before = 1000, - .ref_time_values = ref_time_value2, + .versions = version2, .all_versions = TRUE ) ) @@ -466,14 +466,14 @@ test_that("epix_as_of and epix_slide with long enough window are compatible", { epix_slide( f2, .before = 1000, - .ref_time_values = ref_time_value2, + .versions = version2, .all_versions = TRUE ) %>% filter(geo_value == "ak"), ea %>% # using `ea` here is like filtering `ea_multigeo` to `geo_value=="x"` - epix_as_of(ref_time_value2, all_versions = TRUE) %>% + epix_as_of(version2, all_versions = TRUE) %>% f2() %>% - transmute(geo_value = "ak", time_value = ref_time_value2, mean_abs_delta) %>% + transmute(geo_value = "ak", version = version2, mean_abs_delta) %>% group_by(geo_value) ) }) @@ -489,7 +489,7 @@ test_that("epix_slide `f` is passed an ungrouped `epi_archive` when `.all_versio epix_slide( .f = slide_fn, .before = 1, - .ref_time_values = test_date + 5, + .versions = test_date + 5, .new_col_name = "out", .all_versions = TRUE ) @@ -507,7 +507,7 @@ test_that("epix_slide with .all_versions option works as intended", { xx2 <- tibble( geo_value = rep("ak", 4), - time_value = test_date + c(4, 5, 6, 7), + version = test_date + c(4, 5, 6, 7), sum_binary = c( 2^3 + 2^2, 2^6 + 2^3, @@ -564,7 +564,7 @@ test_that("epix_slide works with 0-row computation outputs", { ea %>% epix_slide_empty(), tibble::tibble( - time_value = ea$DT$version[integer(0)] + version = ea$DT$version[integer(0)] ) ) expect_identical( @@ -573,7 +573,7 @@ test_that("epix_slide works with 0-row computation outputs", { epix_slide_empty(), tibble::tibble( geo_value = ea$DT$geo_value[integer(0)], - time_value = ea$DT$version[integer(0)] + version = ea$DT$version[integer(0)] ) %>% group_by(geo_value) ) @@ -583,7 +583,7 @@ test_that("epix_slide works with 0-row computation outputs", { ea %>% epix_slide_empty(.all_versions = TRUE), tibble::tibble( - time_value = ea$DT$version[integer(0)] + version = ea$DT$version[integer(0)] ) ) expect_identical( @@ -592,7 +592,7 @@ test_that("epix_slide works with 0-row computation outputs", { epix_slide_empty(.all_versions = TRUE), tibble::tibble( geo_value = ea$DT$geo_value[integer(0)], - time_value = ea$DT$version[integer(0)] + version = ea$DT$version[integer(0)] ) %>% group_by(geo_value) ) @@ -610,10 +610,10 @@ test_that("epix_slide alerts if the provided f doesn't take enough args", { ) }) -test_that("epix_slide computation via formula can use ref_time_value", { +test_that("epix_slide computation via formula can use version", { xx_ref <- tibble( geo_value = rep("ak", 4), - time_value = test_date + c(4, 5, 6, 7), + version = test_date + c(4, 5, 6, 7), slide_value = test_date + c(4, 5, 6, 7) ) %>% group_by(geo_value) @@ -621,7 +621,7 @@ test_that("epix_slide computation via formula can use ref_time_value", { xx1 <- xx %>% group_by(.data$geo_value) %>% epix_slide( - .f = ~.ref_time_value, + .f = ~.version, .before = 2 ) @@ -646,10 +646,10 @@ test_that("epix_slide computation via formula can use ref_time_value", { expect_identical(xx3, xx_ref) }) -test_that("epix_slide computation via function can use ref_time_value", { +test_that("epix_slide computation via function can use version", { xx_ref <- tibble( geo_value = rep("ak", 4), - time_value = test_date + c(4, 5, 6, 7), + version = test_date + c(4, 5, 6, 7), slide_value = test_date + c(4, 5, 6, 7) ) %>% group_by(geo_value) @@ -664,11 +664,11 @@ test_that("epix_slide computation via function can use ref_time_value", { expect_identical(xx1, xx_ref) }) -test_that("epix_slide computation via dots can use ref_time_value and group", { - # ref_time_value +test_that("epix_slide computation via dots can use version and group", { + # version xx_ref <- tibble( geo_value = rep("ak", 4), - time_value = test_date + c(4, 5, 6, 7), + version = test_date + c(4, 5, 6, 7), slide_value = test_date + c(4, 5, 6, 7) ) %>% group_by(geo_value) @@ -677,7 +677,7 @@ test_that("epix_slide computation via dots can use ref_time_value and group", { group_by(.data$geo_value) %>% epix_slide( .before = 2, - slide_value = .ref_time_value + slide_value = .version ) expect_identical(xx1, xx_ref) @@ -685,7 +685,7 @@ test_that("epix_slide computation via dots can use ref_time_value and group", { # group_key xx_ref <- tibble( geo_value = rep("ak", 4), - time_value = test_date + c(4, 5, 6, 7), + version = test_date + c(4, 5, 6, 7), slide_value = "ak" ) %>% group_by(geo_value) @@ -752,7 +752,7 @@ test_that("`epix_slide` doesn't decay date output", { test_that("`epix_slide` can access objects inside of helper functions", { helper <- function(archive_haystack, time_value_needle) { - archive_haystack %>% epix_slide(has_needle = time_value_needle %in% time_value, .before = Inf) + archive_haystack %>% epix_slide(has_needle = time_value_needle %in% time_value) } expect_no_error(helper(archive_cases_dv_subset, as.Date("2021-01-01"))) expect_no_error(helper(xx, 3L)) diff --git a/tests/testthat/test-grouped_epi_archive.R b/tests/testthat/test-grouped_epi_archive.R index 388ed614..1e953d6f 100644 --- a/tests/testthat/test-grouped_epi_archive.R +++ b/tests/testthat/test-grouped_epi_archive.R @@ -52,7 +52,7 @@ test_that("Grouping, regrouping, and ungrouping archives works as intended", { grouped_factor_then_nonfactor %>% epix_slide(.before = 10, s = sum(value)), tibble::tribble( - ~age_group, ~geo_value, ~time_value, ~s, + ~age_group, ~geo_value, ~version, ~s, "pediatric", NA_character_, "2000-01-02", 0, "adult", "us", "2000-01-02", 121, "pediatric", "us", "2000-01-03", 5, @@ -60,7 +60,7 @@ test_that("Grouping, regrouping, and ungrouping archives works as intended", { ) %>% mutate( age_group = ordered(age_group, c("pediatric", "adult")), - time_value = as.Date(time_value) + version = as.Date(version) ) %>% group_by(age_group, geo_value, .drop = FALSE) ) @@ -69,7 +69,7 @@ test_that("Grouping, regrouping, and ungrouping archives works as intended", { group_by(geo_value, age_group, .drop = FALSE) %>% epix_slide(.before = 10, s = sum(value)), tibble::tribble( - ~geo_value, ~age_group, ~time_value, ~s, + ~geo_value, ~age_group, ~version, ~s, "us", "pediatric", "2000-01-02", 0, "us", "adult", "2000-01-02", 121, "us", "pediatric", "2000-01-03", 5, @@ -77,7 +77,7 @@ test_that("Grouping, regrouping, and ungrouping archives works as intended", { ) %>% mutate( age_group = ordered(age_group, c("pediatric", "adult")), - time_value = as.Date(time_value) + version = as.Date(version) ) %>% # as_epi_df(as_of = as.Date("2000-01-03"), # other_keys = "age_group") %>% diff --git a/tests/testthat/test-utils.R b/tests/testthat/test-utils.R index b16c8ebe..b84c1e4a 100644 --- a/tests/testthat/test-utils.R +++ b/tests/testthat/test-utils.R @@ -77,56 +77,56 @@ test_that("assert_sufficient_f_args alerts if the provided f doesn't take enough f_xgt_dots <- function(x, g, t, ...) dplyr::tibble(value = mean(x$binary), count = length(x$binary)) # If `regexp` is NA, asserts that there should be no errors/messages. - expect_no_error(assert_sufficient_f_args(f_xgt)) - expect_no_warning(assert_sufficient_f_args(f_xgt)) - expect_no_error(assert_sufficient_f_args(f_xgt_dots)) - expect_no_warning(assert_sufficient_f_args(f_xgt_dots)) + expect_no_error(assert_sufficient_f_args(f_xgt, .ref_time_value_label = "reference time value")) + expect_no_warning(assert_sufficient_f_args(f_xgt, .ref_time_value_label = "reference time value")) + expect_no_error(assert_sufficient_f_args(f_xgt_dots, .ref_time_value_label = "reference time value")) + expect_no_warning(assert_sufficient_f_args(f_xgt_dots, .ref_time_value_label = "reference time value")) f_x_dots <- function(x, ...) dplyr::tibble(value = mean(x$binary), count = length(x$binary)) f_dots <- function(...) dplyr::tibble(value = c(5), count = c(2)) f_x <- function(x) dplyr::tibble(value = mean(x$binary), count = length(x$binary)) f <- function() dplyr::tibble(value = c(5), count = c(2)) - expect_warning(assert_sufficient_f_args(f_x_dots), + expect_warning(assert_sufficient_f_args(f_x_dots, .ref_time_value_label = "reference time value"), regexp = ", the group key and reference time value will be included", class = "epiprocess__assert_sufficient_f_args__mandatory_f_args_passed_to_f_dots" ) - expect_warning(assert_sufficient_f_args(f_dots), + expect_warning(assert_sufficient_f_args(f_dots, .ref_time_value_label = "reference time value"), regexp = ", the window data, group key, and reference time value will be included", class = "epiprocess__assert_sufficient_f_args__mandatory_f_args_passed_to_f_dots" ) - expect_error(assert_sufficient_f_args(f_x), + expect_error(assert_sufficient_f_args(f_x, .ref_time_value_label = "reference time value"), class = "epiprocess__assert_sufficient_f_args__f_needs_min_args" ) - expect_error(assert_sufficient_f_args(f), + expect_error(assert_sufficient_f_args(f, .ref_time_value_label = "reference time value"), class = "epiprocess__assert_sufficient_f_args__f_needs_min_args" ) # Make sure we generate the same sort of conditions on some external functions # that have caused surprises in the past: - expect_warning(assert_sufficient_f_args(mean), + expect_warning(assert_sufficient_f_args(mean, .ref_time_value_label = "reference time value"), regexp = ", the group key and reference time value will be included", class = "epiprocess__assert_sufficient_f_args__mandatory_f_args_passed_to_f_dots" ) - expect_warning(assert_sufficient_f_args(sum), + expect_warning(assert_sufficient_f_args(sum, .ref_time_value_label = "reference time value"), regexp = ", the window data, group key, and reference time value will be included", class = "epiprocess__assert_sufficient_f_args__mandatory_f_args_passed_to_f_dots" ) - expect_warning(assert_sufficient_f_args(dplyr::slice), + expect_warning(assert_sufficient_f_args(dplyr::slice, .ref_time_value_label = "reference time value"), regexp = ", the group key and reference time value will be included", class = "epiprocess__assert_sufficient_f_args__mandatory_f_args_passed_to_f_dots" ) f_xs_dots <- function(x, setting = "a", ...) dplyr::tibble(value = mean(x$binary), count = length(x$binary)) f_xs <- function(x, setting = "a") dplyr::tibble(value = mean(x$binary), count = length(x$binary)) - expect_warning(assert_sufficient_f_args(f_xs_dots, setting = "b"), + expect_warning(assert_sufficient_f_args(f_xs_dots, setting = "b", .ref_time_value_label = "reference time value"), class = "epiprocess__assert_sufficient_f_args__mandatory_f_args_passed_to_f_dots" ) - expect_error(assert_sufficient_f_args(f_xs, setting = "b"), + expect_error(assert_sufficient_f_args(f_xs, setting = "b", .ref_time_value_label = "reference time value"), class = "epiprocess__assert_sufficient_f_args__f_needs_min_args_plus_forwarded" ) - expect_error(assert_sufficient_f_args(f_xgt, "b"), + expect_error(assert_sufficient_f_args(f_xgt, "b", .ref_time_value_label = "reference time value"), class = "epiprocess__assert_sufficient_f_args__f_needs_min_args_plus_forwarded" ) }) @@ -136,15 +136,15 @@ test_that("assert_sufficient_f_args alerts if the provided f has defaults for th f_xgt_dots <- function(x = 1, g, t, ...) dplyr::tibble(value = mean(x$binary), count = length(x$binary)) f_x_dots <- function(x = 1, ...) dplyr::tibble(value = mean(x$binary), count = length(x$binary)) - expect_error(assert_sufficient_f_args(f_xgt), + expect_error(assert_sufficient_f_args(f_xgt, .ref_time_value_label = "reference time value"), regexp = "pass the group key to `f`'s g argument,", class = "epiprocess__assert_sufficient_f_args__required_args_contain_defaults" ) - expect_error(assert_sufficient_f_args(f_xgt_dots), + expect_error(assert_sufficient_f_args(f_xgt_dots, .ref_time_value_label = "reference time value"), regexp = "pass the window data to `f`'s x argument,", class = "epiprocess__assert_sufficient_f_args__required_args_contain_defaults" ) - expect_error(suppressWarnings(assert_sufficient_f_args(f_x_dots)), + expect_error(suppressWarnings(assert_sufficient_f_args(f_x_dots, .ref_time_value_label = "reference time value")), class = "epiprocess__assert_sufficient_f_args__required_args_contain_defaults" ) @@ -153,23 +153,23 @@ test_that("assert_sufficient_f_args alerts if the provided f has defaults for th f_xs_dots <- function(x = 1, setting = "a", ...) dplyr::tibble(value = mean(x$binary), count = length(x$binary)) # forwarding named dots should prevent some complaints: - expect_no_error(assert_sufficient_f_args(f_xsgt, setting = "b")) - expect_no_error(assert_sufficient_f_args(f_xsgt_dots, setting = "b")) - expect_error(suppressWarnings(assert_sufficient_f_args(f_xs_dots, setting = "b")), + expect_no_error(assert_sufficient_f_args(f_xsgt, setting = "b", .ref_time_value_label = "reference time value")) + expect_no_error(assert_sufficient_f_args(f_xsgt_dots, setting = "b", .ref_time_value_label = "reference time value")) + expect_error(suppressWarnings(assert_sufficient_f_args(f_xs_dots, setting = "b", .ref_time_value_label = "reference time value")), regexp = "pass the window data to `f`'s x argument", class = "epiprocess__assert_sufficient_f_args__required_args_contain_defaults" ) # forwarding unnamed dots should not: - expect_error(assert_sufficient_f_args(f_xsgt, "b"), + expect_error(assert_sufficient_f_args(f_xsgt, "b", .ref_time_value_label = "reference time value"), class = "epiprocess__assert_sufficient_f_args__required_args_contain_defaults" ) - expect_error(assert_sufficient_f_args(f_xsgt_dots, "b"), + expect_error(assert_sufficient_f_args(f_xsgt_dots, "b", .ref_time_value_label = "reference time value"), class = "epiprocess__assert_sufficient_f_args__required_args_contain_defaults" ) expect_error( expect_warning( - assert_sufficient_f_args(f_xs_dots, "b"), + assert_sufficient_f_args(f_xs_dots, "b", .ref_time_value_label = "reference time value"), class = "epiprocess__assert_sufficient_f_args__mandatory_f_args_passed_to_f_dots" ), class = "epiprocess__assert_sufficient_f_args__required_args_contain_defaults" @@ -178,7 +178,7 @@ test_that("assert_sufficient_f_args alerts if the provided f has defaults for th # forwarding no dots should produce a different error message in some cases: expect_error( expect_warning( - assert_sufficient_f_args(f_xs_dots), + assert_sufficient_f_args(f_xs_dots, .ref_time_value_label = "reference time value"), class = "epiprocess__assert_sufficient_f_args__mandatory_f_args_passed_to_f_dots" ), regexp = "window data and group key to `f`'s x and setting argument", @@ -188,43 +188,43 @@ test_that("assert_sufficient_f_args alerts if the provided f has defaults for th test_that("computation formula-derived functions take all argument types", { # positional - expect_identical(as_slide_computation(~ ..2 + ..3)(1, 2, 3), 5) - expect_identical(as_slide_computation(~..1)(1, 2, 3), 1) + expect_identical(as_time_slide_computation(~ ..2 + ..3)(1, 2, 3), 5) + expect_identical(as_time_slide_computation(~..1)(1, 2, 3), 1) # Matching rlang, purr, dplyr usage - expect_identical(as_slide_computation(~ .x + .z)(1, 2, 3), 4) - expect_identical(as_slide_computation(~ .x + .y)(1, 2, 3), 3) + expect_identical(as_time_slide_computation(~ .x + .z)(1, 2, 3), 4) + expect_identical(as_time_slide_computation(~ .x + .y)(1, 2, 3), 3) # named - expect_identical(as_slide_computation(~ . + .ref_time_value)(1, 2, 3), 4) - expect_identical(as_slide_computation(~.group_key)(1, 2, 3), 2) + expect_identical(as_time_slide_computation(~ . + .ref_time_value)(1, 2, 3), 4) + expect_identical(as_time_slide_computation(~.group_key)(1, 2, 3), 2) }) test_that("as_slide_computation passes functions unaltered", { f <- function(a, b, c) { a * b * c + 5 } - expect_identical(as_slide_computation(f), f) + expect_identical(as_time_slide_computation(f), f) }) test_that("as_slide_computation raises errors as expected", { # Formulas must be one-sided - expect_error(as_slide_computation(y ~ ..1), + expect_error(as_time_slide_computation(y ~ ..1), class = "epiprocess__as_slide_computation__formula_is_twosided" ) # Formulas can't be paired with ... - expect_error(as_slide_computation(~..1, method = "fn"), + expect_error(as_time_slide_computation(~..1, method = "fn"), class = "epiprocess__as_slide_computation__formula_with_dots" ) # `f_env` must be an environment formula_without_env <- stats::as.formula(~..1) rlang::f_env(formula_without_env) <- 5 - expect_error(as_slide_computation(formula_without_env), + expect_error(as_time_slide_computation(formula_without_env), class = "epiprocess__as_slide_computation__formula_has_no_env" ) # `f` must be a function, formula, or string - expect_error(as_slide_computation(5), + expect_error(as_time_slide_computation(5), class = "epiprocess__as_slide_computation__cant_convert_catchall" ) }) From 3110a7fe0b585b40848ff1cbf3a69d57b1a0a1df Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Mon, 5 Aug 2024 17:19:35 -0700 Subject: [PATCH 03/11] Fix vignette re. old clobberable version default + use versions_end - Default is now to not mark any versions as clobberable; simply remove discussion of old default as it was to explain a surprise/annoyance in normal use. - Favor using `$versions_end` to get the latest version; while in examples it's probably similar, in general, it's more "correct" and should be faster. --- vignettes/archive.Rmd | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/vignettes/archive.Rmd b/vignettes/archive.Rmd index d0deaf52..bfd67a46 100644 --- a/vignettes/archive.Rmd +++ b/vignettes/archive.Rmd @@ -172,16 +172,6 @@ date was June 1, 2021. From this we can infer that the doctor's visits signal was 2 days latent on June 1. Also, we can see that the metadata in the `epi_df` object has the version date recorded in the `as_of` field. -By default, using the maximum of the `version` column in the underlying data table in an -`epi_archive` object itself generates a snapshot of the latest values of signal -variables in the entire archive. The `epix_as_of()` function issues a warning in -this case, since updates to the current version may still come in at a later -point in time, due to various reasons, such as synchronization issues. - -```{r} -x_latest <- epix_as_of(x, max_version = max(x$DT$version)) -``` - Below, we pull several snapshots from the archive, spaced one month apart. We overlay the corresponding signal curves as colored lines, with the version dates marked by dotted vertical lines, and draw the latest curve in black (from the @@ -384,7 +374,7 @@ points in time and forecast horizons. The former comes from using `epi_slide()` to the latest snapshot of the data `x_latest`. ```{r, message = FALSE, warning = FALSE, fig.width = 9, fig.height = 6} -x_latest <- epix_as_of(x, max_version = max(x$DT$version)) +x_latest <- epix_as_of(x, x$versions_end) # Simple function to produce forecasts k weeks ahead k_week_ahead <- function(x, ahead = 7, as_of = TRUE) { From aa73944d5458a119c704bf8198c48f6a912351fc Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Mon, 5 Aug 2024 17:35:39 -0700 Subject: [PATCH 04/11] Rename max_version -> version in epix_as_of since this seems like more appropriate and consistent naming for the main use case of extracting an `epi_df` snapshot. --- R/methods-epi_archive.R | 45 ++++++++++++++--------- man/epix_as_of.Rd | 21 ++++++++--- tests/testthat/test-methods-epi_archive.R | 10 ++--- vignettes/archive.Rmd | 4 +- 4 files changed, 49 insertions(+), 31 deletions(-) diff --git a/R/methods-epi_archive.R b/R/methods-epi_archive.R index 169e9270..2fc9d58f 100644 --- a/R/methods-epi_archive.R +++ b/R/methods-epi_archive.R @@ -6,26 +6,28 @@ #' examples. #' #' @param x An `epi_archive` object -#' @param max_version Time value specifying the max version to permit in the +#' @param version Time value specifying the max version to permit in the #' snapshot. That is, the snapshot will comprise the unique rows of the #' current archive data that represent the most up-to-date signal values, as -#' of the specified `max_version` (and whose time values are at least +#' of the specified `version` (and whose time values are at least #' `min_time_value`.) #' @param min_time_value Time value specifying the min time value to permit in #' the snapshot. Default is `-Inf`, which effectively means that there is no #' minimum considered. #' @param all_versions If `all_versions = TRUE`, then the output will be in #' `epi_archive` format, and contain rows in the specified `time_value` range -#' having `version <= max_version`. The resulting object will cover a +#' having `version <= version`. The resulting object will cover a #' potentially narrower `version` and `time_value` range than `x`, depending #' on user-provided arguments. Otherwise, there will be one row in the output -#' for the `max_version` of each `time_value`. Default is `FALSE`. +#' for the `version` of each `time_value`. Default is `FALSE`. +#' @param max_version `r lifecycle::badge("deprecated")` please use `version` +#' argument instead. #' @return An `epi_df` object. #' #' @examples #' epix_as_of( #' archive_cases_dv_subset, -#' max_version = max(archive_cases_dv_subset$DT$version) +#' version = max(archive_cases_dv_subset$DT$version) #' ) #' #' range(archive_cases_dv_subset$DT$version) # 2020-06-02 -- 2021-12-01 @@ -58,31 +60,37 @@ #' #' @importFrom data.table between key #' @export -epix_as_of <- function(x, max_version, min_time_value = -Inf, all_versions = FALSE) { +epix_as_of <- function(x, version, min_time_value = -Inf, all_versions = FALSE, + max_version = deprecated()) { assert_class(x, "epi_archive") + if (lifecycle::is_present(max_version)) { + lifecycle::deprecate_warn("0.8.1", "epix_as_of(max_version =)", "epix_as_of(version =)") + version <- max_version + } + other_keys <- setdiff( key(x$DT), c("geo_value", "time_value", "version") ) - # Check a few things on max_version - if (!identical(class(max_version), class(x$DT$version))) { + # Check a few things on version + if (!identical(class(version), class(x$DT$version))) { cli_abort( - "`max_version` must have the same `class` vector as `epi_archive$DT$version`." + "`version` must have the same `class` vector as `epi_archive$DT$version`." ) } - if (!identical(typeof(max_version), typeof(x$DT$version))) { + if (!identical(typeof(version), typeof(x$DT$version))) { cli_abort( - "`max_version` must have the same `typeof` as `epi_archive$DT$version`." + "`version` must have the same `typeof` as `epi_archive$DT$version`." ) } - assert_scalar(max_version, na.ok = FALSE) - if (max_version > x$versions_end) { - cli_abort("`max_version` must be at most `epi_archive$versions_end`.") + assert_scalar(version, na.ok = FALSE) + if (version > x$versions_end) { + cli_abort("`version` must be at most `epi_archive$versions_end`.") } assert_logical(all_versions, len = 1) - if (!is.na(x$clobberable_versions_start) && max_version >= x$clobberable_versions_start) { + if (!is.na(x$clobberable_versions_start) && version >= x$clobberable_versions_start) { cli_warn( 'Getting data as of some recent version which could still be overwritten (under routine circumstances) without assigning a new @@ -96,13 +104,14 @@ epix_as_of <- function(x, max_version, min_time_value = -Inf, all_versions = FAL # Filter by version and return if (all_versions) { # epi_archive is copied into result, so we can modify result directly - result <- epix_truncate_versions_after(x, max_version) + result <- epix_truncate_versions_after(x, version) result$DT <- result$DT[time_value >= min_time_value, ] # nolint: object_usage_linter return(result) } # Make sure to use data.table ways of filtering and selecting - as_of_epi_df <- x$DT[time_value >= min_time_value & version <= max_version, ] %>% # nolint: object_usage_linter + .version <- version # workaround for `i` arg not supporting `..` feature + as_of_epi_df <- x$DT[time_value >= min_time_value & version <= .version, ] %>% # nolint: object_usage_linter unique( by = c("geo_value", "time_value", other_keys), fromLast = TRUE @@ -110,7 +119,7 @@ epix_as_of <- function(x, max_version, min_time_value = -Inf, all_versions = FAL tibble::as_tibble() %>% dplyr::select(-"version") %>% as_epi_df( - as_of = max_version, + as_of = version, other_keys = other_keys ) diff --git a/man/epix_as_of.Rd b/man/epix_as_of.Rd index 4ab23882..c3682489 100644 --- a/man/epix_as_of.Rd +++ b/man/epix_as_of.Rd @@ -4,15 +4,21 @@ \alias{epix_as_of} \title{Generate a snapshot from an \code{epi_archive} object} \usage{ -epix_as_of(x, max_version, min_time_value = -Inf, all_versions = FALSE) +epix_as_of( + x, + version, + min_time_value = -Inf, + all_versions = FALSE, + max_version = deprecated() +) } \arguments{ \item{x}{An \code{epi_archive} object} -\item{max_version}{Time value specifying the max version to permit in the +\item{version}{Time value specifying the max version to permit in the snapshot. That is, the snapshot will comprise the unique rows of the current archive data that represent the most up-to-date signal values, as -of the specified \code{max_version} (and whose time values are at least +of the specified \code{version} (and whose time values are at least \code{min_time_value}.)} \item{min_time_value}{Time value specifying the min time value to permit in @@ -21,10 +27,13 @@ minimum considered.} \item{all_versions}{If \code{all_versions = TRUE}, then the output will be in \code{epi_archive} format, and contain rows in the specified \code{time_value} range -having \code{version <= max_version}. The resulting object will cover a +having \code{version <= version}. The resulting object will cover a potentially narrower \code{version} and \code{time_value} range than \code{x}, depending on user-provided arguments. Otherwise, there will be one row in the output -for the \code{max_version} of each \code{time_value}. Default is \code{FALSE}.} +for the \code{version} of each \code{time_value}. Default is \code{FALSE}.} + +\item{max_version}{\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#deprecated}{\figure{lifecycle-deprecated.svg}{options: alt='[Deprecated]'}}}{\strong{[Deprecated]}} please use \code{version} +argument instead.} } \value{ An \code{epi_df} object. @@ -37,7 +46,7 @@ examples. \examples{ epix_as_of( archive_cases_dv_subset, - max_version = max(archive_cases_dv_subset$DT$version) + version = max(archive_cases_dv_subset$DT$version) ) range(archive_cases_dv_subset$DT$version) # 2020-06-02 -- 2021-12-01 diff --git a/tests/testthat/test-methods-epi_archive.R b/tests/testthat/test-methods-epi_archive.R index 6686400b..f035c8c5 100644 --- a/tests/testthat/test-methods-epi_archive.R +++ b/tests/testthat/test-methods-epi_archive.R @@ -25,13 +25,13 @@ test_that("Errors are thrown due to bad epix_as_of inputs", { test_that("Warning against max_version being clobberable", { # none by default - expect_warning(regexp = NA, ea %>% epix_as_of(max_version = max(ea$DT$version))) - expect_warning(regexp = NA, ea %>% epix_as_of(max_version = min(ea$DT$version))) + expect_warning(regexp = NA, ea %>% epix_as_of(max(ea$DT$version))) + expect_warning(regexp = NA, ea %>% epix_as_of(min(ea$DT$version))) # but with `clobberable_versions_start` non-`NA`, yes ea_with_clobberable <- ea ea_with_clobberable$clobberable_versions_start <- max(ea_with_clobberable$DT$version) - expect_warning(ea_with_clobberable %>% epix_as_of(max_version = max(ea$DT$version))) - expect_warning(regexp = NA, ea_with_clobberable %>% epix_as_of(max_version = min(ea$DT$version))) + expect_warning(ea_with_clobberable %>% epix_as_of(max(ea$DT$version))) + expect_warning(regexp = NA, ea_with_clobberable %>% epix_as_of(min(ea$DT$version))) }) test_that("epix_as_of properly grabs the data and doesn't mutate key", { @@ -43,7 +43,7 @@ test_that("epix_as_of properly grabs the data and doesn't mutate key", { old_key <- data.table::key(ea2$DT) edf_as_of <- ea2 %>% - epix_as_of(max_version = as.Date("2020-06-03")) + epix_as_of(as.Date("2020-06-03")) edf_expected <- as_epi_df(tibble( geo_value = "ca", diff --git a/vignettes/archive.Rmd b/vignettes/archive.Rmd index bfd67a46..1f5ee1e3 100644 --- a/vignettes/archive.Rmd +++ b/vignettes/archive.Rmd @@ -159,7 +159,7 @@ of the archive in `epi_df` format. This represents the most up-to-date values of the signal variables as of a given version. ```{r} -x_snapshot <- epix_as_of(x, max_version = as.Date("2021-06-01")) +x_snapshot <- epix_as_of(x, as.Date("2021-06-01")) class(x_snapshot) head(x_snapshot) max(x_snapshot$time_value) @@ -183,7 +183,7 @@ theme_set(theme_bw()) self_max <- max(x$DT$version) versions <- seq(as.Date("2020-06-01"), self_max - 1, by = "1 month") snapshots <- map_dfr(versions, function(v) { - epix_as_of(x, max_version = v) %>% mutate(version = v) + epix_as_of(x, v) %>% mutate(version = v) }) %>% bind_rows( x_latest %>% mutate(version = self_max) From 547d156228e2f093119fde131a0b6ebccc122e24 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Tue, 20 Aug 2024 16:05:35 -0700 Subject: [PATCH 05/11] fix(epix_slide): partial time_value -> version output col rename --- R/grouped_epi_archive.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/grouped_epi_archive.R b/R/grouped_epi_archive.R index d97d7307..2c7bbea9 100644 --- a/R/grouped_epi_archive.R +++ b/R/grouped_epi_archive.R @@ -278,7 +278,7 @@ epix_slide.grouped_epi_archive <- function( checkmate::assert_string(.new_col_name, null.ok = TRUE) if (identical(.new_col_name, "time_value")) { - cli_abort('`new_col_name` must not be `"time_value"`; `epix_slide()` uses that column name to attach the `ref_time_value` associated with each slide computation') # nolint: line_length_linter + cli_abort('`.new_col_name` must not be `"version"`; `epix_slide()` uses that column name to attach which of the `.versions` is associated with each slide computation') # nolint: line_length_linter } assert_logical(.all_versions, len = 1L) @@ -342,7 +342,7 @@ epix_slide.grouped_epi_archive <- function( } } else { # vector or packed data.frame-type column (note: new_col_name of - # "time_value" is disallowed): + # "version" is disallowed): res[[new_col_name]] <- comp_value } From 4290363c3b1a5443ac4da3b3504b094c99664f00 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Wed, 21 Aug 2024 16:27:30 -0700 Subject: [PATCH 06/11] Add group_vars.grouped_epi_archive --- NAMESPACE | 1 + R/archive.R | 9 +++++++-- R/grouped_epi_archive.R | 11 +++++++++-- man/group_by.epi_archive.Rd | 12 ++++++++++-- tests/testthat/test-methods-epi_archive.R | 6 +++++- 5 files changed, 32 insertions(+), 7 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index fa4f76df..a417837f 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -27,6 +27,7 @@ S3method(group_by,epi_df) S3method(group_by,grouped_epi_archive) S3method(group_by_drop_default,grouped_epi_archive) S3method(group_modify,epi_df) +S3method(group_vars,grouped_epi_archive) S3method(groups,grouped_epi_archive) S3method(guess_period,Date) S3method(guess_period,POSIXt) diff --git a/R/archive.R b/R/archive.R index f7b11aff..48dbf9ec 100644 --- a/R/archive.R +++ b/R/archive.R @@ -585,8 +585,8 @@ print.epi_archive <- function(x, ..., class = TRUE, methods = TRUE) { #' `...`. #' @param .drop As described in [`dplyr::group_by`]; determines treatment of #' factor columns. -#' @param x For `groups` or `ungroup`: a `grouped_epi_archive`; for -#' `is_grouped_epi_archive`: any object +#' @param x For `groups`, `group_vars`, or `ungroup`: a `grouped_epi_archive`; +#' for `is_grouped_epi_archive`: any object #' @param .tbl (For `group_by_drop_default`:) an `epi_archive` or #' `grouped_epi_archive` (`epi_archive` dispatches to the S3 default method; #' `grouped_epi_archive` dispatches its own S3 method) @@ -665,6 +665,11 @@ print.epi_archive <- function(x, ..., class = TRUE, methods = TRUE) { #' group_by(geo_value, age_group) %>% #' ungroup(age_group) #' +#' # To get the grouping variable names as a character vector: +#' toy_archive %>% +#' group_by(geo_value) %>% +#' group_vars() +#' #' # To get the grouping variable names as a `list` of `name`s (a.k.a. symbols): #' toy_archive %>% #' group_by(geo_value) %>% diff --git a/R/grouped_epi_archive.R b/R/grouped_epi_archive.R index 2c7bbea9..c63ae98e 100644 --- a/R/grouped_epi_archive.R +++ b/R/grouped_epi_archive.R @@ -55,7 +55,7 @@ new_grouped_epi_archive <- function(x, vars, drop) { or `ungroup` first.", class = "epiprocess__grouped_epi_archive__ungrouped_arg_is_already_grouped", epiprocess__ungrouped_class = class(x), - epiprocess__ungrouped_groups = groups(x) + epiprocess__ungrouped_group_vars = group_vars(x) ) } assert_class(x, "epi_archive") @@ -160,6 +160,14 @@ group_by_drop_default.grouped_epi_archive <- function(.tbl) { .tbl$private$drop } +#' @include methods-epi_archive.R +#' @rdname group_by.epi_archive +#' +#' @importFrom dplyr group_vars +#' @export +group_vars.grouped_epi_archive <- function(x) { + x$private$vars +} #' @include methods-epi_archive.R #' @rdname group_by.epi_archive @@ -170,7 +178,6 @@ groups.grouped_epi_archive <- function(x) { rlang::syms(x$private$vars) } - #' @include methods-epi_archive.R #' @rdname group_by.epi_archive #' diff --git a/man/group_by.epi_archive.Rd b/man/group_by.epi_archive.Rd index e7c46311..aa6c2e2a 100644 --- a/man/group_by.epi_archive.Rd +++ b/man/group_by.epi_archive.Rd @@ -5,6 +5,7 @@ \alias{grouped_epi_archive} \alias{group_by.grouped_epi_archive} \alias{group_by_drop_default.grouped_epi_archive} +\alias{group_vars.grouped_epi_archive} \alias{groups.grouped_epi_archive} \alias{ungroup.grouped_epi_archive} \alias{is_grouped_epi_archive} @@ -16,6 +17,8 @@ \method{group_by_drop_default}{grouped_epi_archive}(.tbl) +\method{group_vars}{grouped_epi_archive}(x) + \method{groups}{grouped_epi_archive}(x) \method{ungroup}{grouped_epi_archive}(x, ...) @@ -52,8 +55,8 @@ factor columns.} \item{.tbl}{A \code{grouped_epi_archive} object.} -\item{x}{For \code{groups} or \code{ungroup}: a \code{grouped_epi_archive}; for -\code{is_grouped_epi_archive}: any object} +\item{x}{For \code{groups}, \code{group_vars}, or \code{ungroup}: a \code{grouped_epi_archive}; +for \code{is_grouped_epi_archive}: any object} } \description{ \code{group_by} and related methods for \code{epi_archive}, \code{grouped_epi_archive} @@ -131,6 +134,11 @@ toy_archive \%>\% group_by(geo_value, age_group) \%>\% ungroup(age_group) +# To get the grouping variable names as a character vector: +toy_archive \%>\% + group_by(geo_value) \%>\% + group_vars() + # To get the grouping variable names as a `list` of `name`s (a.k.a. symbols): toy_archive \%>\% group_by(geo_value) \%>\% diff --git a/tests/testthat/test-methods-epi_archive.R b/tests/testthat/test-methods-epi_archive.R index f035c8c5..803a11bd 100644 --- a/tests/testthat/test-methods-epi_archive.R +++ b/tests/testthat/test-methods-epi_archive.R @@ -110,7 +110,6 @@ test_that("epix_truncate_version_after returns the same grouping type as input e expect_true(is_grouped_epi_archive(ea_as_of)) }) - test_that("epix_truncate_version_after returns the same groups as input grouped_epi_archive", { ea2 <- ea2_data %>% as_epi_archive() @@ -122,3 +121,8 @@ test_that("epix_truncate_version_after returns the same groups as input grouped_ epix_truncate_versions_after(max_version = as.Date("2020-06-04")) expect_equal(ea_as_of %>% groups(), ea_expected %>% groups()) }) + +test_that("group_vars works as expected", { + expect_equal(ea2_data %>% as_epi_archive() %>% group_by(geo_value) %>% group_vars(), + "geo_value") +}) From 4b112e57221ef75f6cc2d24a208512ac38aa72b6 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Wed, 21 Aug 2024 23:51:59 -0700 Subject: [PATCH 07/11] refactor+tweak: add+use `format_class_vec` helper for messages Collapse with empty string in order to not have extra whitespace if used with `cat` rather than `cli_*`. --- R/archive.R | 2 +- R/utils.R | 10 +++++++++- man/format_class_vec.Rd | 17 +++++++++++++++++ 3 files changed, 27 insertions(+), 2 deletions(-) create mode 100644 man/format_class_vec.Rd diff --git a/R/archive.R b/R/archive.R index 48dbf9ec..5cf55ff6 100644 --- a/R/archive.R +++ b/R/archive.R @@ -49,7 +49,7 @@ validate_version_bound <- function(version_bound, x, na_ok = FALSE, if (!identical(class(version_bound), class(x[["version"]]))) { cli_abort( "{version_bound_arg} must have the same `class` vector as x$version, - which has a `class` of {paste(collapse = ' ', deparse(class(x$version)))}", + which has a `class` of {format_class_vec(class(x$version))}", class = "epiprocess__version_bound_mismatched_class" ) } diff --git a/R/utils.R b/R/utils.R index c585abec..a6a4382b 100644 --- a/R/utils.R +++ b/R/utils.R @@ -89,6 +89,14 @@ paste_lines <- function(lines) { paste(paste0(lines, "\n"), collapse = "") } +#' Format a class vector as a string via deparsing it +#' +#' @param class_vec `chr`; output of `class(object)` for some `object` +#' @return string +format_class_vec <- function(class_vec) { + paste(collapse = "", deparse(class_vec)) +} + #' Assert that a sliding computation function takes enough args #' #' @param f Function; specifies a computation to slide over an `epi_df` or @@ -451,7 +459,7 @@ as_slide_computation <- function(f, ..., .ref_time_value_long_varnames, .ref_tim } cli_abort( - "Can't convert an object of class {paste(collapse = ' ', deparse(class(f)))} + "Can't convert an object of class {format_class_vec(class(f))} to a slide computation", class = "epiprocess__as_slide_computation__cant_convert_catchall", epiprocess__f = f, diff --git a/man/format_class_vec.Rd b/man/format_class_vec.Rd new file mode 100644 index 00000000..b2b96678 --- /dev/null +++ b/man/format_class_vec.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils.R +\name{format_class_vec} +\alias{format_class_vec} +\title{Format a class vector as a string via deparsing it} +\usage{ +format_class_vec(class_vec) +} +\arguments{ +\item{class_vec}{\code{chr}; output of \code{class(object)} for some \code{object}} +} +\value{ +string +} +\description{ +Format a class vector as a string via deparsing it +} From 8838c712e64fa297f8e72beac05b66d83399667e Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Thu, 22 Aug 2024 00:29:41 -0700 Subject: [PATCH 08/11] WIP Add docs for de-dupe approach, part of the required validation Forbidding `new_col_name` being among the labeling columns addresses some dedupe cases where deduping would always lead to failure except for completely-redundant computations (that only output computation labels rather than and actual computation). - This might not be complete in a edge case where `"slide_value"` is a grouping variable. (E.g., from using a slide to assign a categorical trend, then doing a grouped slide based on the trend.) This is definitely only part of the dedupe handling. Unpacked-column outputs need to actually be de-duped. Also, fix incorrect documentation for time_value filter for .all_versions = TRUE while rebasing on other slide updates. --- R/methods-epi_archive.R | 19 +++++++++++-------- R/slide.R | 19 ++++++++++++++++++- R/utils.R | 22 ++++++++++++++++++++++ man/epi_slide.Rd | 4 +++- man/epix_slide.Rd | 19 +++++++++++-------- man/format_chr_with_quotes.Rd | 19 +++++++++++++++++++ 6 files changed, 84 insertions(+), 18 deletions(-) create mode 100644 man/format_chr_with_quotes.Rd diff --git a/R/methods-epi_archive.R b/R/methods-epi_archive.R index 2fc9d58f..1471ea8a 100644 --- a/R/methods-epi_archive.R +++ b/R/methods-epi_archive.R @@ -650,15 +650,18 @@ epix_detailed_restricted_mutate <- function(.data, ...) { #' set to a regularly-spaced sequence of values set to cover the range of #' `version`s in the `DT` plus the `versions_end`; the spacing of values will #' be guessed (using the GCD of the skips between values). -#' @param .new_col_name String indicating the name of the new column that will -#' contain the derivative values. The default is "slide_value" unless your -#' slide computations output data frames, in which case they will be unpacked -#' into the constituent columns and those names used. Note that setting -#' `.new_col_name` equal to an existing column name will overwrite this column. +#' @param .new_col_name Either `NULL` or a string indicating the name of the new +#' column that will contain the derived values. The default, `NULL`, will use +#' the name "slide_value" unless your slide computations output data frames, +#' in which case they will be unpacked into the constituent columns and those +#' names used. If the resulting column name(s) overlap with the column names +#' used for labeling the computations, which are `group_vars(x)` and +#' `"version"`, then the values for these columns must be identical to the +#' labels we assign. #' @param .all_versions (Not the same as `.all_rows` parameter of `epi_slide`.) If -#' TRUE, then `.f` will be passed the version history (all -#' `version <= .ref_time_value`) for rows having `time_value` between -#' `.ref_time_value - before` and `.ref_time_value`. Otherwise, `.f` will be +#' `.all_versions = TRUE`, then `.f` will be passed the version history (all +#' `version <= .ref_time_value`) for rows having `time_value` of at least +#' `.version - before`. Otherwise, `.f` will be #' passed only the most recent `version` for every unique `time_value`. #' Default is `FALSE`. #' @return A tibble whose columns are: the grouping variables, `time_value`, diff --git a/R/slide.R b/R/slide.R index a9f3a86c..49827600 100644 --- a/R/slide.R +++ b/R/slide.R @@ -27,7 +27,9 @@ #' and can also refer to `.x`, `.group_key`, and `.ref_time_value`. See #' details. #' @param .new_col_name String indicating the name of the new column that will -#' contain the derivative values. Default is "slide_value"; note that setting +#' contain the derivative values. The default is "slide_value" unless your +#' slide computations output data frames, in which case they will be unpacked +#' into the constituent columns and those names used. Note that setting #' `new_col_name` equal to an existing column name will overwrite this column. #' #' @template basic-slide-details @@ -169,6 +171,21 @@ epi_slide <- function( } } + checkmate::assert_string(new_col_name, null.ok = TRUE) + if (!is.null(new_col_name)) { + if (new_col_name %in% group_vars(x)) { + cli_abort(c("`new_col_name` must not be one of the grouping column name(s); + `epi_slide()` uses these column name(s) to label what group + each slide computation came from.", + "i" = "{cli::qty(length(group_vars(x)))} grouping column name{?s} + {?was/were} {format_chr_with_quotes(group_vars(x))}", + "x" = "`new_col_name` was {format_chr_with_quotes(new_col_name)}")) + } + if (identical(new_col_name, "time_value")) { + cli_abort('`new_col_name` must not be `"time_value"`; `epi_slide()` uses that column name to attach the `ref_time_value` associated with each slide computation') # nolint: line_length_linter + } + } + # Arrange by increasing time_value x <- arrange(.x, .data$time_value) diff --git a/R/utils.R b/R/utils.R index a6a4382b..e3fd28bf 100644 --- a/R/utils.R +++ b/R/utils.R @@ -97,6 +97,28 @@ format_class_vec <- function(class_vec) { paste(collapse = "", deparse(class_vec)) } +#' Format a character vector as a string via deparsing/quoting each +#' +#' @param x `chr`; e.g., `colnames` of some data frame +#' @param empty string; what should be output if `x` is of length 0? +#' @return string +format_chr_with_quotes <- function(x, empty = "*none*") { + if (length(x) == 0L) { + empty + } else { + # Deparse to get quoted + escape-sequenced versions of varnames; collapse to + # single line (assuming no newlines in `x`). Though if we hand this to cli + # it may insert them (even in middle of quotes) while wrapping lines. + deparsed_collapsed <- paste(collapse = "", deparse(x)) + if (length(x) == 1L) { + deparsed_collapsed + } else { + # remove surrounding `c()`: + substr(deparsed_collapsed, 3L, nchar(deparsed_collapsed) - 1L) + } + } +} + #' Assert that a sliding computation function takes enough args #' #' @param f Function; specifies a computation to slide over an `epi_df` or diff --git a/man/epi_slide.Rd b/man/epi_slide.Rd index fc675071..950c19e3 100644 --- a/man/epi_slide.Rd +++ b/man/epi_slide.Rd @@ -70,7 +70,9 @@ window. If missing, then this will be set to all unique time values in the underlying data table, by default.} \item{.new_col_name}{String indicating the name of the new column that will -contain the derivative values. Default is "slide_value"; note that setting +contain the derivative values. The default is "slide_value" unless your +slide computations output data frames, in which case they will be unpacked +into the constituent columns and those names used. Note that setting \code{new_col_name} equal to an existing column name will overwrite this column.} \item{.all_rows}{If \code{.all_rows = TRUE}, then all rows of \code{.x} will be kept in diff --git a/man/epix_slide.Rd b/man/epix_slide.Rd index 75a99994..e71c775d 100644 --- a/man/epix_slide.Rd +++ b/man/epix_slide.Rd @@ -74,16 +74,19 @@ in the archive is "day", and the \code{.ref_time_value} is January 8, then the smallest time_value in the snapshot will be January 1. If missing, then the default is no limit on the time values, so the full snapshot is given.} -\item{.new_col_name}{String indicating the name of the new column that will -contain the derivative values. The default is "slide_value" unless your -slide computations output data frames, in which case they will be unpacked -into the constituent columns and those names used. Note that setting -\code{.new_col_name} equal to an existing column name will overwrite this column.} +\item{.new_col_name}{Either \code{NULL} or a string indicating the name of the new +column that will contain the derived values. The default, \code{NULL}, will use +the name "slide_value" unless your slide computations output data frames, +in which case they will be unpacked into the constituent columns and those +names used. If the resulting column name(s) overlap with the column names +used for labeling the computations, which are \code{group_vars(x)} and +\code{"version"}, then the values for these columns must be identical to the +labels we assign.} \item{.all_versions}{(Not the same as \code{.all_rows} parameter of \code{epi_slide}.) If -TRUE, then \code{.f} will be passed the version history (all -\code{version <= .ref_time_value}) for rows having \code{time_value} between -\code{.ref_time_value - before} and \code{.ref_time_value}. Otherwise, \code{.f} will be +\code{.all_versions = TRUE}, then \code{.f} will be passed the version history (all +\code{version <= .ref_time_value}) for rows having \code{time_value} of at least +\code{.version - before}. Otherwise, \code{.f} will be passed only the most recent \code{version} for every unique \code{time_value}. Default is \code{FALSE}.} diff --git a/man/format_chr_with_quotes.Rd b/man/format_chr_with_quotes.Rd new file mode 100644 index 00000000..b62b172e --- /dev/null +++ b/man/format_chr_with_quotes.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils.R +\name{format_chr_with_quotes} +\alias{format_chr_with_quotes} +\title{Format a character vector as a string via deparsing/quoting each} +\usage{ +format_chr_with_quotes(x, empty = "*none*") +} +\arguments{ +\item{x}{\code{chr}; e.g., \code{colnames} of some data frame} + +\item{empty}{string; what should be output if \code{x} is of length 0?} +} +\value{ +string +} +\description{ +Format a character vector as a string via deparsing/quoting each +} From 91279528ac146b5819b0aeff430b7dfe70a89383 Mon Sep 17 00:00:00 2001 From: brookslogan Date: Thu, 22 Aug 2024 07:39:59 +0000 Subject: [PATCH 09/11] Style and fix&improve some .new_col_name validation --- R/grouped_epi_archive.R | 15 +++++++++++++-- R/slide.R | 11 ++++++----- tests/testthat/test-methods-epi_archive.R | 6 ++++-- 3 files changed, 23 insertions(+), 9 deletions(-) diff --git a/R/grouped_epi_archive.R b/R/grouped_epi_archive.R index c63ae98e..0506af9c 100644 --- a/R/grouped_epi_archive.R +++ b/R/grouped_epi_archive.R @@ -284,8 +284,19 @@ epix_slide.grouped_epi_archive <- function( validate_slide_window_arg(.before, .x$private$ungrouped$time_type) checkmate::assert_string(.new_col_name, null.ok = TRUE) - if (identical(.new_col_name, "time_value")) { - cli_abort('`.new_col_name` must not be `"version"`; `epix_slide()` uses that column name to attach which of the `.versions` is associated with each slide computation') # nolint: line_length_linter + if (!is.null(.new_col_name)) { + if (.new_col_name %in% x$private$vars) { + cli_abort(c("`new_col_name` must not be one of the grouping column name(s); + `epix_slide()` uses these column name(s) to label what group + each slide computation came from.", + "i" = "{cli::qty(length(x$private$vars))} grouping column name{?s} + {?was/were} {format_chr_with_quotes(x$private$vars)}", + "x" = "`new_col_name` was {format_chr_with_quotes(new_col_name)}" + )) + } + if (identical(.new_col_name, "version")) { + cli_abort('`.new_col_name` must not be `"version"`; `epix_slide()` uses that column name to attach the element of `.versions` associated with each slide computation') # nolint: line_length_linter + } } assert_logical(.all_versions, len = 1L) diff --git a/R/slide.R b/R/slide.R index 49827600..a1ee8c54 100644 --- a/R/slide.R +++ b/R/slide.R @@ -174,15 +174,16 @@ epi_slide <- function( checkmate::assert_string(new_col_name, null.ok = TRUE) if (!is.null(new_col_name)) { if (new_col_name %in% group_vars(x)) { - cli_abort(c("`new_col_name` must not be one of the grouping column name(s); + cli_abort(c("`.new_col_name` must not be one of the grouping column name(s); `epi_slide()` uses these column name(s) to label what group each slide computation came from.", - "i" = "{cli::qty(length(group_vars(x)))} grouping column name{?s} - {?was/were} {format_chr_with_quotes(group_vars(x))}", - "x" = "`new_col_name` was {format_chr_with_quotes(new_col_name)}")) + "i" = "{cli::qty(length(group_vars(.x)))} grouping column name{?s} + {?was/were} {format_chr_with_quotes(group_vars(.x))}", + "x" = "`.new_col_name` was {format_chr_with_quotes(.new_col_name)}" + )) } if (identical(new_col_name, "time_value")) { - cli_abort('`new_col_name` must not be `"time_value"`; `epi_slide()` uses that column name to attach the `ref_time_value` associated with each slide computation') # nolint: line_length_linter + cli_abort('`.new_col_name` must not be `"time_value"`; `epi_slide()` uses that column name to attach the element of `.ref_time_values` associated with each slide computation') # nolint: line_length_linter } } diff --git a/tests/testthat/test-methods-epi_archive.R b/tests/testthat/test-methods-epi_archive.R index 803a11bd..45ba6ea1 100644 --- a/tests/testthat/test-methods-epi_archive.R +++ b/tests/testthat/test-methods-epi_archive.R @@ -123,6 +123,8 @@ test_that("epix_truncate_version_after returns the same groups as input grouped_ }) test_that("group_vars works as expected", { - expect_equal(ea2_data %>% as_epi_archive() %>% group_by(geo_value) %>% group_vars(), - "geo_value") + expect_equal( + ea2_data %>% as_epi_archive() %>% group_by(geo_value) %>% group_vars(), + "geo_value" + ) }) From 7af57cc62e4925eebb89dea8ba0c4e67dc415c6d Mon Sep 17 00:00:00 2001 From: brookslogan Date: Mon, 26 Aug 2024 20:48:52 +0000 Subject: [PATCH 10/11] style: styler (GHA) --- R/grouped_epi_archive.R | 1 - 1 file changed, 1 deletion(-) diff --git a/R/grouped_epi_archive.R b/R/grouped_epi_archive.R index 0506af9c..32efb1aa 100644 --- a/R/grouped_epi_archive.R +++ b/R/grouped_epi_archive.R @@ -219,7 +219,6 @@ epix_slide.grouped_epi_archive <- function( .versions = NULL, .new_col_name = NULL, .all_versions = FALSE) { - # Perform some deprecated argument checks without using ` = # deprecated()` in the function signature, because they are from # early development versions and much more likely to be clutter than From 1181b97a49cb6684537ab27df5c75059343d9a79 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Thu, 29 Aug 2024 12:27:20 -0700 Subject: [PATCH 11/11] Fix slide rebase issues, other partial renames, dotprefix internalfn Since we're passing along ... from outer fns to our inner helper fns taking ..., the internal fns should also dot-prefix if outer should. --- R/grouped_epi_archive.R | 30 ++++++++--------- R/slide.R | 72 ++++++++++++++++++++--------------------- R/utils.R | 8 ++--- 3 files changed, 54 insertions(+), 56 deletions(-) diff --git a/R/grouped_epi_archive.R b/R/grouped_epi_archive.R index 32efb1aa..b37c31bc 100644 --- a/R/grouped_epi_archive.R +++ b/R/grouped_epi_archive.R @@ -224,7 +224,7 @@ epix_slide.grouped_epi_archive <- function( # early development versions and much more likely to be clutter than # informative in the signature. provided_args <- rlang::call_args_names(rlang::call_match()) - if (any(provided_args %in% c("x", "f", "before", "ref_time_values", "new_col_name", "all_versions", "group_by"))) { + if (any(provided_args %in% c("x", "f", "before", "ref_time_values", "new_col_name", "all_versions"))) { cli::cli_abort( "epix_slide: you are using one of the following old argument names: `x`, `f`, `before`, `ref_time_values`, `new_col_name`, `all_versions`. Please use the new names: `.x`, `.f`, `.before`, `.ref_time_values`, @@ -276,20 +276,18 @@ epix_slide.grouped_epi_archive <- function( # Sort, for consistency with `epi_slide`, although the current # implementation doesn't take advantage of it. .versions <- sort(.versions) - ref_time_values <- sort(ref_time_values) - .versions <- sort(.versions) } validate_slide_window_arg(.before, .x$private$ungrouped$time_type) checkmate::assert_string(.new_col_name, null.ok = TRUE) if (!is.null(.new_col_name)) { - if (.new_col_name %in% x$private$vars) { - cli_abort(c("`new_col_name` must not be one of the grouping column name(s); + if (.new_col_name %in% .x$private$vars) { + cli_abort(c("`.new_col_name` must not be one of the grouping column name(s); `epix_slide()` uses these column name(s) to label what group each slide computation came from.", - "i" = "{cli::qty(length(x$private$vars))} grouping column name{?s} - {?was/were} {format_chr_with_quotes(x$private$vars)}", + "i" = "{cli::qty(length(.x$private$vars))} grouping column name{?s} + {?was/were} {format_chr_with_quotes(.x$private$vars)}", "x" = "`new_col_name` was {format_chr_with_quotes(new_col_name)}" )) } @@ -321,10 +319,10 @@ epix_slide.grouped_epi_archive <- function( # Computation for one group, one time value comp_one_grp <- function(.data_group, .group_key, f, ..., - ref_time_value, + version, new_col_name) { # Carry out the specified computation - comp_value <- f(.data_group, .group_key, ref_time_value, ...) + comp_value <- f(.data_group, .group_key, version, ...) # If this wasn't a tidyeval computation, we still need to check the output # types. We'll let `group_modify` and `vec_rbind` deal with checking for @@ -347,7 +345,7 @@ epix_slide.grouped_epi_archive <- function( # redundant work. `group_modify()` provides the group key, we provide the # ref time value (appropriately recycled) and comp_value (appropriately # named / unpacked, for quick feedback) - res <- list(version = vctrs::vec_rep(ref_time_value, vctrs::vec_size(comp_value))) + res <- list(version = vctrs::vec_rep(version, vctrs::vec_size(comp_value))) if (is.null(new_col_name)) { if (inherits(comp_value, "data.frame")) { @@ -371,12 +369,12 @@ epix_slide.grouped_epi_archive <- function( return(validate_tibble(new_tibble(res))) } - out <- lapply(versions, function(ref_time_value) { + out <- lapply(.versions, function(version) { # Ungrouped as-of data; `epi_df` if `all_versions` is `FALSE`, # `epi_archive` if `all_versions` is `TRUE`: as_of_raw <- .x$private$ungrouped %>% epix_as_of( - ref_time_value, - min_time_value = ref_time_value - .before, + version, + min_time_value = version - .before, all_versions = .all_versions ) @@ -412,7 +410,7 @@ epix_slide.grouped_epi_archive <- function( # Convert each subgroup chunk to an archive before running the calculation. group_modify_fn <- function(.data_group, .group_key, f, ..., - ref_time_value, + version, new_col_name) { # .data_group is coming from as_of_df as a tibble, but we # want to feed `comp_one_grp` an `epi_archive` backed by a @@ -423,7 +421,7 @@ epix_slide.grouped_epi_archive <- function( .data_group_archive$DT <- .data_group comp_one_grp(.data_group_archive, .group_key, f = f, ..., - ref_time_value = ref_time_value, + version = version, new_col_name = new_col_name ) } @@ -434,7 +432,7 @@ epix_slide.grouped_epi_archive <- function( dplyr::group_by(as_of_df, !!!syms(.x$private$vars), .drop = .x$private$drop), group_modify_fn, f = f, ..., - ref_time_value = ref_time_value, + version = version, new_col_name = .new_col_name, .keep = TRUE ) diff --git a/R/slide.R b/R/slide.R index a1ee8c54..bc879d09 100644 --- a/R/slide.R +++ b/R/slide.R @@ -171,9 +171,9 @@ epi_slide <- function( } } - checkmate::assert_string(new_col_name, null.ok = TRUE) - if (!is.null(new_col_name)) { - if (new_col_name %in% group_vars(x)) { + checkmate::assert_string(.new_col_name, null.ok = TRUE) + if (!is.null(.new_col_name)) { + if (.new_col_name %in% group_vars(.x)) { cli_abort(c("`.new_col_name` must not be one of the grouping column name(s); `epi_slide()` uses these column name(s) to label what group each slide computation came from.", @@ -182,24 +182,24 @@ epi_slide <- function( "x" = "`.new_col_name` was {format_chr_with_quotes(.new_col_name)}" )) } - if (identical(new_col_name, "time_value")) { + if (identical(.new_col_name, "time_value")) { cli_abort('`.new_col_name` must not be `"time_value"`; `epi_slide()` uses that column name to attach the element of `.ref_time_values` associated with each slide computation') # nolint: line_length_linter } } # Arrange by increasing time_value - x <- arrange(.x, .data$time_value) + .x <- arrange(.x, .data$time_value) # Now set up starts and stops for sliding/hopping starts <- .ref_time_values - before stops <- .ref_time_values + after - # If `f` is missing, interpret ... as an expression for tidy evaluation + # If `.f` is missing, interpret ... as an expression for tidy evaluation if (missing(.f)) { used_data_masking <- TRUE quosures <- enquos(...) if (length(quosures) == 0) { - cli_abort("If `f` is missing then a computation must be specified via `...`.") + cli_abort("If `.f` is missing then a computation must be specified via `...`.") } .f <- quosures @@ -231,29 +231,29 @@ epi_slide <- function( slide_one_grp <- function(.data_group, .group_key, # see `?group_modify` ..., # `...` to `epi_slide` forwarded here - f_factory, - starts, - stops, - ref_time_values, - all_rows, - new_col_name) { + .f_factory, + .starts, + .stops, + .ref_time_values, + .all_rows, + .new_col_name) { # Figure out which reference time values appear in the data group in the # first place (we need to do this because it could differ based on the # group, hence the setup/checks for the reference time values based on all # the data could still be off): - o <- ref_time_values %in% .data_group$time_value - starts <- starts[o] - stops <- stops[o] - kept_ref_time_values <- ref_time_values[o] + o <- .ref_time_values %in% .data_group$time_value + .starts <- .starts[o] + .stops <- .stops[o] + kept_ref_time_values <- .ref_time_values[o] - f <- f_factory(kept_ref_time_values) + f <- .f_factory(kept_ref_time_values) # Compute the slide values slide_values_list <- slider::hop_index( .x = .data_group, .i = .data_group$time_value, - .starts = starts, - .stops = stops, + .starts = .starts, + .stops = .stops, .f = f, .group_key, ... ) @@ -309,7 +309,7 @@ epi_slide <- function( } # If all rows, then pad slide values with NAs, else filter down data group - if (all_rows) { + if (.all_rows) { orig_values <- slide_values slide_values <- vctrs::vec_rep(vctrs::vec_cast(NA, orig_values), nrow(.data_group)) vctrs::vec_slice(slide_values, o) <- orig_values @@ -318,7 +318,7 @@ epi_slide <- function( } result <- - if (is.null(new_col_name)) { + if (is.null(.new_col_name)) { if (inherits(slide_values, "data.frame")) { # unpack into separate columns (without name prefix) and, if there are # re-bindings, make the last one win for determining column value & @@ -330,25 +330,25 @@ epi_slide <- function( } } else { # vector or packed data.frame-type column: - mutate(.data_group, !!new_col_name := slide_values) + mutate(.data_group, !!.new_col_name := slide_values) } return(result) } - x <- group_modify(x, slide_one_grp, + .x <- group_modify(.x, slide_one_grp, ..., - f_factory = f_wrapper_factory, - starts = starts, - stops = stops, - ref_time_values = .ref_time_values, - all_rows = .all_rows, - new_col_name = .new_col_name, + .f_factory = f_wrapper_factory, + .starts = starts, + .stops = stops, + .ref_time_values = .ref_time_values, + .all_rows = .all_rows, + .new_col_name = .new_col_name, .keep = FALSE ) - return(x) + return(.x) } #' Optimized slide function for performing common rolling computations on an @@ -480,9 +480,9 @@ epi_slide_opt <- function( if (nrow(.x) == 0L) { cli_abort( c( - "input data `x` unexpectedly has 0 rows", + "input data `.x` unexpectedly has 0 rows", "i" = "If this computation is occuring within an `epix_slide` call, - check that `epix_slide` `.ref_time_values` argument was set appropriately" + check that `epix_slide` `.versions` argument was set appropriately" ), class = "epiprocess__epi_slide_opt__0_row_input", epiprocess__x = .x @@ -577,13 +577,13 @@ epi_slide_opt <- function( } } - # Make a complete date sequence between min(x$time_value) and max(x$time_value). + # Make a complete date sequence between min(.x$time_value) and max(.x$time_value). date_seq_list <- full_date_seq(.x, before, after, time_type) all_dates <- date_seq_list$all_dates pad_early_dates <- date_seq_list$pad_early_dates pad_late_dates <- date_seq_list$pad_late_dates - # The position of a given column can be differ between input `x` and + # The position of a given column can be differ between input `.x` and # `.data_group` since the grouping step by default drops grouping columns. # To avoid rerunning `eval_select` for every `.data_group`, convert # positions of user-provided `col_names` into string column names. We avoid @@ -621,7 +621,7 @@ epi_slide_opt <- function( group will result in incorrect results", "i" = "Please change the grouping structure of the input data so that each group has non-duplicate time values (e.g. `x %>% group_by(geo_value) - %>% epi_slide_opt(f = frollmean)`)", + %>% epi_slide_opt(.f = frollmean)`)", "i" = "Use `epi_slide` to aggregate across groups" ), class = "epiprocess__epi_slide_opt__duplicate_time_values", diff --git a/R/utils.R b/R/utils.R index e3fd28bf..79f2e96d 100644 --- a/R/utils.R +++ b/R/utils.R @@ -494,9 +494,9 @@ as_slide_computation <- function(f, ..., .ref_time_value_long_varnames, .ref_tim #' @rdname as_slide_computation #' @export #' @noRd -as_time_slide_computation <- function(f, ...) { +as_time_slide_computation <- function(.f, ...) { as_slide_computation( - f, ..., + .f, ..., .ref_time_value_long_varnames = ".ref_time_value", .ref_time_value_label = "reference time value" ) @@ -505,9 +505,9 @@ as_time_slide_computation <- function(f, ...) { #' @rdname as_slide_computation #' @export #' @noRd -as_diagonal_slide_computation <- function(f, ...) { +as_diagonal_slide_computation <- function(.f, ...) { as_slide_computation( - f, ..., + .f, ..., .ref_time_value_long_varnames = c(".version", ".ref_time_value"), .ref_time_value_label = "version" )