diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml index f4b17a4b..2fca5dbd 100644 --- a/.github/workflows/R-CMD-check.yaml +++ b/.github/workflows/R-CMD-check.yaml @@ -4,7 +4,7 @@ on: push: branches: [main, master] pull_request: - branches: [main, master] + branches: [main, master, dev] name: R-CMD-check diff --git a/.github/workflows/pkgdown.yaml b/.github/workflows/pkgdown.yaml index 087f0b05..847176d3 100644 --- a/.github/workflows/pkgdown.yaml +++ b/.github/workflows/pkgdown.yaml @@ -4,7 +4,7 @@ on: push: branches: [main, master] pull_request: - branches: [main, master] + branches: [main, master, dev] release: types: [published] workflow_dispatch: diff --git a/R/archive.R b/R/archive.R index d5be0ed4..086d31b3 100644 --- a/R/archive.R +++ b/R/archive.R @@ -640,7 +640,7 @@ epi_archive = slide = function(f, ..., before, ref_time_values, time_step, new_col_name = "slide_value", as_list_col = FALSE, names_sep = "_", - all_rows = FALSE, all_versions = FALSE) { + all_versions = FALSE) { # For an "ungrouped" slide, treat all rows as belonging to one big # group (group by 0 vars), like `dplyr::summarize`, and let the # resulting `grouped_epi_archive` handle the slide: @@ -649,7 +649,7 @@ epi_archive = before = before, ref_time_values = ref_time_values, time_step = time_step, new_col_name = new_col_name, as_list_col = as_list_col, names_sep = names_sep, - all_rows = all_rows, all_versions = all_versions + all_versions = all_versions ) %>% # We want a slide on ungrouped archives to output something # ungrouped, rather than retaining the trivial (0-variable) diff --git a/R/grouped_epi_archive.R b/R/grouped_epi_archive.R index 1c6bd311..ddd4527f 100644 --- a/R/grouped_epi_archive.R +++ b/R/grouped_epi_archive.R @@ -190,7 +190,11 @@ grouped_epi_archive = slide = function(f, ..., before, ref_time_values, time_step, new_col_name = "slide_value", as_list_col = FALSE, names_sep = "_", - all_rows = FALSE, all_versions = FALSE) { + all_versions = FALSE) { + # Perform some deprecated argument checks without using ` = + # deprecated()` in the function signature, because they are from + # early development versions and much more likely to be clutter than + # informative in the signature. if ("group_by" %in% nse_dots_names(...)) { Abort(" The `group_by` argument to `slide` has been removed; please use @@ -200,7 +204,15 @@ grouped_epi_archive = this check is a false positive, but you will still need to use a different column name here and rename the resulting column after the slide.) - ") + ", class = "epiprocess__epix_slide_group_by_parameter_deprecated") + } + if ("all_rows" %in% nse_dots_names(...)) { + Abort(" + The `all_rows` argument has been removed from `epix_slide` (but + is still supported in `epi_slide`). Since `epix_slide` now + allows any number of rows out of slide computations, it's + unclear how `all_rows=TRUE` should fill in missing results. + ", class = "epiprocess__epix_slide_all_rows_parameter_deprecated") } if (missing(ref_time_values)) { @@ -247,30 +259,14 @@ grouped_epi_archive = if (! (rlang::is_string(names_sep) || is.null(names_sep)) ) { Abort("`names_sep` must be a (single) string or NULL.") } - if (!rlang::is_bool(all_rows)) { - Abort("`all_rows` must be TRUE or FALSE.") - } if (!rlang::is_bool(all_versions)) { Abort("`all_versions` must be TRUE or FALSE.") } - # Each computation is expected to output a data frame with either - # one element/row total or one element/row per encountered - # nongrouping, nontime, nonversion key value. These nongrouping, - # nontime, nonversion key columns can be seen as the "effective" key - # of the computation; the computation might return an object that - # reports a different key or no key, but the "effective" key should - # still be a valid unique key for the data, and is something that we - # could use even with `.keep = FALSE`. - comp_effective_key_vars = - setdiff(key(private$ungrouped$DT), - c(private$vars, "time_value", "version")) - # Computation for one group, one time value comp_one_grp = function(.data_group, .group_key, f, ..., ref_time_value, - comp_effective_key_vars, new_col) { # Carry out the specified computation comp_value = f(.data_group, .group_key, ...) @@ -282,77 +278,12 @@ grouped_epi_archive = .data_group = .data_group$DT } - # Calculate the number of output elements/rows we expect the - # computation to output: one per distinct "effective computation - # key variable" value encountered in the input. - # - # Note: this mirrors how `epi_slide` does things if we're using - # unique keys, but can diverge if using nonunique keys. The - # `epi_slide` approach of counting occurrences of the - # `ref_time_value` in the `time_value` column, which helps lines - # up the computation results with corresponding rows of the - # input data, wouldn't quite apply here: we'd want to line up - # with rows (from the same group) with `version` matching the - # `ref_time_value`, but would still need to summarize these rows - # somehow and drop the `time_value` input column, but this - # summarization requires something like a to-be-unique output - # key to determine a sensible number of rows to output (and the - # contents of those rows). - count = - if (length(comp_effective_key_vars) != 0L) { - comp_effective_key_vals_in_comp_input = - if (data.table::is.data.table(.data_group)) { - .data_group[, comp_effective_key_vars, with=FALSE] - } else { - .data_group[, comp_effective_key_vars] - } - sum(!duplicated(comp_effective_key_vals_in_comp_input)) - } else { - # Same idea as above, but accounting for `duplicated` working - # differently (outputting `logical(0)`) on 0-column inputs - # rather than matching the number of rows. (Instead, we use - # the same count we would get if we were counting distinct - # values of a column defined as `rep(val, target_n_rows)`.) - if (nrow(.data_group) == 0L) { - 0L - } else { - 1L - } - } - - # If we get back an atomic vector - if (is.atomic(comp_value)) { - if (length(comp_value) == 1) { - comp_value = rep(comp_value, count) - } - # If not a singleton, should be the right length, else abort - else if (length(comp_value) != count) { - Abort('If the slide computation returns an atomic vector, then it must have either (a) a single element, or (b) one element per distinct combination of key variables, excluding the `time_value`, `version`, and grouping variables, that is present in the first argument to the computation.') - } - } - - # If we get back a data frame - else if (is.data.frame(comp_value)) { - if (nrow(comp_value) == 1) { - comp_value = rep(list(comp_value), count) - } - # If not a single row, should be the right length, else abort - else if (nrow(comp_value) != count) { - Abort("If the slide computation returns a data frame, then it must have a single row, or else one row per appearance of the reference time value in the local window.") - } - # Make into a list - else { - comp_value = split(comp_value, seq_len(nrow(comp_value))) - } - } - - # If neither an atomic vector data frame, then abort - else { + if (! (is.atomic(comp_value) || is.data.frame(comp_value))) { Abort("The slide computation must return an atomic vector or a data frame.") } - + # Label every result row with the `ref_time_value`: - return(tibble::tibble(time_value = rep(.env$ref_time_value, count), + return(tibble::tibble(time_value = .env$ref_time_value, !!new_col := .env$comp_value)) } @@ -391,7 +322,6 @@ grouped_epi_archive = group_modify_fn = function(.data_group, .group_key, f, ..., ref_time_value, - comp_effective_key_vars, new_col) { # .data_group is coming from as_of_df as a tibble, but we # want to feed `comp_one_grp` an `epi_archive` backed by a @@ -402,7 +332,6 @@ grouped_epi_archive = .data_group_archive$DT = .data_group comp_one_grp(.data_group_archive, .group_key, f = f, ..., ref_time_value = ref_time_value, - comp_effective_key_vars = comp_effective_key_vars, new_col = new_col ) } @@ -414,7 +343,6 @@ grouped_epi_archive = dplyr::group_modify(group_modify_fn, f = f, ..., ref_time_value = ref_time_value, - comp_effective_key_vars = comp_effective_key_vars, new_col = new_col, .keep = TRUE) ) @@ -501,13 +429,18 @@ grouped_epi_archive = if (!as_list_col) { x = tidyr::unnest(x, !!new_col, names_sep = names_sep) } - - # Join to get all rows, if we need to, then return - if (all_rows) { - cols = c(private$vars, "time_value") - y = unique(private$ungrouped$DT[, ..cols]) - x = dplyr::left_join(y, x, by = cols) + + if (is_epi_df(x)) { + # The analogue of `epi_df`'s `as_of` metadata for an archive is + # `$versions_end`, at least in the current absence of + # separate fields/columns denoting the "archive version" with a + # different resolution, or from the perspective of a different + # stage of a data pipeline. The `as_of` that is automatically + # derived won't always match; override: + + attr(x, "metadata")[["as_of"]] <- private$ungrouped$versions_end } + return(x) } ) diff --git a/R/methods-epi_archive.R b/R/methods-epi_archive.R index 348c5c01..420e5f1c 100644 --- a/R/methods-epi_archive.R +++ b/R/methods-epi_archive.R @@ -478,16 +478,17 @@ epix_detailed_restricted_mutate = function(.data, ...) { } else { # Have `dplyr` do the `dplyr_col_modify`, keeping the column-level-aliasing # and must-copy-on-write-if-refcount-more-than-1 model, obtaining a tibble, - # then `setDT`-ing it in place to be a `data.table`. The key should still be - # valid (assuming that the user did not explicitly alter `key(.data$DT)` or - # the columns by reference somehow within `...` tidyeval-style computations, - # or trigger refcount-1 alterations due to still having >1 refcounts on the - # columns), so in between, set the "sorted" attribute accordingly to prevent - # attempted sorting (including potential extra copies) or sortedness - # checking, then `setDT`. - out_DT = dplyr::dplyr_col_modify(in_tbl, col_modify_cols) # tibble - data.table::setattr(out_DT, "sorted", data.table::key(.data$DT)) - data.table::setDT(out_DT, key=key(.data$DT)) + # then convert it into a `data.table`. The key should still be valid + # (assuming that the user did not explicitly alter `key(.data$DT)` or the + # columns by reference somehow within `...` tidyeval-style computations, or + # trigger refcount-1 alterations due to still having >1 refcounts on the + # columns), set the "sorted" attribute accordingly to prevent attempted + # sorting (including potential extra copies) or sortedness checking, then + # `setDT` (rather than `as.data.table`, in order to prevent column copying + # to establish ownership according to `data.table`'s memory model). + out_DT = dplyr::dplyr_col_modify(in_tbl, col_modify_cols) %>% + data.table::setattr("sorted", data.table::key(.data$DT)) %>% + data.table::setDT(key=key(.data$DT)) out_archive = .data$clone() out_archive$DT <- out_DT request_names = names(col_modify_cols) @@ -512,13 +513,13 @@ epix_detailed_restricted_mutate = function(.data, ...) { #' #' @param .data An `epi_archive` or `grouped_epi_archive` #' @param ... Similar to [`dplyr::group_by`] (see "Details:" for edge cases); -#' * In `group_by`: unquoted variable name(s) or other ["data -#' masking"][dplyr::dplyr_data_masking] expression(s). It's possible to use -#' [`dplyr::mutate`]-like syntax here to calculate new columns on which to +#' * For `group_by`: unquoted variable name(s) or other +#' ["data masking"][dplyr::dplyr_data_masking] expression(s). It's possible to +#' use [`dplyr::mutate`]-like syntax here to calculate new columns on which to #' perform grouping, but note that, if you are regrouping an already-grouped #' `.data` object, the calculations will be carried out ignoring such grouping #' (same as [in dplyr][dplyr::group_by]). -#' * In `ungroup`: either +#' * For `ungroup`: either #' * empty, in order to remove the grouping and output an `epi_archive`; or #' * variable name(s) or other ["tidy-select"][dplyr::dplyr_tidy_select] #' expression(s), in order to remove the matching variables from the list of @@ -527,12 +528,13 @@ epix_detailed_restricted_mutate = function(.data, ...) { #' the variable selection from `...` only; if `TRUE`, the output will be #' grouped by the current grouping variables plus the variable selection from #' `...`. -#' @param .drop As in [`dplyr::group_by`]; determines treatment of factor -#' columns. -#' @param x a `grouped_epi_archive`, or, in `is_grouped_epi_archive`, any object -#' @param .tbl An `epi_archive` or `grouped_epi_archive` (`epi_archive` -#' dispatches to the S3 default method, and `grouped_epi_archive` dispatches -#' its own S3 method) +#' @param .drop As described in [`dplyr::group_by`]; determines treatment of +#' factor columns. +#' @param x For `groups` or `ungroup`: a `grouped_epi_archive`; for +#' `is_grouped_epi_archive`: any object +#' @param .tbl (For `group_by_drop_default`:) an `epi_archive` or +#' `grouped_epi_archive` (`epi_archive` dispatches to the S3 default method; +#' `grouped_epi_archive` dispatches its own S3 method) #' #' @details #' @@ -614,19 +616,10 @@ epix_detailed_restricted_mutate = function(.data, ...) { #' # To get the grouping variable names as a `list` of `name`s (a.k.a. symbols): #' toy_archive %>% group_by(geo_value) %>% groups() #' -#' # `.drop = FALSE` is supported in a sense; `f` is called on 0-row inputs for -#' # the missing groups identified by `dplyr`, but the row-recycling rules will -#' # exclude the corresponding outputs of `f` from the output of the slide: -#' all.equal( -#' toy_archive %>% -#' group_by(geo_value, age_group, .drop=FALSE) %>% -#' epix_slide(f = ~ sum(.x$value), before = 20) %>% -#' ungroup(), -#' toy_archive %>% -#' group_by(geo_value, age_group, .drop=TRUE) %>% -#' epix_slide(f = ~ sum(.x$value), before = 20) %>% -#' ungroup() -#' ) +#' toy_archive %>% +#' group_by(geo_value, age_group, .drop=FALSE) %>% +#' epix_slide(f = ~ sum(.x$value), before = 20) %>% +#' ungroup() #' #' @importFrom dplyr group_by #' @export @@ -635,6 +628,20 @@ epix_detailed_restricted_mutate = function(.data, ...) { group_by.epi_archive = function(.data, ..., .add=FALSE, .drop=dplyr::group_by_drop_default(.data)) { # `add` makes no difference; this is an ungrouped `epi_archive`. detailed_mutate = epix_detailed_restricted_mutate(.data, ...) + if (!rlang::is_bool(.drop)) { + Abort("`.drop` must be TRUE or FALSE") + } else if (!.drop) { + grouping_cols = as.list(detailed_mutate[["archive"]][["DT"]])[detailed_mutate[["request_names"]]] + grouping_col_is_factor = purrr::map_lgl(grouping_cols, is.factor) + # ^ Use `as.list` to try to avoid any possibility of a deep copy. + if (!any(grouping_col_is_factor)) { + Warn("`.drop=FALSE` but there are no factor grouping columns; did you mean to convert one of the columns to a factor beforehand?", + class = "epiprocess__group_by_epi_archive_drop_FALSE_no_factors") + } else if (any(diff(grouping_col_is_factor) == -1L)) { + Warn("`.drop=FALSE` but there are one or more non-factor grouping columns listed after a factor grouping column; this may produce groups with `NA`s for these columns; see https://github.com/tidyverse/dplyr/issues/5369#issuecomment-683762553; depending on how you want completion to work, you might instead want to convert all grouping columns to factors beforehand, specify the non-factor grouping columns first, or use `.drop=TRUE` and add a call to `tidyr::complete`.", + class = "epiprocess__group_by_epi_archive_drop_FALSE_nonfactor_after_factor") + } + } grouped_epi_archive$new(detailed_mutate[["archive"]], detailed_mutate[["request_names"]], drop = .drop) @@ -707,15 +714,12 @@ group_by.epi_archive = function(.data, ..., .add=FALSE, .drop=dplyr::group_by_dr #' @param names_sep String specifying the separator to use in `tidyr::unnest()` #' when `as_list_col = FALSE`. Default is "_". Using `NULL` drops the prefix #' from `new_col_name` entirely. -#' @param all_rows If `all_rows = TRUE`, then the output will have one row per -#' combination of grouping variables and unique time values in the underlying -#' data table. Otherwise, there will be one row in the output for each time -#' value in `x` that acts as a reference time value. Default is `FALSE`. -#' @param all_versions If `all_versions = TRUE`, then `f` will be passed the -#' version history (all `version <= ref_time_value`) for rows having -#' `time_value` between `ref_time_value - before` and `ref_time_value`. -#' Otherwise, `f` will be passed only the most recent `version` for every -#' unique `time_value`. Default is `FALSE`. +#' @param all_versions (Not the same as `all_rows` parameter of `epi_slide`.) If +#' `all_versions = TRUE`, then `f` will be passed the version history (all +#' `version <= ref_time_value`) for rows having `time_value` between +#' `ref_time_value - before` and `ref_time_value`. Otherwise, `f` will be +#' passed only the most recent `version` for every unique `time_value`. +#' Default is `FALSE`. #' @return A tibble whose columns are: the grouping variables, `time_value`, #' containing the reference time values for the slide computation, and a #' column named according to the `new_col_name` argument, containing the slide @@ -736,32 +740,34 @@ group_by.epi_archive = function(.data, ..., .add=FALSE, .drop=dplyr::group_by_dr #' ends); `epi_slide` windows extend from `before` time steps before a #' `ref_time_value` through `after` time steps after `ref_time_value`. #' 3. The input class and columns are similar but different: `epix_slide` -#' keeps all columns and the `epi_df`-ness of the first input to the -#' computation; `epi_slide` only provides the grouping variables in the second -#' input, and will convert the first input into a regular tibble if the -#' grouping variables include the essential `geo_value` column. +#' (with the default `all_versions=FALSE`) keeps all columns and the +#' `epi_df`-ness of the first argument to each computation; `epi_slide` only +#' provides the grouping variables in the second input, and will convert the +#' first input into a regular tibble if the grouping variables include the +#' essential `geo_value` column. (With `all_versions=TRUE`, `epix_slide` will +#' will provide an `epi_archive` rather than an `epi-df` to each +#' computation.) #' 4. The output class and columns are similar but different: `epix_slide()` -#' returns a tibble containing only the grouping variables, `time_value`, and -#' the new column(s) from the slide computation `f`, whereas `epi_slide()` -#' returns an `epi_df` with all original variables plus the new columns from -#' the slide computation. -#' 5. Unless grouping by `geo_value` and all `other_keys`, there will be -#' row-recyling behavior meant to resemble `epi_slide`'s results, based on the -#' distinct combinations of `geo_value`, `time_value`, and all `other_keys` -#' present in the version data with `time_value` matching one of the -#' `ref_time_values`. However, due to reporting latency or reporting dropping -#' in and out, this may not exactly match the behavior of "corresponding" -#' `epi_df`s. -#' 6. Similar to the row recyling, while `all_rows=TRUE` is designed to mimic -#' `epi_slide` by completing based on distinct combinations of `geo_value`, -#' `time_value`, and all `other_keys` present in the version data with -#' `time_value` matching one of the `ref_time_values`, this can have unexpected -#' behaviors due reporting latency or reporting dropping in and out. +#' returns an `epi_df` or tibble containing only the grouping variables, +#' `time_value`, and the new column(s) from the slide computations, whereas +#' `epi_slide()` returns an `epi_df` with all original variables plus the new +#' columns from the slide computations. +#' 5. There are no size stability checks or element/row recycling to maintain +#' size stability in `epix_slide`, unlike in `epi_slide`. (`epix_slide` is +#' roughly analogous to [`dplyr::reframe`] in `dplyr` 1.1.0 +#' or[`dplyr::summarize`] in `dplyr` 1.0.0, while `epi_slide` is roughly +#' analogous to `dplyr::mutate` followed by `dplyr::arrange`) This is detailed +#' in the "advanced" vignette. +#' 6. `all_rows` is not supported in `epix_slide`; since the slide +#' computations are allowed more flexibility in their outputs than in +#' `epi_slide`, we can't guess a good representation for missing computations +#' for excluded group-`ref_time_value` pairs. #' 7. The `ref_time_values` default for `epix_slide` is based on making an #' evenly-spaced sequence out of the `version`s in the `DT` plus the #' `versions_end`, rather than the `time_value`s. -#' Apart from this, the interfaces between `epix_slide()` and `epi_slide()` are -#' the same. +#' +#' Apart from the above distinctions, the interfaces between `epix_slide()` and +#' `epi_slide()` are the same. #' #' Furthermore, the current function can be considerably slower than #' `epi_slide()`, for two reasons: (1) it must repeatedly fetch @@ -840,7 +846,7 @@ group_by.epi_archive = function(.data, ..., .add=FALSE, .drop=dplyr::group_by_dr epix_slide = function(x, f, ..., before, ref_time_values, time_step, new_col_name = "slide_value", as_list_col = FALSE, names_sep = "_", - all_rows = FALSE, all_versions = FALSE) { + all_versions = FALSE) { if (!is_epi_archive(x, grouped_okay=TRUE)) { Abort("`x` must be of class `epi_archive` or `grouped_epi_archive`.") } @@ -850,7 +856,6 @@ epix_slide = function(x, f, ..., before, ref_time_values, new_col_name = new_col_name, as_list_col = as_list_col, names_sep = names_sep, - all_rows = all_rows, all_versions = all_versions )) } diff --git a/_pkgdown.yml b/_pkgdown.yml index d9e2ec79..bba3ea8d 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -56,7 +56,7 @@ reference: desc: Details on `epi_archive`, and basic functionality. - contents: - matches("archive") -- title: `epix_*()` functions +- title: "`epix_*()` functions" desc: Functions that act on an `epi_archive` and/or `grouped_epi_archive` object. - contents: - starts_with("epix") diff --git a/man/epi_archive.Rd b/man/epi_archive.Rd index 8ae412a6..a4a58645 100644 --- a/man/epi_archive.Rd +++ b/man/epi_archive.Rd @@ -322,7 +322,6 @@ details. new_col_name = "slide_value", as_list_col = FALSE, names_sep = "_", - all_rows = FALSE, all_versions = FALSE )}\if{html}{\out{}} } diff --git a/man/epix_slide.Rd b/man/epix_slide.Rd index 54d16e86..116dd657 100644 --- a/man/epix_slide.Rd +++ b/man/epix_slide.Rd @@ -14,7 +14,6 @@ epix_slide( new_col_name = "slide_value", as_list_col = FALSE, names_sep = "_", - all_rows = FALSE, all_versions = FALSE ) } @@ -84,16 +83,12 @@ by prepending \code{new_col_name} to the names of the list elements.} when \code{as_list_col = FALSE}. Default is "_". Using \code{NULL} drops the prefix from \code{new_col_name} entirely.} -\item{all_rows}{If \code{all_rows = TRUE}, then the output will have one row per -combination of grouping variables and unique time values in the underlying -data table. Otherwise, there will be one row in the output for each time -value in \code{x} that acts as a reference time value. Default is \code{FALSE}.} - -\item{all_versions}{If \code{all_versions = TRUE}, then \code{f} will be passed the -version history (all \code{version <= ref_time_value}) for rows having -\code{time_value} between \code{ref_time_value - before} and \code{ref_time_value}. -Otherwise, \code{f} will be passed only the most recent \code{version} for every -unique \code{time_value}. Default is \code{FALSE}.} +\item{all_versions}{(Not the same as \code{all_rows} parameter of \code{epi_slide}.) If +\code{all_versions = TRUE}, then \code{f} will be passed the version history (all +\code{version <= ref_time_value}) for rows having \code{time_value} between +\code{ref_time_value - before} and \code{ref_time_value}. Otherwise, \code{f} will be +passed only the most recent \code{version} for every unique \code{time_value}. +Default is \code{FALSE}.} } \value{ A tibble whose columns are: the grouping variables, \code{time_value}, @@ -126,34 +121,36 @@ time interval (e.g., day) are only published after that time interval ends); \code{epi_slide} windows extend from \code{before} time steps before a \code{ref_time_value} through \code{after} time steps after \code{ref_time_value}. \item The input class and columns are similar but different: \code{epix_slide} -keeps all columns and the \code{epi_df}-ness of the first input to the -computation; \code{epi_slide} only provides the grouping variables in the second -input, and will convert the first input into a regular tibble if the -grouping variables include the essential \code{geo_value} column. +(with the default \code{all_versions=FALSE}) keeps all columns and the +\code{epi_df}-ness of the first argument to each computation; \code{epi_slide} only +provides the grouping variables in the second input, and will convert the +first input into a regular tibble if the grouping variables include the +essential \code{geo_value} column. (With \code{all_versions=TRUE}, \code{epix_slide} will +will provide an \code{epi_archive} rather than an \code{epi-df} to each +computation.) \item The output class and columns are similar but different: \code{epix_slide()} -returns a tibble containing only the grouping variables, \code{time_value}, and -the new column(s) from the slide computation \code{f}, whereas \code{epi_slide()} -returns an \code{epi_df} with all original variables plus the new columns from -the slide computation. -\item Unless grouping by \code{geo_value} and all \code{other_keys}, there will be -row-recyling behavior meant to resemble \code{epi_slide}'s results, based on the -distinct combinations of \code{geo_value}, \code{time_value}, and all \code{other_keys} -present in the version data with \code{time_value} matching one of the -\code{ref_time_values}. However, due to reporting latency or reporting dropping -in and out, this may not exactly match the behavior of "corresponding" -\code{epi_df}s. -\item Similar to the row recyling, while \code{all_rows=TRUE} is designed to mimic -\code{epi_slide} by completing based on distinct combinations of \code{geo_value}, -\code{time_value}, and all \code{other_keys} present in the version data with -\code{time_value} matching one of the \code{ref_time_values}, this can have unexpected -behaviors due reporting latency or reporting dropping in and out. +returns an \code{epi_df} or tibble containing only the grouping variables, +\code{time_value}, and the new column(s) from the slide computations, whereas +\code{epi_slide()} returns an \code{epi_df} with all original variables plus the new +columns from the slide computations. +\item There are no size stability checks or element/row recycling to maintain +size stability in \code{epix_slide}, unlike in \code{epi_slide}. (\code{epix_slide} is +roughly analogous to \code{\link[dplyr:reframe]{dplyr::reframe}} in \code{dplyr} 1.1.0 +or\code{\link[dplyr:summarise]{dplyr::summarize}} in \code{dplyr} 1.0.0, while \code{epi_slide} is roughly +analogous to \code{dplyr::mutate} followed by \code{dplyr::arrange}) This is detailed +in the "advanced" vignette. +\item \code{all_rows} is not supported in \code{epix_slide}; since the slide +computations are allowed more flexibility in their outputs than in +\code{epi_slide}, we can't guess a good representation for missing computations +for excluded group-\code{ref_time_value} pairs. \item The \code{ref_time_values} default for \code{epix_slide} is based on making an evenly-spaced sequence out of the \code{version}s in the \code{DT} plus the \code{versions_end}, rather than the \code{time_value}s. -Apart from this, the interfaces between \code{epix_slide()} and \code{epi_slide()} are -the same. } +Apart from the above distinctions, the interfaces between \code{epix_slide()} and +\code{epi_slide()} are the same. + Furthermore, the current function can be considerably slower than \code{epi_slide()}, for two reasons: (1) it must repeatedly fetch properly-versioned snapshots from the data archive (via its \code{as_of()} diff --git a/man/group_by.epi_archive.Rd b/man/group_by.epi_archive.Rd index dac7ba0d..aee0a07b 100644 --- a/man/group_by.epi_archive.Rd +++ b/man/group_by.epi_archive.Rd @@ -27,12 +27,13 @@ is_grouped_epi_archive(x) \item{...}{Similar to \code{\link[dplyr:group_by]{dplyr::group_by}} (see "Details:" for edge cases); \itemize{ -\item In \code{group_by}: unquoted variable name(s) or other \link[dplyr:dplyr_data_masking]{"data masking"} expression(s). It's possible to use -\code{\link[dplyr:mutate]{dplyr::mutate}}-like syntax here to calculate new columns on which to +\item For \code{group_by}: unquoted variable name(s) or other +\link[dplyr:dplyr_data_masking]{"data masking"} expression(s). It's possible to +use \code{\link[dplyr:mutate]{dplyr::mutate}}-like syntax here to calculate new columns on which to perform grouping, but note that, if you are regrouping an already-grouped \code{.data} object, the calculations will be carried out ignoring such grouping (same as \link[dplyr:group_by]{in dplyr}). -\item In \code{ungroup}: either +\item For \code{ungroup}: either \itemize{ \item empty, in order to remove the grouping and output an \code{epi_archive}; or \item variable name(s) or other \link[dplyr:dplyr_tidy_select]{"tidy-select"} @@ -46,14 +47,15 @@ the variable selection from \code{...} only; if \code{TRUE}, the output will be grouped by the current grouping variables plus the variable selection from \code{...}.} -\item{.drop}{As in \code{\link[dplyr:group_by]{dplyr::group_by}}; determines treatment of factor -columns.} +\item{.drop}{As described in \code{\link[dplyr:group_by]{dplyr::group_by}}; determines treatment of +factor columns.} -\item{x}{a \code{grouped_epi_archive}, or, in \code{is_grouped_epi_archive}, any object} +\item{x}{For \code{groups} or \code{ungroup}: a \code{grouped_epi_archive}; for +\code{is_grouped_epi_archive}: any object} -\item{.tbl}{An \code{epi_archive} or \code{grouped_epi_archive} (\code{epi_archive} -dispatches to the S3 default method, and \code{grouped_epi_archive} dispatches -its own S3 method)} +\item{.tbl}{(For \code{group_by_drop_default}:) an \code{epi_archive} or +\code{grouped_epi_archive} (\code{epi_archive} dispatches to the S3 default method; +\code{grouped_epi_archive} dispatches its own S3 method)} } \description{ \code{group_by} and related methods for \code{epi_archive}, \code{grouped_epi_archive} @@ -137,18 +139,9 @@ toy_archive \%>\% group_by(geo_value, age_group) \%>\% ungroup(age_group) # To get the grouping variable names as a `list` of `name`s (a.k.a. symbols): toy_archive \%>\% group_by(geo_value) \%>\% groups() -# `.drop = FALSE` is supported in a sense; `f` is called on 0-row inputs for -# the missing groups identified by `dplyr`, but the row-recycling rules will -# exclude the corresponding outputs of `f` from the output of the slide: -all.equal( - toy_archive \%>\% - group_by(geo_value, age_group, .drop=FALSE) \%>\% - epix_slide(f = ~ sum(.x$value), before = 20) \%>\% - ungroup(), - toy_archive \%>\% - group_by(geo_value, age_group, .drop=TRUE) \%>\% - epix_slide(f = ~ sum(.x$value), before = 20) \%>\% - ungroup() -) +toy_archive \%>\% + group_by(geo_value, age_group, .drop=FALSE) \%>\% + epix_slide(f = ~ sum(.x$value), before = 20) \%>\% + ungroup() } diff --git a/tests/testthat/test-deprecations.R b/tests/testthat/test-deprecations.R new file mode 100644 index 00000000..334b4488 --- /dev/null +++ b/tests/testthat/test-deprecations.R @@ -0,0 +1,48 @@ + +test_that("epix_slide group_by= deprecation works",{ + expect_error( + archive_cases_dv_subset %>% + epix_slide(function(...) {}, before=2L, group_by=c()), + class = "epiprocess__epix_slide_group_by_parameter_deprecated" + ) + expect_error( + archive_cases_dv_subset$ + slide(function(...) {}, before=2L, group_by=c()), + class = "epiprocess__epix_slide_group_by_parameter_deprecated" + ) + expect_error( + archive_cases_dv_subset %>% + group_by(geo_value) %>% + epix_slide(function(...) {}, before=2L, group_by=c()), + class = "epiprocess__epix_slide_group_by_parameter_deprecated" + ) + expect_error( + archive_cases_dv_subset$ + group_by(geo_value)$ + slide(function(...) {}, before=2L, group_by=c()), + class = "epiprocess__epix_slide_group_by_parameter_deprecated" + ) + # + expect_error( + archive_cases_dv_subset %>% + epix_slide(function(...) {}, before=2L, all_rows=TRUE), + class = "epiprocess__epix_slide_all_rows_parameter_deprecated" + ) + expect_error( + archive_cases_dv_subset$ + slide(function(...) {}, before=2L, all_rows=TRUE), + class = "epiprocess__epix_slide_all_rows_parameter_deprecated" + ) + expect_error( + archive_cases_dv_subset %>% + group_by(geo_value) %>% + epix_slide(function(...) {}, before=2L, all_rows=TRUE), + class = "epiprocess__epix_slide_all_rows_parameter_deprecated" + ) + expect_error( + archive_cases_dv_subset$ + group_by(geo_value)$ + slide(function(...) {}, before=2L, all_rows=TRUE), + class = "epiprocess__epix_slide_all_rows_parameter_deprecated" + ) +}) diff --git a/tests/testthat/test-epix_slide.R b/tests/testthat/test-epix_slide.R index 9ef2f9af..c352f90a 100644 --- a/tests/testthat/test-epix_slide.R +++ b/tests/testthat/test-epix_slide.R @@ -27,7 +27,7 @@ test_that("epix_slide works as intended",{ 2^6+2^3, 2^10+2^9, 2^15+2^14)) %>% - as_epi_df(as_of = 4) %>% # Also a bug (issue #213) + as_epi_df(as_of = 7) %>% group_by(geo_value) expect_identical(xx1,xx2) # * @@ -348,3 +348,18 @@ test_that("epix_slide with all_versions option works as intended",{ expect_identical(xx1,xx3) # This and * Imply xx2 and xx3 are identical }) + +test_that("`epix_slide` uses `versions_end` as a resulting `epi_df`'s `as_of`", { + ea_updated_stale = ea$clone() + ea_updated_stale$versions_end <- ea_updated_stale$versions_end + 3 # (dbl) + # + expect_identical( + ea_updated_stale %>% + group_by(geo_value) %>% + epix_slide(~ slice_head(.x, n = 1L), before = 10L) %>% + ungroup() %>% + attr("metadata") %>% + .$as_of, + 10 + ) +}) diff --git a/tests/testthat/test-grouped_epi_archive.R b/tests/testthat/test-grouped_epi_archive.R new file mode 100644 index 00000000..64c482bf --- /dev/null +++ b/tests/testthat/test-grouped_epi_archive.R @@ -0,0 +1,80 @@ +test_that("Grouping, regrouping, and ungrouping archives works as intended", { + # From an example: + library(dplyr) + toy_archive = + tribble( + ~geo_value, ~age_group, ~time_value, ~version, ~value, + "us", "adult", "2000-01-01", "2000-01-02", 121, + "us", "pediatric", "2000-01-02", "2000-01-03", 5, # (addition) + "us", "adult", "2000-01-01", "2000-01-03", 125, # (revision) + "us", "adult", "2000-01-02", "2000-01-03", 130 # (addition) + ) %>% + mutate(age_group = ordered(age_group, c("pediatric", "adult")), + time_value = as.Date(time_value), + version = as.Date(version)) %>% + as_epi_archive(other_keys = "age_group") + + # Ensure that we're using testthat edition 3's idea of "identical", which is + # not as strict as `identical`: + testthat::local_edition(3) + + # Test equivalency claims in example: + by_both_keys = toy_archive %>% group_by(geo_value, age_group) + expect_identical( + by_both_keys, + toy_archive %>% group_by(geo_value) %>% group_by(age_group, .add=TRUE) + ) + grouping_cols = c("geo_value", "age_group") + expect_identical( + by_both_keys, + toy_archive %>% group_by(across(all_of(grouping_cols))) + ) + + expect_identical( + toy_archive %>% group_by(geo_value), + toy_archive %>% group_by(geo_value, age_group) %>% ungroup(age_group) + ) + + # Test `.drop` behavior: + expect_error(toy_archive %>% group_by(.drop = "bogus"), + regexp = "\\.drop.*TRUE or FALSE") + expect_warning(toy_archive %>% group_by(.drop=FALSE), + class="epiprocess__group_by_epi_archive_drop_FALSE_no_factors") + expect_warning(toy_archive %>% group_by(geo_value, .drop=FALSE), + class="epiprocess__group_by_epi_archive_drop_FALSE_no_factors") + expect_warning(grouped_factor_then_nonfactor <- + toy_archive %>% group_by(age_group, geo_value, .drop=FALSE), + class="epiprocess__group_by_epi_archive_drop_FALSE_nonfactor_after_factor") + expect_identical(grouped_factor_then_nonfactor %>% + epix_slide(before = 10, s = sum(value)) %>% + ungroup(), + tibble::tribble( + ~age_group, ~geo_value, ~time_value, ~s, + "pediatric", NA_character_, "2000-01-02", 0, + "adult", "us", "2000-01-02", 121, + "pediatric", "us", "2000-01-03", 5, + "adult", "us", "2000-01-03", 255) %>% + mutate(age_group = ordered(age_group, c("pediatric", "adult")), + time_value = as.Date(time_value)) %>% + as_epi_df(geo_type = "nation", # bug; want "custom" from NA; issue #242 + as_of = as.Date("2000-01-03"), + additional_metadata = list(other_keys = "age_group")) %>% + # put back in expected order; see issue #166: + select(age_group, geo_value, time_value, s)) + expect_identical(toy_archive %>% + group_by(geo_value, age_group, .drop=FALSE) %>% + epix_slide(before = 10, s = sum(value)) %>% + ungroup(), + tibble::tribble( + ~geo_value, ~age_group, ~time_value, ~s, + "us", "pediatric", "2000-01-02", 0, + "us", "adult", "2000-01-02", 121, + "us", "pediatric", "2000-01-03", 5, + "us", "adult", "2000-01-03", 255) %>% + mutate(age_group = ordered(age_group, c("pediatric", "adult")), + time_value = as.Date(time_value)) %>% + as_epi_df(as_of = as.Date("2000-01-03"), + additional_metadata = list(other_keys = "age_group")) %>% + # put back in expected order; see issue #166: + select(geo_value, age_group, time_value, s)) +}) diff --git a/vignettes/advanced.Rmd b/vignettes/advanced.Rmd index ee0751e2..9580bee7 100644 --- a/vignettes/advanced.Rmd +++ b/vignettes/advanced.Rmd @@ -7,21 +7,62 @@ vignette: > %\VignetteEncoding{UTF-8} --- -In this vignette, we discuss how to use the sliding functionality in the -`epiprocess` package with computations that have advanced output structures. - -In general, the functions `epi_slide()` and `epix_slide()` do what they can to -ensure the result of a slide operation is *size stable*, meaning, it will return -something whose length is the same as the number of appearances of reference -time values for the slide computation in the given data frame/table (this -defaults to all time values, but can be some given subset when `ref_time_values` -is specified). - -The output of a slide computation should either be an atomic value/vector, or a -data frame. This data frame can have multiple columns, multiple rows, or both. -Below we demonstrate some advanced use cases of sliding with these output -structures. We focus on `epi_slide()` for the most part, though the behavior we -demonstrate also carries over to `epix_slide()`. +In this vignette, we discuss how to use the sliding functionality in the +`epiprocess` package with less common grouping schemes or with computations that +have advanced output structures. +The output of a slide computation should either be an atomic value/vector, or a +data frame. This data frame can have multiple columns, multiple rows, or both. + +During basic usage (e.g., when all optional arguments are set to their defaults): + +* `epi_slide(edf, , .....)`: + + * keeps **all** columns of `edf`, adds computed column(s) + * outputs one row **for every row in `edf`** (recycling outputs from + computations appropriately if there are multiple time series bundled + together inside any group(s)) + * is roughly analogous to (the non-sliding) **`dplyr::mutate` followed by + `dplyr::arrange(time_value, .by_group = TRUE)`** + +* `epix_slide(ea, , .....)`: + + * keeps **grouping and `time_value`** columns of `ea`, adds computed + column(s) + * outputs one row **for element/row output from the computations** + * is roughly analogous to (the non-sliding) **`dplyr::reframe`** (or + `dplyr::summarize`, before the `dplyr` 1.1.0 + [update](https://www.tidyverse.org/blog/2023/02/dplyr-1-1-0-pick-reframe-arrange/#reframe)) + +These differences in basic behavior make some common slide operations require less boilerplate: +* predictors and targets calculated with `epi_slide` are automatically lined up + with each other and with the signals from which they were calculated; and +* computations for an `epix_slide` can output data frames with any number of + rows, containing models, forecasts, evaluations, etc., and will not be + recycled. + +When using more advanced features, more complex rules apply: + +* Generalization: `epi_slide(edf, ....., ref_time_values=my_ref_time_values)` + will output one row for every row in `edf` with `time_value` appearing inside + `ref_time_values`, and is analogous to a `dplyr::mutate`&`dplyr::arrange` + followed by `dplyr::filter` to those `ref_time_values`. We call this property + **size stability**, and describe how it is achieved in the following sections. + The default behavior described above is a special case of this general rule + based on a default value of `ref_time_values`. +* Exception/feature: `epi_slide(edf, ....., ref_time_values=my_ref_time_values, + all_rows=TRUE)` will not just output rows for the given `ref_time_values`, but + instead will output one row per row in `edf`. +* Exception/feature: `epix_slide(ea, ....., as_list_col=TRUE)` will output one + row per computation that outputs a data frame, even when these data frames + have arbitrary numbers of rows. +* Clarification: `ea %>% group_by(....., .drop=FALSE) %>% + epix_slide(, .....)` will call the computation on any missing + groups according to `dplyr`'s `.drop=FALSE` rules, resulting in additional + output rows. + +Below we demonstrate some advanced use cases of sliding with different output +structures. We focus on `epi_slide()` for the most part, though some of the +behavior we demonstrate also carries over to `epix_slide()`. ## Recycling outputs @@ -35,7 +76,7 @@ simple synthetic example. library(epiprocess) library(dplyr) -df <- tibble( +edf <- tibble( geo_value = rep(c("ca", "fl", "pa"), each = 3), time_value = rep(seq(as.Date("2020-06-01"), as.Date("2020-06-03"), by = "day"), length.out = length(geo_value)), @@ -44,29 +85,31 @@ df <- tibble( as_epi_df() # 2-day trailing average, per geo value -df %>% +edf %>% group_by(geo_value) %>% epi_slide(x_2dav = mean(x), before = 1) %>% ungroup() # 2-day trailing average, marginally -df %>% +edf %>% epi_slide(x_2dav = mean(x), before = 1) ``` ```{r, include = FALSE} # More checks (not included) -df %>% +edf %>% epi_slide(x_2dav = mean(x), before = 1, ref_time_values = as.Date("2020-06-02")) -df %>% +edf %>% + # pretend that observations about time_value t are reported in version t (nowcasts) mutate(version = time_value) %>% as_epi_archive() %>% group_by(geo_value) %>% epix_slide(x_2dav = mean(x), before = 1, ref_time_values = as.Date("2020-06-02")) %>% ungroup() -df %>% +edf %>% + # pretend that observations about time_value t are reported in version t (nowcasts) mutate(version = time_value) %>% as_epi_archive() %>% group_by(geo_value) %>% @@ -80,7 +123,7 @@ so, uses it to fill the new column. For example, this next computation gives the same result as the last one. ```{r} -df %>% +edf %>% epi_slide(y_2dav = rep(mean(x), 3), before = 1) ``` @@ -89,7 +132,7 @@ is *not* size stable, then `epi_slide()` throws an error. For example, below we are trying to return 2 things for 3 states. ```{r, error = TRUE} -df %>% +edf %>% epi_slide(x_2dav = rep(mean(x), 2), before = 1) ``` @@ -103,15 +146,15 @@ we set `as_list_col = TRUE` in the call to `epi_slide()`, the resulting `epi_df` object returned by `epi_slide()` has a list column containing the slide values. ```{r} -df2 <- df %>% +edf2 <- edf %>% group_by(geo_value) %>% epi_slide(a = data.frame(x_2dav = mean(x), x_2dma = mad(x)), before = 1, as_list_col = TRUE) %>% ungroup() -class(df2$a) -length(df2$a) -df2$a[[2]] +class(edf2$a) +length(edf2$a) +edf2$a[[2]] ``` When we use `as_list_col = FALSE` (the default in `epi_slide()`), the function @@ -122,7 +165,7 @@ list column (here `a`) onto the column names of the output data frame from the slide computation (here `x_2dav` and `x_2dma`) separated by "_". ```{r} -df %>% +edf %>% group_by(geo_value) %>% epi_slide(a = data.frame(x_2dav = mean(x), x_2dma = mad(x)), before = 1, as_list_col = FALSE) %>% @@ -133,7 +176,7 @@ We can use `names_sep = NULL` (which gets passed to `tidyr::unnest()`) to drop the prefix associated with list column name, in naming the unnested columns. ```{r} -df %>% +edf %>% group_by(geo_value) %>% epi_slide(a = data.frame(x_2dav = mean(x), x_2dma = mad(x)), before = 1, as_list_col = FALSE, names_sep = NULL) %>% @@ -144,19 +187,19 @@ Furthermore, `epi_slide()` will recycle the single row data frame as needed in order to make the result size stable, just like the case for atomic values. ```{r} -df %>% +edf %>% epi_slide(a = data.frame(x_2dav = mean(x), x_2dma = mad(x)), before = 1, as_list_col = FALSE, names_sep = NULL) ``` ```{r, include = FALSE} # More checks (not included) -df %>% +edf %>% epi_slide(a = data.frame(x_2dav = mean(x), x_2dma = mad(x)), ref_time_values = as.Date("2020-06-02"), before = 1, as_list_col = FALSE, names_sep = NULL) -df %>% +edf %>% mutate(version = time_value) %>% as_epi_archive() %>% group_by(geo_value) %>% @@ -174,32 +217,51 @@ Meaning, `epi_slide()` will check that the result is size stable, and if so, will fill the new column(s) in the resulting `epi_df` object appropriately. This can be convenient for modeling in the following sense: we can, for example, -fit a sliding forecasting model by pooling data from different locations, and -then return separate forecasts from this common model for each location. We use -our synthetic example to demonstrate this idea abstractly but simply. +fit a sliding, data-versioning-unaware nowcasting or forecasting model by +pooling data from different locations, and then return separate forecasts from +this common model for each location. We use our synthetic example to demonstrate +this idea abstractly but simply by forecasting (actually, nowcasting) `y` from +`x` by fitting a time-windowed linear model that pooling data across all +locations. ```{r} -df$y <- 2 * df$x + 0.05 * rnorm(length(df$x)) +edf$y <- 2 * edf$x + 0.05 * rnorm(length(edf$x)) -df %>% +edf %>% epi_slide(function(d, ...) { obj <- lm(y ~ x, data = d) return( as.data.frame( predict(obj, newdata = d %>% - group_by(geo_value) %>% - filter(time_value == max(time_value)), + group_by(geo_value) %>% + filter(time_value == max(time_value)), interval = "prediction", level = 0.9) )) }, before = 1, new_col_name = "fc", names_sep = NULL) ``` +The above example focused on simplicity to show how to work with multi-row +outputs. Note however, the following issues in this example: + +* The `lm` fitting data includes the testing instances, as no training-test split was performed. +* Adding a simple training-test split would not factor in reporting latency properly. +* Data revisions are not taken into account. + +All three of these factors contribute to unrealistic retrospective forecasts and +overly optimistic retrospective performance evaluations. Instead, one should +favor an `epix_slide` for more realistic "pseudoprospective" forecasts. Using +`epix_slide` also makes it easier to express certain types of forecasts; while +in `epi_slide`, forecasts for additional aheads or quantile levels would need to +be expressed as additional columns, or nested inside list columns, `epix_slide` +does not perform size stability checks or recycling, allowing computations to +output any number of rows. + ## Version-aware forecasting, revisited -Finally, we revisit the COVID-19 forecasting example from the [archive +We revisit the COVID-19 forecasting example from the [archive vignette](https://cmu-delphi.github.io/epiprocess/articles/slide.html) in order -to demonstrate the last point in a more realistic setting. First, we fetch the -versioned data and build the archive. +to demonstrate the preceding points regarding forecast evaluation in a more +realistic setting. First, we fetch the versioned data and build the archive. ```{r, message = FALSE, warning = FALSE, eval =FALSE} library(epidatr)