diff --git a/DESCRIPTION b/DESCRIPTION index 71d95969..93a97936 100755 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Type: Package Package: epiprocess Title: Tools for basic signal processing in epidemiology -Version: 0.7.5 +Version: 0.7.6 Authors@R: c( person("Jacob", "Bien", role = "ctb"), person("Logan", "Brooks", email = "lcbrooks@andrew.cmu.edu", role = c("aut", "cre")), @@ -73,6 +73,7 @@ Depends: URL: https://cmu-delphi.github.io/epiprocess/ Collate: 'archive.R' + 'archive_new.R' 'autoplot.R' 'correlation.R' 'data.R' @@ -80,9 +81,11 @@ Collate: 'epiprocess.R' 'group_by_epi_df_methods.R' 'methods-epi_archive.R' + 'grouped_archive_new.R' 'grouped_epi_archive.R' 'growth_rate.R' 'key_colnames.R' + 'methods-epi_archive_new.R' 'methods-epi_df.R' 'outliers.R' 'reexports.R' diff --git a/NAMESPACE b/NAMESPACE index 03e0e41d..3eeb6b0f 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -6,40 +6,58 @@ S3method(as_epi_df,data.frame) S3method(as_epi_df,epi_df) S3method(as_epi_df,tbl_df) S3method(as_epi_df,tbl_ts) +S3method(as_of,epi_archive2) S3method(as_tibble,epi_df) S3method(as_tsibble,epi_df) S3method(autoplot,epi_df) +S3method(clone,epi_archive2) +S3method(clone,grouped_epi_archive2) S3method(dplyr_col_modify,col_modify_recorder_df) S3method(dplyr_col_modify,epi_df) S3method(dplyr_reconstruct,epi_df) S3method(dplyr_row_slice,epi_df) S3method(epix_truncate_versions_after,epi_archive) +S3method(epix_truncate_versions_after,epi_archive2) S3method(epix_truncate_versions_after,grouped_epi_archive) +S3method(epix_truncate_versions_after,grouped_epi_archive2) S3method(group_by,epi_archive) +S3method(group_by,epi_archive2) S3method(group_by,epi_df) S3method(group_by,grouped_epi_archive) +S3method(group_by,grouped_epi_archive2) S3method(group_by_drop_default,grouped_epi_archive) +S3method(group_by_drop_default,grouped_epi_archive2) S3method(group_modify,epi_df) S3method(groups,grouped_epi_archive) +S3method(groups,grouped_epi_archive2) S3method(key_colnames,data.frame) S3method(key_colnames,default) S3method(key_colnames,epi_archive) S3method(key_colnames,epi_df) S3method(next_after,Date) S3method(next_after,integer) +S3method(print,epi_archive2) S3method(print,epi_df) +S3method(print,grouped_epi_archive2) S3method(select,epi_df) +S3method(slide,grouped_epi_archive2) S3method(summary,epi_df) +S3method(truncate_versions_after,grouped_epi_archive2) S3method(ungroup,epi_df) S3method(ungroup,grouped_epi_archive) +S3method(ungroup,grouped_epi_archive2) S3method(unnest,epi_df) export("%>%") export(archive_cases_dv_subset) +export(archive_cases_dv_subset_2) export(arrange) export(as_epi_archive) +export(as_epi_archive2) export(as_epi_df) +export(as_of) export(as_tsibble) export(autoplot) +export(clone) export(detect_outlr) export(detect_outlr_rm) export(detect_outlr_stl) @@ -47,24 +65,33 @@ export(epi_archive) export(epi_cor) export(epi_slide) export(epix_as_of) +export(epix_as_of2) export(epix_merge) +export(epix_merge2) export(epix_slide) +export(epix_slide2) export(epix_truncate_versions_after) +export(fill_through_version) export(filter) export(group_by) export(group_modify) export(growth_rate) export(is_epi_archive) +export(is_epi_archive2) export(is_epi_df) export(is_grouped_epi_archive) +export(is_grouped_epi_archive2) export(key_colnames) export(max_version_with_row_in) export(mutate) +export(new_epi_archive2) export(new_epi_df) export(next_after) export(relocate) export(rename) export(slice) +export(slide) +export(truncate_versions_after) export(ungroup) export(unnest) importFrom(R6,R6Class) diff --git a/NEWS.md b/NEWS.md index 5bf584e6..b2c26775 100644 --- a/NEWS.md +++ b/NEWS.md @@ -25,6 +25,7 @@ Pre-1.0.0 numbering scheme: 0.x will indicate releases, while 0.x.y will indicat ## Breaking changes - Switched `epi_df`'s `other_keys` default from `NULL` to `character(0)`; PR #390 +- Refactor `epi_archive` to use S3 instead of R6 for its object model. The calls to some functions will change, but the functionality will remain the same. It will also help us maintain the package better in the future. (#340) # epiprocess 0.7.0 diff --git a/R/archive.R b/R/archive.R index ff3bc20c..a530cc05 100644 --- a/R/archive.R +++ b/R/archive.R @@ -514,9 +514,6 @@ epi_archive <- fromLast = TRUE ) %>% tibble::as_tibble() %>% - # (`as_tibble` should de-alias the DT and its columns in any edge - # cases where they are aliased. We don't say we guarantee this - # though.) dplyr::select(-"version") %>% as_epi_df( geo_type = self$geo_type, diff --git a/R/archive_new.R b/R/archive_new.R new file mode 100644 index 00000000..0b4f3695 --- /dev/null +++ b/R/archive_new.R @@ -0,0 +1,1115 @@ +# We use special features of data.table's `[`. The data.table package has a +# compatibility feature that disables some/all of these features if it thinks we +# might expect `data.frame`-compatible behavior instead. We can signal that we +# want the special behavior via `.datatable.aware = TRUE` or by importing any +# `data.table` package member. Do both to prevent surprises if we decide to use +# `data.table::` everywhere and not importing things. +.datatable.aware <- TRUE + +#' Validate a version bound arg +#' +#' Expected to be used on `clobberable_versions_start`, `versions_end`, +#' and similar arguments. Some additional context-specific checks may be needed. +#' +#' @param version_bound the version bound to validate +#' @param x a data frame containing a version column with which to check +#' compatibility +#' @param na_ok Boolean; is `NA` an acceptable "bound"? (If so, `NA` will +#' have a special context-dependent meaning.) +#' @param version_bound_arg optional string; what to call the version bound in +#' error messages +#' +#' @section Side effects: raises an error if version bound appears invalid +#' +#' @noRd +validate_version_bound <- function(version_bound, x, na_ok = FALSE, + version_bound_arg = rlang::caller_arg(version_bound), + x_arg = rlang::caller_arg(version_bound)) { + if (is.null(version_bound)) { + cli_abort( + "{version_bound_arg} cannot be NULL" + ) + } + if (na_ok && is.na(version_bound)) { + return(invisible(NULL)) + } + if (!test_set_equal(class(version_bound), class(x[["version"]]))) { + cli_abort( + "{version_bound_arg} must have the same classes as x$version, + which is {class(x$version)}", + ) + } + if (!test_set_equal(typeof(version_bound), typeof(x[["version"]]))) { + cli_abort( + "{version_bound_arg} must have the same types as x$version, + which is {typeof(x$version)}", + ) + } + + return(invisible(NULL)) +} + +#' `max(x$version)`, with error if `x` has 0 rows +#' +#' Exported to make defaults more easily copyable. +#' +#' @param x `x` argument of [`as_epi_archive`] +#' +#' @return `max(x$version)` if it has any rows; raises error if it has 0 rows or +#' an `NA` version value +#' +#' @export +max_version_with_row_in <- function(x) { + if (nrow(x) == 0L) { + cli_abort( + "`nrow(x)==0L`, representing a data set history with no row up through the + latest observed version, but we don't have a sensible guess at what version + that is, or whether any of the empty versions might be clobbered in the + future; if we use `x` to form an `epi_archive`, then + `clobberable_versions_start` and `versions_end` must be manually specified.", + class = "epiprocess__max_version_cannot_be_used" + ) + } else { + version_col <- purrr::pluck(x, "version") # error not NULL if doesn't exist + if (anyNA(version_col)) { + cli_abort("version values cannot be NA", + class = "epiprocess__version_values_must_not_be_na" + ) + } else { + version_bound <- max(version_col) + } + } +} + +#' Get the next possible value greater than `x` of the same type +#' +#' @param x the starting "value"(s) +#' @return same class, typeof, and length as `x` +#' +#' @export +next_after <- function(x) UseMethod("next_after") + +#' @export +next_after.integer <- function(x) x + 1L + +#' @export +next_after.Date <- function(x) x + 1L + + + +#' epi archive +#' @title `epi_archive` object +#' +#' @description An `epi_archive` is an R6 class which contains a data table +#' along with several relevant pieces of metadata. The data table can be seen +#' as the full archive (version history) for some signal variables of +#' interest. +#' +#' @details An `epi_archive` is an R6 class which contains a data table `DT`, of +#' class `data.table` from the `data.table` package, with (at least) the +#' following columns: +#' +#' * `geo_value`: the geographic value associated with each row of measurements. +#' * `time_value`: the time value associated with each row of measurements. +#' * `version`: the time value specifying the version for each row of +#' measurements. For example, if in a given row the `version` is January 15, +#' 2022 and `time_value` is January 14, 2022, then this row contains the +#' measurements of the data for January 14, 2022 that were available one day +#' later. +#' +#' The data table `DT` has key variables `geo_value`, `time_value`, `version`, +#' as well as any others (these can be specified when instantiating the +#' `epi_archive` object via the `other_keys` argument, and/or set by operating +#' on `DT` directly). Refer to the documentation for [as_epi_archive()] for +#' information and examples of relevant parameter names for an `epi_archive` object. +#' Note that there can only be a single row per unique combination of +#' key variables, and thus the key variables are critical for figuring out how +#' to generate a snapshot of data from the archive, as of a given version. +#' +#' In general, the last version of each observation is carried forward (LOCF) to +#' fill in data between recorded versions, and between the last recorded +#' update and the `versions_end`. One consequence is that the `DT` +#' doesn't have to contain a full snapshot of every version (although this +#' generally works), but can instead contain only the rows that are new or +#' changed from the previous version (see `compactify`, which does this +#' automatically). Currently, deletions must be represented as revising a row +#' to a special state (e.g., making the entries `NA` or including a special +#' column that flags the data as removed and performing some kind of +#' post-processing), and the archive is unaware of what this state is. Note +#' that `NA`s *can* be introduced by `epi_archive` methods for other reasons, +#' e.g., in [`epix_fill_through_version`] and [`epix_merge`], if requested, to +#' represent potential update data that we do not yet have access to; or in +#' [`epix_merge`] to represent the "value" of an observation before the +#' version in which it was first released, or if no version of that +#' observation appears in the archive data at all. +#' +#' **A word of caution:** R6 objects, unlike most other objects in R, have +#' reference semantics. A primary consequence of this is that objects are not +#' copied when modified. You can read more about this in Hadley Wickham's +#' [Advanced R](https://adv-r.hadley.nz/r6.html#r6-semantics) book. In order +#' to construct a modified archive while keeping the original intact, first +#' make a clone using the `$clone` method, then overwrite the clone's `DT` +#' field with `data.table::copy(clone$DT)`, and finally perform the +#' modifications on the clone. +#' +#' @section Metadata: +#' The following pieces of metadata are included as fields in an `epi_archive` +#' object: +#' +#' * `geo_type`: the type for the geo values. +#' * `time_type`: the type for the time values. +#' * `additional_metadata`: list of additional metadata for the data archive. +#' +#' Unlike an `epi_df` object, metadata for an `epi_archive` object `x` can be +#' accessed (and altered) directly, as in `x$geo_type` or `x$time_type`, +#' etc. Like an `epi_df` object, the `geo_type` and `time_type` fields in the +#' metadata of an `epi_archive` object are not currently used by any +#' downstream functions in the `epiprocess` package, and serve only as useful +#' bits of information to convey about the data set at hand. +#' +#' @section Generating Snapshots: +#' An `epi_archive` object can be used to generate a snapshot of the data in +#' `epi_df` format, which represents the most up-to-date values of the signal +#' variables, as of the specified version. This is accomplished by calling the +#' `as_of()` method for an `epi_archive` object `x`. More details on this +#' method are documented in the wrapper function [`epix_as_of()`]. +#' +#' @section Sliding Computations: +#' We can run a sliding computation over an `epi_archive` object, much like +#' `epi_slide()` does for an `epi_df` object. This is accomplished by calling +#' the `slide()` method for an `epi_archive` object, which works similarly to +#' the way `epi_slide()` works for an `epi_df` object, but with one key +#' difference: it is version-aware. That is, for an `epi_archive` object, the +#' sliding computation at any given reference time point t is performed on +#' **data that would have been available as of t**. More details on `slide()` +#' are documented in the wrapper function [`epix_slide()`]. +#' +#' @export +#' @examples +#' tib <- tibble::tibble( +#' geo_value = rep(c("ca", "hi"), each = 5), +#' time_value = rep(seq(as.Date("2020-01-01"), +#' by = 1, length.out = 5 +#' ), times = 2), +#' version = rep(seq(as.Date("2020-01-02"), +#' by = 1, length.out = 5 +#' ), times = 2), +#' value = rnorm(10, mean = 2, sd = 1) +#' ) +#' +#' toy_epi_archive <- tib %>% new_epi_archive2( +#' geo_type = "state", +#' time_type = "day" +#' ) +#' toy_epi_archive +#' @name epi_archive +# TODO: Figure out where to actually put this documentation +NULL + +#' New epi archive +#' @description Creates a new `epi_archive` object. +#' @param x A data.frame, data.table, or tibble, with columns `geo_value`, +#' `time_value`, `version`, and then any additional number of columns. +#' @param geo_type Type for the geo values. If missing, then the function will +#' attempt to infer it from the geo values present; if this fails, then it +#' will be set to "custom". +#' @param time_type Type for the time values. If missing, then the function will +#' attempt to infer it from the time values present; if this fails, then it +#' will be set to "custom". +#' @param other_keys Character vector specifying the names of variables in `x` +#' that should be considered key variables (in the language of `data.table`) +#' apart from "geo_value", "time_value", and "version". +#' @param additional_metadata List of additional metadata to attach to the +#' `epi_archive` object. The metadata will have `geo_type` and `time_type` +#' fields; named entries from the passed list or will be included as well. +#' @param compactify Optional; Boolean or `NULL`: should we remove rows that are +#' considered redundant for the purposes of `epi_archive`'s built-in methods +#' such as `as_of`? As these methods use the last version of each observation +#' carried forward (LOCF) to interpolate between the version data provided, +#' rows that don't change these LOCF results can potentially be omitted to +#' save space while maintaining the same behavior (with the help of the +#' `clobberable_versions_start` and `versions_end` fields in some edge cases). +#' `TRUE` will remove these rows, `FALSE` will not, and missing or `NULL` will +#' remove these rows and issue a warning. Generally, this can be set to +#' `TRUE`, but if you directly inspect or edit the fields of the `epi_archive` +#' such as its `DT`, or rely on redundant updates to achieve a certain +#' behavior of the `ref_time_values` default in `epix_slide`, you will have to +#' determine whether `compactify=TRUE` will produce the desired results. If +#' compactification here is removing a large proportion of the rows, this may +#' indicate a potential for space, time, or bandwidth savings upstream the +#' data pipeline, e.g., by avoiding fetching, storing, or processing these +#' rows of `x`. +#' @param clobberable_versions_start Optional; as in [`as_epi_archive`] +#' @param versions_end Optional; as in [`as_epi_archive`] +#' @return An `epi_archive` object. +#' @importFrom data.table as.data.table key setkeyv +#' +#' @details +#' Refer to the documentation for [as_epi_archive()] for more information +#' and examples of parameter names. +#' @export +new_epi_archive2 <- function( + x, + geo_type = NULL, + time_type = NULL, + other_keys = NULL, + additional_metadata = NULL, + compactify = NULL, + clobberable_versions_start = NA, + versions_end = NULL) { + assert_data_frame(x) + if (!test_subset(c("geo_value", "time_value", "version"), names(x))) { + cli_abort( + "Columns `geo_value`, `time_value`, and `version` must be present in `x`." + ) + } + if (anyMissing(x$version)) { + cli_abort("Column `version` must not contain missing values.") + } + + # If geo type is missing, then try to guess it + if (missing(geo_type) || is.null(geo_type)) { + geo_type <- guess_geo_type(x$geo_value) + } + + # If time type is missing, then try to guess it + if (missing(time_type) || is.null(time_type)) { + time_type <- guess_time_type(x$time_value) + } + + # Finish off with small checks on keys variables and metadata + if (missing(other_keys)) other_keys <- NULL + if (missing(additional_metadata) || is.null(additional_metadata)) additional_metadata <- list() + if (!test_subset(other_keys, names(x))) { + cli_abort("`other_keys` must be contained in the column names of `x`.") + } + if (any(c("geo_value", "time_value", "version") %in% other_keys)) { + cli_abort("`other_keys` cannot contain \"geo_value\", \"time_value\", or \"version\".") + } + if (any(names(additional_metadata) %in% c("geo_type", "time_type"))) { + cli_warn("`additional_metadata` names overlap with existing metadata fields \"geo_type\", \"time_type\".") + } + + # Conduct checks and apply defaults for `compactify` + if (missing(compactify)) { + compactify <- NULL + } + assert_logical(compactify, len = 1, null.ok = TRUE) + + # Apply defaults and conduct checks for + # `clobberable_versions_start`, `versions_end`: + if (missing(clobberable_versions_start)) { + clobberable_versions_start <- NA + } + if (missing(versions_end) || is.null(versions_end)) { + versions_end <- max_version_with_row_in(x) + } + validate_version_bound(clobberable_versions_start, x, na_ok = TRUE) + validate_version_bound(versions_end, x, na_ok = FALSE) + if (nrow(x) > 0L && versions_end < max(x[["version"]])) { + cli_abort( + sprintf( + "`versions_end` was %s, but `x` contained + updates for a later version or versions, up through %s", + versions_end, max(x[["version"]]) + ), + class = "epiprocess__versions_end_earlier_than_updates" + ) + } + if (!is.na(clobberable_versions_start) && clobberable_versions_start > versions_end) { + cli_abort( + sprintf( + "`versions_end` was %s, but a `clobberable_versions_start` + of %s indicated that there were later observed versions", + versions_end, clobberable_versions_start + ), + class = "epiprocess__versions_end_earlier_than_clobberable_versions_start" + ) + } + + # --- End of validation and replacing missing args with defaults --- + + # Create the data table; if x was an un-keyed data.table itself, + # then the call to as.data.table() will fail to set keys, so we + # need to check this, then do it manually if needed + key_vars <- c("geo_value", "time_value", other_keys, "version") + DT <- as.data.table(x, key = key_vars) + if (!identical(key_vars, key(DT))) setkeyv(DT, cols = key_vars) + + maybe_first_duplicate_key_row_index <- anyDuplicated(DT, by = key(DT)) + if (maybe_first_duplicate_key_row_index != 0L) { + cli_abort("`x` must have one row per unique combination of the key variables. If you + have additional key variables other than `geo_value`, `time_value`, and + `version`, such as an age group column, please specify them in `other_keys`. + Otherwise, check for duplicate rows and/or conflicting values for the same + measurement.", + class = "epiprocess__epi_archive_requires_unique_key" + ) + } + + # Checks to see if a value in a vector is LOCF + is_locf <- function(vec) { + dplyr::if_else(!is.na(vec) & !is.na(dplyr::lag(vec)), + vec == dplyr::lag(vec), + is.na(vec) & is.na(dplyr::lag(vec)) + ) + } + + # LOCF is defined by a row where all values except for the version + # differ from their respective lag values + + # Checks for LOCF's in a data frame + rm_locf <- function(df) { + dplyr::filter(df, if_any(c(everything(), -version), ~ !is_locf(.))) + } + + # Keeps LOCF values, such as to be printed + keep_locf <- function(df) { + dplyr::filter(df, if_all(c(everything(), -version), ~ is_locf(.))) + } + + # Runs compactify on data frame + if (is.null(compactify) || compactify == TRUE) { + elim <- keep_locf(DT) + DT <- rm_locf(DT) + } else { + # Create empty data frame for nrow(elim) to be 0 + elim <- tibble::tibble() + } + + # Warns about redundant rows + if (is.null(compactify) && nrow(elim) > 0) { + warning_intro <- cli::format_inline( + "Found rows that appear redundant based on + last (version of each) observation carried forward; + these rows have been removed to 'compactify' and save space:", + keep_whitespace = FALSE + ) + warning_data <- paste(collapse = "\n", capture.output(print(elim, topn = 3L, nrows = 7L))) + warning_outro <- cli::format_inline( + "Built-in `epi_archive` functionality should be unaffected, + but results may change if you work directly with its fields (such as `DT`). + See `?as_epi_archive` for details. + To silence this warning but keep compactification, + you can pass `compactify=TRUE` when constructing the archive.", + keep_whitespace = FALSE + ) + warning_message <- paste(sep = "\n", warning_intro, warning_data, warning_outro) + rlang::warn(warning_message, class = "epiprocess__compactify_default_removed_rows") + } + + structure( + list( + DT = DT, + geo_type = geo_type, + time_type = time_type, + additional_metadata = additional_metadata, + clobberable_versions_start = clobberable_versions_start, + versions_end = versions_end, + private = list() # TODO: to be encapsulated with guard-rails later + ), + class = "epi_archive2" + ) +} + +#' Print information about an `epi_archive` object +#' @param class Boolean; whether to print the class label header +#' @param methods Boolean; whether to print all available methods of +#' the archive +#' @importFrom cli cli_inform +#' @export +print.epi_archive2 <- function(epi_archive, class = TRUE, methods = TRUE) { + cli_inform( + c( + ">" = if (class) "An `epi_archive` object, with metadata:", + "i" = if (length(setdiff(key(epi_archive$DT), c("geo_value", "time_value", "version"))) > 0) { + "Non-standard DT keys: {setdiff(key(epi_archive$DT), c('geo_value', 'time_value', 'version'))}" + }, + "i" = "Min/max time values: {min(epi_archive$DT$time_value)} / {max(epi_archive$DT$time_value)}", + "i" = "First/last version with update: {min(epi_archive$DT$version)} / {max(epi_archive$DT$version)}", + "i" = if (!is.na(epi_archive$clobberable_versions_start)) { + "Clobberable versions start: {epi_archive$clobberable_versions_start}" + }, + "i" = "Versions end: {epi_archive$versions_end}", + "i" = if (methods) "Public R6 methods: {names(epi_archive$public_methods)}", + "i" = "A preview of the table ({nrow(epi_archive$DT)} rows x {ncol(epi_archive$DT)} columns):" + ) + ) + + return(invisible(epi_archive$DT %>% print())) +} + + +#' @export +as_of <- function(x, ...) { + UseMethod("as_of") +} + + +#' As of epi_archive +#' @description Generates a snapshot in `epi_df` format as of a given version. +#' See the documentation for the wrapper function [`epix_as_of()`] for +#' details. The parameter descriptions below are copied from there +#' @param epi_archive An `epi_archive` object +#' @param max_version Version specifying the max version to permit in the +#' snapshot. That is, the snapshot will comprise the unique rows of the +#' current archive data that represent the most up-to-date signal values, as +#' of the specified `max_version` (and whose `time_value`s are at least +#' `min_time_value`). +#' @param min_time_value Time value specifying the min `time_value` to permit in +#' the snapshot. Default is `-Inf`, which effectively means that there is no +#' minimum considered. +#' @param all_versions Boolean; If `all_versions = TRUE`, then the output will be in +#' `epi_archive` format, and contain rows in the specified `time_value` range +#' having `version <= max_version`. The resulting object will cover a +#' potentially narrower `version` and `time_value` range than `x`, depending +#' on user-provided arguments. Otherwise, there will be one row in the output +#' for the `max_version` of each `time_value`. Default is `FALSE`. +#' @importFrom data.table between key +#' @export +as_of.epi_archive2 <- function(epi_archive, max_version, min_time_value = -Inf, all_versions = FALSE) { + other_keys <- setdiff( + key(epi_archive$DT), + c("geo_value", "time_value", "version") + ) + if (length(other_keys) == 0) other_keys <- NULL + + # Check a few things on max_version + if (!test_set_equal(class(max_version), class(epi_archive$DT$version))) { + cli_abort( + "`max_version` must have the same classes as `epi_archive$DT$version`." + ) + } + if (!test_set_equal(typeof(max_version), typeof(epi_archive$DT$version))) { + cli_abort( + "`max_version` must have the same types as `epi_archive$DT$version`." + ) + } + assert_scalar(max_version, na.ok = FALSE) + if (max_version > epi_archive$versions_end) { + cli_abort("`max_version` must be at most `epi_archive$versions_end`.") + } + assert_logical(all_versions, len = 1) + if (!is.na(epi_archive$clobberable_versions_start) && max_version >= epi_archive$clobberable_versions_start) { + cli_warn( + 'Getting data as of some recent version which could still be + overwritten (under routine circumstances) without assigning a new + version number (a.k.a. "clobbered"). Thus, the snapshot that we + produce here should not be expected to be reproducible later. See + `?epi_archive` for more info and `?epix_as_of` on how to muffle.', + class = "epiprocess__snapshot_as_of_clobberable_version" + ) + } + + # Filter by version and return + if (all_versions) { + # epi_archive is copied into result, so we can modify result directly + result <- epix_truncate_versions_after(epi_archive, max_version) + result$DT <- result$DT[time_value >= min_time_value, ] + return(result) + } + + # Make sure to use data.table ways of filtering and selecting + as_of_epi_df <- epi_archive$DT[time_value >= min_time_value & version <= max_version, ] %>% + unique( + by = c("geo_value", "time_value", other_keys), + fromLast = TRUE + ) %>% + tibble::as_tibble() %>% + dplyr::select(-"version") %>% + as_epi_df( + geo_type = epi_archive$geo_type, + time_type = epi_archive$time_type, + as_of = max_version, + additional_metadata = c(epi_archive$additional_metadata, + other_keys = other_keys + ) + ) + + return(as_of_epi_df) +} + + +#' @export +fill_through_version <- function(x, ...) { + UseMethod("fill_through_version") +} + + +#' Fill through version +#' @description Fill in unobserved history using requested scheme by mutating +#' the given object and potentially reseating its fields. See +#' [`epix_fill_through_version`], which doesn't mutate the input archive but +#' might alias its fields. +#' +#' @param epi_archive an `epi_archive` object +#' @param fill_versions_end as in [`epix_fill_through_version`] +#' @param how as in [`epix_fill_through_version`] +#' +#' @importFrom data.table key setkeyv := address copy +#' @importFrom rlang arg_match +fill_through_version.epi_archive2 <- function( + epi_archive, + fill_versions_end, + how = c("na", "locf")) { + validate_version_bound(fill_versions_end, epi_archive$DT, na_ok = FALSE) + how <- arg_match(how) + if (epi_archive$versions_end < fill_versions_end) { + new_DT <- switch(how, + "na" = { + # old DT + a version consisting of all NA observations + # immediately after the last currently/actually-observed + # version. Note that this NA-observation version must only be + # added if `epi_archive` is outdated. + nonversion_key_cols <- setdiff(key(epi_archive$DT), "version") + nonkey_cols <- setdiff(names(epi_archive$DT), key(epi_archive$DT)) + next_version_tag <- next_after(epi_archive$versions_end) + if (next_version_tag > fill_versions_end) { + cli_abort(sprintf(paste( + "Apparent problem with `next_after` method:", + "archive contained observations through version %s", + "and the next possible version was supposed to be %s,", + "but this appeared to jump from a version < %3$s", + "to one > %3$s, implying at least one version in between." + ), epi_archive$versions_end, next_version_tag, fill_versions_end)) + } + nonversion_key_vals_ever_recorded <- unique(epi_archive$DT, by = nonversion_key_cols) + # In edge cases, the `unique` result can alias the original + # DT; detect and copy if necessary: + if (identical(address(epi_archive$DT), address(nonversion_key_vals_ever_recorded))) { + nonversion_key_vals_ever_recorded <- copy(nonversion_key_vals_ever_recorded) + } + next_version_DT <- nonversion_key_vals_ever_recorded[ + , version := next_version_tag + ][ + # this makes the class of these columns logical (`NA` is a + # logical NA; we're relying on the rbind below to convert to + # the proper class&typeof) + , (nonkey_cols) := NA + ] + # full result DT: + setkeyv(rbind(epi_archive$DT, next_version_DT), key(epi_archive$DT))[] + }, + "locf" = { + # just the old DT; LOCF is built into other methods: + epi_archive$DT + } + ) + new_versions_end <- fill_versions_end + # Update `epi_archive` all at once with simple, error-free operations + + # return below: + epi_archive$DT <- new_DT + epi_archive$versions_end <- new_versions_end + } else { + # Already sufficiently up to date; nothing to do. + } + return(invisible(epi_archive)) +} + + +#' @export +truncate_versions_after <- function(x, ...) { + UseMethod("truncate_versions_after") +} + + +#' Truncate versions after +#' @description Filter to keep only older versions, mutating the archive by +#' potentially reseating but not mutating some fields. `DT` is likely, but not +#' guaranteed, to be copied. Returns the mutated archive +#' [invisibly][base::invisible]. +#' @param epi_archive as in [`epix_truncate_versions_after`] +#' @param max_version as in [`epix_truncate_versions_after`] +truncate_versions_after.epi_archive2 <- function( + epi_archive, + max_version) { + if (!test_set_equal(class(max_version), class(epi_archive$DT$version))) { + cli_abort("`max_version` must have the same classes as `epi_archive$DT$version`.") + } + if (!test_set_equal(typeof(max_version), typeof(epi_archive$DT$version))) { + cli_abort("`max_version` must have the same types as `epi_archive$DT$version`.") + } + assert_scalar(max_version, na.ok = FALSE) + if (max_version > epi_archive$versions_end) { + cli_abort("`max_version` must be at most `epi_archive$versions_end`.") + } + epi_archive$DT <- epi_archive$DT[epi_archive$DT$version <= max_version, colnames(epi_archive$DT), with = FALSE] + # (^ this filter operation seems to always copy the DT, even if it + # keeps every entry; we don't guarantee this behavior in + # documentation, though, so we could change to alias in this case) + if (!is.na(epi_archive$clobberable_versions_start) && epi_archive$clobberable_versions_start > max_version) { + epi_archive$clobberable_versions_start <- NA + } + epi_archive$versions_end <- max_version + return(invisible(epi_archive)) +} + + +#' Merge epi archive +#' @description Merges another `epi_archive` with the current one, mutating the +#' current one by reseating its `DT` and several other fields, but avoiding +#' mutation of the old `DT`; returns the current archive +#' [invisibly][base::invisible]. See [`epix_merge`] for a full description +#' of the non-R6-method version, which does not mutate either archive, and +#' does not alias either archive's `DT`.a +#' @param x as in [`epix_merge`] +#' @param y as in [`epix_merge`] +#' @param sync as in [`epix_merge`] +#' @param compactify as in [`epix_merge`] +merge_epi_archive2 <- function( + x, + y, + sync = c("forbid", "na", "locf", "truncate"), + compactify = TRUE) { + result <- epix_merge(x, y, + sync = sync, + compactify = compactify + ) + + # TODO: Use encapsulating methods instead. + if (length(x$private_fields) != 0L) { + cli_abort("expected no private fields in x", + internal = TRUE + ) + } + + # Mutate fields all at once, trying to avoid any potential errors: + for (field_name in names(x$public_fields)) { + x[[field_name]] <- result[[field_name]] + } + + return(invisible(x)) +} + + +#' `group_by` and related methods for `epi_archive`, `grouped_epi_archive` +#' +#' @param .data An `epi_archive` or `grouped_epi_archive` +#' @param ... Similar to [`dplyr::group_by`] (see "Details:" for edge cases); +#' * For `group_by`: unquoted variable name(s) or other +#' ["data masking"][dplyr::dplyr_data_masking] expression(s). It's possible to +#' use [`dplyr::mutate`]-like syntax here to calculate new columns on which to +#' perform grouping, but note that, if you are regrouping an already-grouped +#' `.data` object, the calculations will be carried out ignoring such grouping +#' (same as [in dplyr][dplyr::group_by]). +#' * For `ungroup`: either +#' * empty, in order to remove the grouping and output an `epi_archive`; or +#' * variable name(s) or other ["tidy-select"][dplyr::dplyr_tidy_select] +#' expression(s), in order to remove the matching variables from the list of +#' grouping variables, and output another `grouped_epi_archive`. +#' @param .add Boolean. If `FALSE`, the default, the output will be grouped by +#' the variable selection from `...` only; if `TRUE`, the output will be +#' grouped by the current grouping variables plus the variable selection from +#' `...`. +#' @param .drop As described in [`dplyr::group_by`]; determines treatment of +#' factor columns. +#' @param x For `groups` or `ungroup`: a `grouped_epi_archive`; for +#' `is_grouped_epi_archive`: any object +#' @param .tbl (For `group_by_drop_default`:) an `epi_archive` or +#' `grouped_epi_archive` (`epi_archive` dispatches to the S3 default method; +#' `grouped_epi_archive` dispatches its own S3 method) +#' +#' @details +#' +#' To match `dplyr`, `group_by` allows "data masking" (also referred to as +#' "tidy evaluation") expressions `...`, not just column names, in a way similar +#' to `mutate`. Note that replacing or removing key columns with these +#' expressions is disabled. +#' +#' `archive %>% group_by()` and other expressions that group or regroup by zero +#' columns (indicating that all rows should be treated as part of one large +#' group) will output a `grouped_epi_archive`, in order to enable the use of +#' `grouped_epi_archive` methods on the result. This is in slight contrast to +#' the same operations on tibbles and grouped tibbles, which will *not* output a +#' `grouped_df` in these circumstances. +#' +#' Using `group_by` with `.add=FALSE` to override the existing grouping is +#' disabled; instead, `ungroup` first then `group_by`. +#' +#' Mutation and aliasing: `group_by` tries to use a shallow copy of the `DT`, +#' introducing column-level aliasing between its input and its result. This +#' doesn't follow the general model for most `data.table` operations, which +#' seems to be that, given an nonaliased (i.e., unique) pointer to a +#' `data.table` object, its pointers to its columns should also be nonaliased. +#' If you mutate any of the columns of either the input or result, first ensure +#' that it is fine if columns of the other are also mutated, but do not rely on +#' such behavior to occur. Additionally, never perform mutation on the key +#' columns at all (except for strictly increasing transformations), as this will +#' invalidate sortedness assumptions about the rows. +#' +#' `group_by_drop_default` on (ungrouped) `epi_archive`s is expected to dispatch +#' to `group_by_drop_default.default` (but there is a dedicated method for +#' `grouped_epi_archive`s). +#' +#' @examples +#' +#' grouped_archive <- archive_cases_dv_subset_2 %>% group_by(geo_value) +#' +#' # `print` for metadata and method listing: +#' grouped_archive %>% print() +#' +#' # The primary use for grouping is to perform a grouped `epix_slide`: +#' +#' archive_cases_dv_subset_2 %>% +#' group_by(geo_value) %>% +#' epix_slide2( +#' f = ~ mean(.x$case_rate_7d_av), +#' before = 2, +#' ref_time_values = as.Date("2020-06-11") + 0:2, +#' new_col_name = "case_rate_3d_av" +#' ) %>% +#' ungroup() +#' +#' # ----------------------------------------------------------------- +#' +#' # Advanced: some other features of dplyr grouping are implemented: +#' +#' library(dplyr) +#' toy_archive <- +#' tribble( +#' ~geo_value, ~age_group, ~time_value, ~version, ~value, +#' "us", "adult", "2000-01-01", "2000-01-02", 121, +#' "us", "pediatric", "2000-01-02", "2000-01-03", 5, # (addition) +#' "us", "adult", "2000-01-01", "2000-01-03", 125, # (revision) +#' "us", "adult", "2000-01-02", "2000-01-03", 130 # (addition) +#' ) %>% +#' mutate( +#' age_group = ordered(age_group, c("pediatric", "adult")), +#' time_value = as.Date(time_value), +#' version = as.Date(version) +#' ) %>% +#' as_epi_archive2(other_keys = "age_group") +#' +#' # The following are equivalent: +#' toy_archive %>% group_by(geo_value, age_group) +#' toy_archive %>% +#' group_by(geo_value) %>% +#' group_by(age_group, .add = TRUE) +#' grouping_cols <- c("geo_value", "age_group") +#' toy_archive %>% group_by(across(all_of(grouping_cols))) +#' +#' # And these are equivalent: +#' toy_archive %>% group_by(geo_value) +#' toy_archive %>% +#' group_by(geo_value, age_group) %>% +#' ungroup(age_group) +#' +#' # To get the grouping variable names as a `list` of `name`s (a.k.a. symbols): +#' toy_archive %>% +#' group_by(geo_value) %>% +#' groups() +#' +#' toy_archive %>% +#' group_by(geo_value, age_group, .drop = FALSE) %>% +#' epix_slide2(f = ~ sum(.x$value), before = 20) %>% +#' ungroup() +#' +#' @importFrom dplyr group_by +#' @export +#' +#' @aliases grouped_epi_archive +group_by.epi_archive2 <- function(epi_archive, ..., .add = FALSE, .drop = dplyr::group_by_drop_default(epi_archive)) { + # `add` makes no difference; this is an ungrouped `epi_archive`. + detailed_mutate <- epix_detailed_restricted_mutate2(epi_archive, ...) + assert_logical(.drop) + if (!.drop) { + grouping_cols <- as.list(detailed_mutate[["archive"]][["DT"]])[detailed_mutate[["request_names"]]] + grouping_col_is_factor <- purrr::map_lgl(grouping_cols, is.factor) + # ^ Use `as.list` to try to avoid any possibility of a deep copy. + if (!any(grouping_col_is_factor)) { + cli_warn( + "`.drop=FALSE` but there are no factor grouping columns; + did you mean to convert one of the columns to a factor beforehand?", + class = "epiprocess__group_by_epi_archive__drop_FALSE_no_factors" + ) + } else if (any(diff(grouping_col_is_factor) == -1L)) { + cli_warn( + "`.drop=FALSE` but there are one or more non-factor grouping columns listed + after a factor grouping column; this may produce groups with `NA`s for these + columns; see https://github.com/tidyverse/dplyr/issues/5369#issuecomment-683762553; + depending on how you want completion to work, you might instead want to convert all + grouping columns to factors beforehand, specify the non-factor grouping columns first, + or use `.drop=TRUE` and add a call to `tidyr::complete`.", + class = "epiprocess__group_by_epi_archive__drop_FALSE_nonfactor_after_factor" + ) + } + } + new_grouped_epi_archive(detailed_mutate[["archive"]], + detailed_mutate[["request_names"]], + drop = .drop + ) +} + + +#' @export +slide <- function(.data, ...) { + UseMethod("slide") +} + + +#' Slide over epi archive +#' @description Slides a given function over variables in an `epi_archive` +#' object. See the documentation for the wrapper function [`epix_slide()`] for +#' details. The parameter descriptions below are copied from there +#' @importFrom data.table key +#' @importFrom rlang !! !!! enquo quo_is_missing enquos is_quosure sym syms +#' @param f Function, formula, or missing; together with `...` specifies the +#' computation to slide. To "slide" means to apply a computation over a +#' sliding (a.k.a. "rolling") time window for each data group. The window is +#' determined by the `before` parameter described below. One time step is +#' typically one day or one week; see [`epi_slide`] details for more +#' explanation. If a function, `f` must take an `epi_df` with the same +#' column names as the archive's `DT`, minus the `version` column; followed +#' by a one-row tibble containing the values of the grouping variables for +#' the associated group; followed by a reference time value, usually as a +#' `Date` object; followed by any number of named arguments. If a formula, +#' `f` can operate directly on columns accessed via `.x$var` or `.$var`, as +#' in `~ mean (.x$var)` to compute a mean of a column `var` for each +#' group-`ref_time_value` combination. The group key can be accessed via +#' `.y` or `.group_key`, and the reference time value can be accessed via +#' `.z` or `.ref_time_value`. If `f` is missing, then `...` will specify the +#' computation. +#' @param ... Additional arguments to pass to the function or formula specified +#' via `f`. Alternatively, if `f` is missing, then `...` is interpreted as an +#' expression for tidy evaluation; in addition to referring to columns +#' directly by name, the expression has access to `.data` and `.env` pronouns +#' as in `dplyr` verbs, and can also refer to the `.group_key` and +#' `.ref_time_value`. See details of [`epi_slide`]. +#' @param before How far `before` each `ref_time_value` should the sliding +#' window extend? If provided, should be a single, non-NA, +#' [integer-compatible][vctrs::vec_cast] number of time steps. This window +#' endpoint is inclusive. For example, if `before = 7`, and one time step is +#' one day, then to produce a value for a `ref_time_value` of January 8, we +#' apply the given function or formula to data (for each group present) with +#' `time_value`s from January 1 onward, as they were reported on January 8. +#' For typical disease surveillance sources, this will not include any data +#' with a `time_value` of January 8, and, depending on the amount of reporting +#' latency, may not include January 7 or even earlier `time_value`s. (If +#' instead the archive were to hold nowcasts instead of regular surveillance +#' data, then we would indeed expect data for `time_value` January 8. If it +#' were to hold forecasts, then we would expect data for `time_value`s after +#' January 8, and the sliding window would extend as far after each +#' `ref_time_value` as needed to include all such `time_value`s.) +#' @param ref_time_values Reference time values / versions for sliding +#' computations; each element of this vector serves both as the anchor point +#' for the `time_value` window for the computation and the `max_version` +#' `as_of` which we fetch data in this window. If missing, then this will set +#' to a regularly-spaced sequence of values set to cover the range of +#' `version`s in the `DT` plus the `versions_end`; the spacing of values will +#' be guessed (using the GCD of the skips between values). +#' @param time_step Optional function used to define the meaning of one time +#' step, which if specified, overrides the default choice based on the +#' `time_value` column. This function must take a positive integer and return +#' an object of class `lubridate::period`. For example, we can use `time_step +#' = lubridate::hours` in order to set the time step to be one hour (this +#' would only be meaningful if `time_value` is of class `POSIXct`). +#' @param new_col_name String indicating the name of the new column that will +#' contain the derivative values. Default is "slide_value"; note that setting +#' `new_col_name` equal to an existing column name will overwrite this column. +#' @param as_list_col Should the slide results be held in a list column, or be +#' [unchopped][tidyr::unchop]/[unnested][tidyr::unnest]? Default is `FALSE`, +#' in which case a list object returned by `f` would be unnested (using +#' [`tidyr::unnest()`]), and, if the slide computations output data frames, +#' the names of the resulting columns are given by prepending `new_col_name` +#' to the names of the list elements. +#' @param names_sep String specifying the separator to use in `tidyr::unnest()` +#' when `as_list_col = FALSE`. Default is "_". Using `NULL` drops the prefix +#' from `new_col_name` entirely. +#' @param all_versions (Not the same as `all_rows` parameter of `epi_slide`.) If +#' `all_versions = TRUE`, then `f` will be passed the version history (all +#' `version <= ref_time_value`) for rows having `time_value` between +#' `ref_time_value - before` and `ref_time_value`. Otherwise, `f` will be +#' passed only the most recent `version` for every unique `time_value`. +#' Default is `FALSE`. +slide.epi_archive2 <- function(epi_archive, f, ..., before, ref_time_values, + time_step, new_col_name = "slide_value", + as_list_col = FALSE, names_sep = "_", + all_versions = FALSE) { + # For an "ungrouped" slide, treat all rows as belonging to one big + # group (group by 0 vars), like `dplyr::summarize`, and let the + # resulting `grouped_epi_archive` handle the slide: + slide( + group_by(epi_archive), + f, + ..., + before = before, ref_time_values = ref_time_values, + time_step = time_step, new_col_name = new_col_name, + as_list_col = as_list_col, names_sep = names_sep, + all_versions = all_versions + ) %>% + # We want a slide on ungrouped archives to output something + # ungrouped, rather than retaining the trivial (0-variable) + # grouping applied above. So we `ungroup()`. However, the current + # `dplyr` implementation automatically ignores/drops trivial + # groupings, so this is just a no-op for now. + ungroup() +} + + +#' Convert to `epi_archive` format +#' +#' Converts a data frame, data table, or tibble into an `epi_archive` +#' object. See the [archive +#' vignette](https://cmu-delphi.github.io/epiprocess/articles/archive.html) for +#' examples. The parameter descriptions below are copied from there +#' +#' @param x A data frame, data table, or tibble, with columns `geo_value`, +#' `time_value`, `version`, and then any additional number of columns. +#' @param geo_type Type for the geo values. If missing, then the function will +#' attempt to infer it from the geo values present; if this fails, then it +#' will be set to "custom". +#' @param time_type Type for the time values. If missing, then the function will +#' attempt to infer it from the time values present; if this fails, then it +#' will be set to "custom". +#' @param other_keys Character vector specifying the names of variables in `x` +#' that should be considered key variables (in the language of `data.table`) +#' apart from "geo_value", "time_value", and "version". +#' @param additional_metadata List of additional metadata to attach to the +#' `epi_archive` object. The metadata will have `geo_type` and `time_type` +#' fields; named entries from the passed list or will be included as well. +#' @param compactify Optional; Boolean or `NULL`: should we remove rows that are +#' considered redundant for the purposes of `epi_archive`'s built-in methods +#' such as `as_of`? As these methods use the last version of each observation +#' carried forward (LOCF) to interpolate between the version data provided, +#' rows that don't change these LOCF results can potentially be omitted to +#' save space. `TRUE` will remove these rows, `FALSE` will not, and missing or +#' `NULL` will remove these rows and issue a warning. Generally, this can be +#' set to `TRUE`, but if you directly inspect or edit the fields of the +#' `epi_archive` such as its `DT`, you will have to determine whether +#' `compactify=TRUE` will produce the desired results. If compactification +#' here is removing a large proportion of the rows, this may indicate a +#' potential for space, time, or bandwidth savings upstream the data pipeline, +#' e.g., when fetching, storing, or preparing the input data `x` +#' @param clobberable_versions_start Optional; `length`-1; either a value of the +#' same `class` and `typeof` as `x$version`, or an `NA` of any `class` and +#' `typeof`: specifically, either (a) the earliest version that could be +#' subject to "clobbering" (being overwritten with different update data, but +#' using the *same* version tag as the old update data), or (b) `NA`, to +#' indicate that no versions are clobberable. There are a variety of reasons +#' why versions could be clobberable under routine circumstances, such as (a) +#' today's version of one/all of the columns being published after initially +#' being filled with `NA` or LOCF, (b) a buggy version of today's data being +#' published but then fixed and republished later in the day, or (c) data +#' pipeline delays (e.g., publisher uploading, periodic scraping, database +#' syncing, periodic fetching, etc.) that make events (a) or (b) reflected +#' later in the day (or even on a different day) than expected; potential +#' causes vary between different data pipelines. The default value is `NA`, +#' which doesn't consider any versions to be clobberable. Another setting that +#' may be appropriate for some pipelines is `max_version_with_row_in(x)`. +#' @param versions_end Optional; length-1, same `class` and `typeof` as +#' `x$version`: what is the last version we have observed? The default is +#' `max_version_with_row_in(x)`, but values greater than this could also be +#' valid, and would indicate that we observed additional versions of the data +#' beyond `max(x$version)`, but they all contained empty updates. (The default +#' value of `clobberable_versions_start` does not fully trust these empty +#' updates, and assumes that any version `>= max(x$version)` could be +#' clobbered.) If `nrow(x) == 0`, then this argument is mandatory. +#' @return An `epi_archive` object. +#' +#' @details This simply a wrapper around the `new()` method of the `epi_archive` +#' class, so for example: +#' ``` +#' x <- as_epi_archive(df, geo_type = "state", time_type = "day") +#' ``` +#' would be equivalent to: +#' ``` +#' x <- epi_archive$new(df, geo_type = "state", time_type = "day") +#' ``` +#' +#' @export +#' @examples +#' # Simple ex. with necessary keys +#' tib <- tibble::tibble( +#' geo_value = rep(c("ca", "hi"), each = 5), +#' time_value = rep(seq(as.Date("2020-01-01"), +#' by = 1, length.out = 5 +#' ), times = 2), +#' version = rep(seq(as.Date("2020-01-02"), +#' by = 1, length.out = 5 +#' ), times = 2), +#' value = rnorm(10, mean = 2, sd = 1) +#' ) +#' +#' toy_epi_archive <- tib %>% as_epi_archive2( +#' geo_type = "state", +#' time_type = "day" +#' ) +#' toy_epi_archive +#' +#' # Ex. with an additional key for county +#' df <- data.frame( +#' geo_value = c(replicate(2, "ca"), replicate(2, "fl")), +#' county = c(1, 3, 2, 5), +#' time_value = c( +#' "2020-06-01", +#' "2020-06-02", +#' "2020-06-01", +#' "2020-06-02" +#' ), +#' version = c( +#' "2020-06-02", +#' "2020-06-03", +#' "2020-06-02", +#' "2020-06-03" +#' ), +#' cases = c(1, 2, 3, 4), +#' cases_rate = c(0.01, 0.02, 0.01, 0.05) +#' ) +#' +#' x <- df %>% as_epi_archive2( +#' geo_type = "state", +#' time_type = "day", +#' other_keys = "county" +#' ) +as_epi_archive2 <- function(x, geo_type, time_type, other_keys, + additional_metadata = list(), + compactify = NULL, + clobberable_versions_start = NA, + versions_end = max_version_with_row_in(x)) { + new_epi_archive2( + x, geo_type, time_type, other_keys, additional_metadata, + compactify, clobberable_versions_start, versions_end + ) +} + +#' Test for `epi_archive` format +#' +#' @param x An object. +#' @param grouped_okay Optional; Boolean; should a `grouped_epi_archive` also +#' count? Default is `FALSE`. +#' @return `TRUE` if the object inherits from `epi_archive`. +#' +#' @export +#' @examples +#' is_epi_archive2(jhu_csse_daily_subset) # FALSE (this is an epi_df, not epi_archive) +#' is_epi_archive2(archive_cases_dv_subset_2) # TRUE +#' +#' # By default, grouped_epi_archives don't count as epi_archives, as they may +#' # support a different set of operations from regular `epi_archives`. This +#' # behavior can be controlled by `grouped_okay`. +#' grouped_archive <- archive_cases_dv_subset_2 %>% group_by(geo_value) +#' is_epi_archive2(grouped_archive) # FALSE +#' is_epi_archive2(grouped_archive, grouped_okay = TRUE) # TRUE +#' +#' @seealso [`is_grouped_epi_archive`] +is_epi_archive2 <- function(x, grouped_okay = FALSE) { + inherits(x, "epi_archive2") || grouped_okay && inherits(x, "grouped_epi_archive2") +} + + +#' @export +clone <- function(x, ...) { + UseMethod("clone") +} + + +#' @export +clone.epi_archive2 <- function(epi_archive, deep = FALSE) { + # TODO: Finish. + if (deep) { + epi_archive$DT <- copy(epi_archive$DT) + } else { + epi_archive$DT <- copy(epi_archive$DT) + } + return(epi_archive) +} diff --git a/R/data.R b/R/data.R index 26b9f39f..37ccc522 100644 --- a/R/data.R +++ b/R/data.R @@ -289,3 +289,11 @@ delayed_assign_with_unregister_awareness( #' * Furthermore, the data has been limited to a very small number of rows, the #' signal names slightly altered, and formatted into a tibble. "jhu_csse_county_level_subset" + +#' @export +"archive_cases_dv_subset_2" + +delayed_assign_with_unregister_awareness( + "archive_cases_dv_subset_2", + as_epi_archive2(archive_cases_dv_subset_dt, compactify = FALSE) +) diff --git a/R/grouped_archive_new.R b/R/grouped_archive_new.R new file mode 100644 index 00000000..c0e6c35e --- /dev/null +++ b/R/grouped_archive_new.R @@ -0,0 +1,456 @@ +#' +#' Convenience function for performing a `tidy_select` on dots according to its +#' docs, and taking the names (rather than the integer indices). +#' +#' @param ... tidyselect-syntax selection description +#' @param .data named vector / data frame; context for the description / the +#' object to which the selections apply +#' @return character vector containing names of entries/columns of +#' `names(.data)` denoting the selection +#' +#' @noRd +eval_pure_select_names_from_dots <- function(..., .data) { + # `?tidyselect::eval_select` tells us to use this form when we take in dots. + # It seems a bit peculiar, since the expr doesn't pack with it a way to get at + # the environment for the dots, but it looks like `eval_select` will assume + # the caller env (our `environment()`) when given an expr, and thus have + # access to the dots. + # + # If we were allowing renaming, we'd need to be careful about which names (new + # vs. old vs. both) to return here. + names(tidyselect::eval_select(rlang::expr(c(...)), .data, allow_rename = FALSE)) +} + +#' Get names of dots without forcing the dots +#' +#' For use in functions that use nonstandard evaluation (NSE) on the dots; we +#' can't use the pattern `names(list(...))` in this case because it will attempt +#' to force/(standard-)evaluate the dots, and we want to avoid attempted forcing of the +#' dots if we're using NSE. +#' +#' @noRd +nse_dots_names <- function(...) { + names(rlang::call_match()) +} +nse_dots_names2 <- function(...) { + rlang::names2(rlang::call_match()) +} + +#' @importFrom dplyr group_by_drop_default +#' @noRd +new_grouped_epi_archive <- function(ungrouped, vars, drop) { + if (inherits(ungrouped, "grouped_epi_archive")) { + cli_abort( + "`ungrouped` must not already be grouped (neither automatic regrouping + nor nested grouping is supported). Either use `group_by` with `.add=TRUE`, + or `ungroup` first.", + class = "epiprocess__grouped_epi_archive__ungrouped_arg_is_already_grouped", + epiprocess__ungrouped_class = class(ungrouped), + epiprocess__ungrouped_groups = groups(ungrouped) + ) + } + assert_class(ungrouped, "epi_archive2") + assert_character(vars) + if (!test_subset(vars, names(ungrouped$DT))) { + cli_abort( + "All grouping variables `vars` must be present in the data.", + ) + } + if ("version" %in% vars) { + cli_abort("`version` has a special interpretation and cannot be used by itself as a grouping variable") + } + assert_logical(drop, len = 1) + + # ----- + private <- list() + private$ungrouped <- ungrouped + private$vars <- vars + private$drop <- drop + + return(structure( + list( + private = private + ), + class = c("grouped_epi_archive2", "epi_archive2") + )) +} + +#' @export +print.grouped_epi_archive2 <- function(grouped_epi_archive, class = TRUE) { + if (class) cat("A `grouped_epi_archive` object:\n") + writeLines(wrap_varnames(grouped_epi_archive$private$vars, initial = "* Groups: ")) + # If none of the grouping vars is a factor, then $drop doesn't seem + # relevant, so try to be less verbose and don't message about it. + # + # Below map-then-extract may look weird, but the more natural + # extract-then-map appears to trigger copies of the extracted columns + # since we are working with a `data.table` (unless we go through + # `as.list`, but its current column-aliasing behavior is probably not + # something to rely too much on), while map functions currently appear + # to avoid column copies. + if (any(purrr::map_lgl(grouped_epi_archive$private$ungrouped$DT, is.factor)[grouped_epi_archive$private$vars])) { + cat(strwrap(init = "* ", prefix = " ", sprintf( + "%s groups formed by factor levels that don't appear in the data", + if (grouped_epi_archive$private$drop) "Drops" else "Does not drop" + ))) + cat("\n") + } + cat("It wraps an ungrouped `epi_archive`, with metadata:\n") + print(grouped_epi_archive$private$ungrouped, class = FALSE) + # Return self invisibly for convenience in `$`-"pipe": + invisible(grouped_epi_archive) +} + +#' @include methods-epi_archive.R +#' @rdname group_by.epi_archive +#' +#' @importFrom dplyr group_by +#' @export +group_by.grouped_epi_archive2 <- function( + grouped_epi_archive, + ..., + .add = FALSE, + .drop = dplyr::group_by_drop_default(grouped_epi_archive)) { + assert_logical(.add, len = 1) + if (!.add) { + cli_abort('`group_by` on a `grouped_epi_archive` with `.add=FALSE` is forbidden + (neither automatic regrouping nor nested grouping is supported). + If you want to "regroup", replacing the existing grouping vars, `ungroup` first and then `group_by`. + If you want to add to the existing grouping vars, call `group_by` specifying `.add=TRUE`. + ', + class = "epiprocess__grouped_epi_archive_group_by_with_add_FALSE" + ) + } else { + # `group_by` `...` computations are performed on ungrouped data (see + # `?dplyr::group_by`) + detailed_mutate <- epix_detailed_restricted_mutate2(grouped_epi_archive$private$ungrouped, ...) + out_ungrouped <- detailed_mutate[["archive"]] + vars_from_dots <- detailed_mutate[["request_names"]] + vars <- union(grouped_epi_archive$private$vars, vars_from_dots) + new_grouped_epi_archive(grouped_epi_archive$private$ungrouped, vars, .drop) + } +} + +#' @include methods-epi_archive.R +#' @rdname group_by.epi_archive +#' +#' @export +group_by_drop_default.grouped_epi_archive2 <- function(grouped_epi_archive) { + grouped_epi_archive$private$drop +} + +#' @include methods-epi_archive.R +#' @rdname group_by.epi_archive +#' +#' @importFrom dplyr groups +#' @export +groups.grouped_epi_archive2 <- function(grouped_epi_archive) { + rlang::syms(grouped_epi_archive$private$vars) +} + +#' @include methods-epi_archive.R +#' @rdname group_by.epi_archive +#' +#' @importFrom dplyr ungroup +#' @export +ungroup.grouped_epi_archive2 <- function(grouped_epi_archive, ...) { + if (rlang::dots_n(...) == 0L) { + # No dots = special behavior: remove all grouping vars and convert to + # an ungrouped class, as with `grouped_df`s. + grouped_epi_archive$private$ungrouped + } else { + exclude_vars <- eval_pure_select_names_from_dots(..., .data = grouped_epi_archive$private$ungrouped$DT) + # (requiring a pure selection here is a little stricter than dplyr + # implementations, but passing a renaming selection into `ungroup` + # seems pretty weird.) + result_vars <- grouped_epi_archive$private$vars[!grouped_epi_archive$private$vars %in% exclude_vars] + # `vars` might be length 0 if the user's tidyselection removed all + # grouping vars. Unlike with tibble, opt here to keep the result as a + # grouped_epi_archive, for output class consistency when `...` is + # provided. + new_grouped_epi_archive(grouped_epi_archive$private$ungrouped, result_vars, grouped_epi_archive$private$drop) + } +} + +#' Truncate versions after a given version, grouped +#' @description Filter to keep only older versions by mutating the underlying +#' `epi_archive` using `$truncate_versions_after`. Returns the mutated +#' `grouped_epi_archive` [invisibly][base::invisible]. +#' @param x as in [`epix_truncate_versions_after`] +#' @param max_version as in [`epix_truncate_versions_after`] +#' @export +truncate_versions_after.grouped_epi_archive2 <- function(grouped_epi_archive, max_version) { + # The grouping is irrelevant for this method; if we were to split into + # groups and recombine appropriately, we should get the same result as + # just leveraging the ungrouped method, so just do the latter: + truncate_versions_after(grouped_epi_archive$private$ungrouped, max_version) + return(invisible(grouped_epi_archive)) +} + +#' Truncate versions after a given version, grouped +#' @export +epix_truncate_versions_after.grouped_epi_archive2 <- function(grouped_epi_archive, max_version) { + cloned_group_epi_archive <- clone(grouped_epi_archive, deep = TRUE) + return((truncate_versions_after(cloned_group_epi_archive, max_version))) + # ^ second set of parens drops invisibility +} + + +#' Slide over grouped epi archive +#' @description Slides a given function over variables in a `grouped_epi_archive` +#' object. See the documentation for the wrapper function [`epix_slide()`] for +#' details. +#' @importFrom data.table key address rbindlist setDF +#' @importFrom tibble as_tibble new_tibble validate_tibble +#' @importFrom dplyr group_by groups +#' @importFrom rlang !! !!! enquo quo_is_missing enquos is_quosure sym syms +#' env missing_arg +#' @export +slide.grouped_epi_archive2 <- function(grouped_epi_archive, f, ..., before, ref_time_values, + time_step, new_col_name = "slide_value", + as_list_col = FALSE, names_sep = "_", + all_versions = FALSE) { + # Perform some deprecated argument checks without using ` = + # deprecated()` in the function signature, because they are from + # early development versions and much more likely to be clutter than + # informative in the signature. + if ("group_by" %in% nse_dots_names(...)) { + cli_abort(" + The `group_by` argument to `slide` has been removed; please use + the `group_by` S3 generic function or `$group_by` R6 method + before the slide instead. (If you were instead trying to pass a + `group_by` argument to `f` or create a column named `group_by`, + this check is a false positive, but you will still need to use a + different column name here and rename the resulting column after + the slide.) + ", class = "epiprocess__epix_slide_group_by_parameter_deprecated") + } + if ("all_rows" %in% nse_dots_names(...)) { + cli_abort(" + The `all_rows` argument has been removed from `epix_slide` (but + is still supported in `epi_slide`). Add rows for excluded + results with a manual join instead. + ", class = "epiprocess__epix_slide_all_rows_parameter_deprecated") + } + + if (missing(ref_time_values)) { + ref_time_values <- epix_slide_ref_time_values_default(grouped_epi_archive$private$ungrouped) + } else { + assert_numeric(ref_time_values, min.len = 1L, null.ok = FALSE, any.missing = FALSE) + if (any(ref_time_values > grouped_epi_archive$private$ungrouped$versions_end)) { + cli_abort("Some `ref_time_values` are greater than the latest version in the archive.") + } + if (anyDuplicated(ref_time_values) != 0L) { + cli_abort("Some `ref_time_values` are duplicated.") + } + # Sort, for consistency with `epi_slide`, although the current + # implementation doesn't take advantage of it. + ref_time_values <- sort(ref_time_values) + } + + # Validate and pre-process `before`: + if (missing(before)) { + cli_abort("`before` is required (and must be passed by name); + if you did not want to apply a sliding window but rather + to map `as_of` and `f` across various `ref_time_values`, + pass a large `before` value (e.g., if time steps are days, + `before=365000`).") + } + before <- vctrs::vec_cast(before, integer()) + assert_int(before, lower = 0L, null.ok = FALSE, na.ok = FALSE) + + # If a custom time step is specified, then redefine units + + if (!missing(time_step)) before <- time_step(before) + + # Symbolize column name + new_col <- sym(new_col_name) + + # Validate rest of parameters: + assert_logical(as_list_col, len = 1L) + assert_logical(all_versions, len = 1L) + assert_character(names_sep, len = 1L, null.ok = TRUE) + + # Computation for one group, one time value + comp_one_grp <- function(.data_group, .group_key, + f, ..., + ref_time_value, + new_col) { + # Carry out the specified computation + comp_value <- f(.data_group, .group_key, ref_time_value, ...) + + if (all_versions) { + # Extract data from archive so we can do length checks below. When + # `all_versions = TRUE`, `.data_group` will always be an ungrouped + # archive because of the preceding `as_of` step. + .data_group <- .data_group$DT + } + + assert( + check_atomic(comp_value, any.missing = TRUE), + check_data_frame(comp_value), + combine = "or", + .var.name = vname(comp_value) + ) + + # Label every result row with the `ref_time_value` + res <- list(time_value = ref_time_value) + + # Wrap the computation output in a list and unchop/unnest later if + # `as_list_col = FALSE`. This approach means that we will get a + # list-class col rather than a data.frame-class col when + # `as_list_col = TRUE` and the computations outputs are data + # frames. + res[[new_col]] <- list(comp_value) + + # Convert the list to a tibble all at once for speed. + return(validate_tibble(new_tibble(res))) + } + + # If `f` is missing, interpret ... as an expression for tidy evaluation + if (missing(f)) { + quos <- enquos(...) + if (length(quos) == 0) { + cli_abort("If `f` is missing then a computation must be specified via `...`.") + } + if (length(quos) > 1) { + cli_abort("If `f` is missing then only a single computation can be specified via `...`.") + } + + f <- quos[[1]] + new_col <- sym(names(rlang::quos_auto_name(quos))) + ... <- missing_arg() # magic value that passes zero args as dots in calls below + } + + f <- as_slide_computation(f, ...) + x <- lapply(ref_time_values, function(ref_time_value) { + # Ungrouped as-of data; `epi_df` if `all_versions` is `FALSE`, + # `epi_archive` if `all_versions` is `TRUE`: + as_of_raw <- as_of(grouped_epi_archive$private$ungrouped, + ref_time_value, + min_time_value = ref_time_value - before, + all_versions = all_versions + ) + + # Set: + # * `as_of_df`, the data.frame/tibble/epi_df/etc. that we will + # `group_modify` as the `.data` argument. Might or might not + # include version column. + # * `group_modify_fn`, the corresponding `.f` argument + if (!all_versions) { + as_of_df <- as_of_raw + group_modify_fn <- comp_one_grp + } else { + as_of_archive <- as_of_raw + # We essentially want to `group_modify` the archive, but + # haven't implemented this method yet. Next best would be + # `group_modify` on its `$DT`, but that has different + # behavior based on whether or not `dtplyr` is loaded. + # Instead, go through an ordinary data frame, trying to avoid + # copies. + if (address(as_of_archive$DT) == address(grouped_epi_archive$private$ungrouped$DT)) { + # `as_of` aliased its the full `$DT`; copy before mutating: + # + # Note: this step is probably unneeded; we're fine with + # aliasing of the DT or its columns: vanilla operations aren't + # going to mutate them in-place if they are aliases, and we're + # not performing mutation (unlike the situation with + # `fill_through_version` where we do mutate a `DT` and don't + # want aliasing). + as_of_archive$DT <- copy(as_of_archive$DT) + } + dt_key <- data.table::key(as_of_archive$DT) + as_of_df <- as_of_archive$DT + data.table::setDF(as_of_df) + + # Convert each subgroup chunk to an archive before running the calculation. + group_modify_fn <- function(.data_group, .group_key, + f, ..., + ref_time_value, + new_col) { + # .data_group is coming from as_of_df as a tibble, but we + # want to feed `comp_one_grp` an `epi_archive` backed by a + # DT; convert and wrap: + data.table::setattr(.data_group, "sorted", dt_key) + data.table::setDT(.data_group, key = dt_key) + .data_group_archive <- clone(as_of_archive) + .data_group_archive$DT <- .data_group + comp_one_grp(.data_group_archive, .group_key, + f = f, ..., + ref_time_value = ref_time_value, + new_col = new_col + ) + } + } + + return( + dplyr::group_modify( + dplyr::group_by(as_of_df, !!!syms(grouped_epi_archive$private$vars), .drop = grouped_epi_archive$private$drop), + group_modify_fn, + f = f, ..., + ref_time_value = ref_time_value, + new_col = new_col, + .keep = TRUE + ) + ) + }) + # Combine output into a single tibble + x <- as_tibble(setDF(rbindlist(x))) + # Reconstruct groups + x <- group_by(x, !!!syms(grouped_epi_archive$private$vars), .drop = grouped_epi_archive$private$drop) + + # Unchop/unnest if we need to + if (!as_list_col) { + x <- tidyr::unnest(x, !!new_col, names_sep = names_sep) + } + + # if (is_epi_df(x)) { + # # The analogue of `epi_df`'s `as_of` metadata for an archive is + # # `$versions_end`, at least in the current absence of + # # separate fields/columns denoting the "archive version" with a + # # different resolution, or from the perspective of a different + # # stage of a data pipeline. The `as_of` that is automatically + # # derived won't always match; override: + # attr(x, "metadata")[["as_of"]] <- private$ungrouped$versions_end + # } + + # XXX We need to work out when we want to return an `epi_df` and how + # to get appropriate keys (see #290, #223, #163). We'll probably + # need the commented-out code above if we ever output an `epi_df`. + # However, as a stopgap measure to have some more consistency across + # different ways of calling `epix_slide`, and to prevent `epi_df` + # output with invalid metadata, always output a (grouped or + # ungrouped) tibble. + x <- decay_epi_df(x) + + return(x) +} + + +# At time of writing, roxygen parses content in collation order, impacting the +# presentation of .Rd files that document multiple functions (see +# https://github.com/r-lib/roxygen2/pull/324). Use @include tags (determining +# `Collate:`) and ordering of functions within each file in order to get the +# desired ordering. + + + +#' @include methods-epi_archive.R +#' @rdname group_by.epi_archive +#' +#' @export +is_grouped_epi_archive2 <- function(x) { + inherits(x, "grouped_epi_archive2") +} + + +#' @export +clone.grouped_epi_archive2 <- function(x, deep = FALSE) { + # TODO: Finish. + if (deep) { + ungrouped <- clone(x$private$ungrouped, deep = TRUE) + } else { + ungrouped <- x$private$ungrouped + } + new_grouped_epi_archive(ungrouped, x$private$vars, x$private$drop) +} diff --git a/R/methods-epi_archive.R b/R/methods-epi_archive.R index 4bcead66..213cf1b1 100644 --- a/R/methods-epi_archive.R +++ b/R/methods-epi_archive.R @@ -470,9 +470,8 @@ dplyr_col_modify.col_modify_recorder_df <- function(data, cols) { #' @noRd epix_detailed_restricted_mutate <- function(.data, ...) { # We don't want to directly use `dplyr::mutate` on the `$DT`, as: - # - this likely copies the entire table # - `mutate` behavior, including the output class, changes depending on - # whether `dtplyr` is loaded and would require post-processing + # whether `dtplyr` < 1.3.0 is loaded and would require post-processing # - behavior with `dtplyr` isn't fully compatible # - it doesn't give the desired details, and `rlang::exprs_auto_name` does not # appropriately handle the `= NULL` and `= ` tidyeval cases diff --git a/R/methods-epi_archive_new.R b/R/methods-epi_archive_new.R new file mode 100644 index 00000000..3ce39afc --- /dev/null +++ b/R/methods-epi_archive_new.R @@ -0,0 +1,826 @@ +#' Generate a snapshot from an `epi_archive` object +#' +#' Generates a snapshot in `epi_df` format from an `epi_archive` object, as of a +#' given version. See the [archive +#' vignette](https://cmu-delphi.github.io/epiprocess/articles/archive.html) for +#' examples. +#' +#' @param x An `epi_archive` object +#' @param max_version Time value specifying the max version to permit in the +#' snapshot. That is, the snapshot will comprise the unique rows of the +#' current archive data that represent the most up-to-date signal values, as +#' of the specified `max_version` (and whose time values are at least +#' `min_time_value`.) +#' @param min_time_value Time value specifying the min time value to permit in +#' the snapshot. Default is `-Inf`, which effectively means that there is no +#' minimum considered. +#' @param all_versions If `all_versions = TRUE`, then the output will be in +#' `epi_archive` format, and contain rows in the specified `time_value` range +#' having `version <= max_version`. The resulting object will cover a +#' potentially narrower `version` and `time_value` range than `x`, depending +#' on user-provided arguments. Otherwise, there will be one row in the output +#' for the `max_version` of each `time_value`. Default is `FALSE`. +#' @return An `epi_df` object. +#' +#' @details This is simply a wrapper around the `as_of()` method of the +#' `epi_archive` class, so if `x` is an `epi_archive` object, then: +#' ``` +#' epix_as_of(x, max_version = v) +#' ``` +#' is equivalent to: +#' ``` +#' x$as_of(max_version = v) +#' ``` +#' +#' Mutation and aliasing: `epix_as_of` and `$as_of` will not mutate the input +#' archives, but may in some edge cases alias parts of the inputs, so copy the +#' outputs if needed before using mutating operations like `data.table`'s `:=` +#' operator. Currently, the only situation where there is potentially aliasing +#' is of the `DT` in edge cases with `all_versions = TRUE`, but this may change +#' in the future. +#' +#' @examples +#' # warning message of data latency shown +#' epix_as_of2( +#' archive_cases_dv_subset_2, +#' max_version = max(archive_cases_dv_subset_2$DT$version) +#' ) +#' +#' range(archive_cases_dv_subset_2$DT$version) # 2020-06-02 -- 2021-12-01 +#' +#' epix_as_of2( +#' archive_cases_dv_subset_2, +#' max_version = as.Date("2020-06-12") +#' ) +#' +#' # When fetching a snapshot as of the latest version with update data in the +#' # archive, a warning is issued by default, as this update data might not yet +#' # be finalized (for example, if data versions are labeled with dates, these +#' # versions might be overwritten throughout the corresponding days with +#' # additional data or "hotfixes" of erroroneous data; when we build an archive +#' # based on database queries, the latest available update might still be +#' # subject to change, but previous versions should be finalized). We can +#' # muffle such warnings with the following pattern: +#' withCallingHandlers( +#' { +#' epix_as_of2( +#' archive_cases_dv_subset_2, +#' max_version = max(archive_cases_dv_subset_2$DT$version) +#' ) +#' }, +#' epiprocess__snapshot_as_of_clobberable_version = function(wrn) invokeRestart("muffleWarning") +#' ) +#' # Since R 4.0, there is a `globalCallingHandlers` function that can be used +#' # to globally toggle these warnings. +#' +#' @export +epix_as_of2 <- function(epi_archive, max_version, min_time_value = -Inf, all_versions = FALSE) { + assert_class(epi_archive, "epi_archive2") + return(as_of(epi_archive, max_version, min_time_value, all_versions = all_versions)) +} + +#' `epi_archive` with unobserved history filled in (won't mutate, might alias) +#' +#' Sometimes, due to upstream data pipeline issues, we have to work with a +#' version history that isn't completely up to date, but with functions that +#' expect archives that are completely up to date, or equally as up-to-date as +#' another archive. This function provides one way to approach such mismatches: +#' pretend that we've "observed" additional versions, filling in these versions +#' with NAs or extrapolated values. +#' +#' '`epix_fill_through_version` will not mutate its `x` argument, but its result +#' might alias fields of `x` (e.g., mutating the result's `DT` might mutate +#' `x$DT`). The R6 method variant, `x$fill_through_version`, will mutate `x` to +#' give the result, but might reseat its fields (e.g., references to the old +#' `x$DT` might not be updated by this function or subsequent operations on +#' `x`), and returns the updated `x` [invisibly][base::invisible]. +#' +#' @param x An `epi_archive` +#' @param fill_versions_end Length-1, same class&type as `x$version`: the +#' version through which to fill in missing version history; this will be the +#' result's `$versions_end` unless it already had a later +#' `$versions_end`. +#' @param how Optional; `"na"` or `"locf"`: `"na"` will fill in any missing +#' required version history with `NA`s, by inserting (if necessary) an update +#' immediately after the current `$versions_end` that revises all +#' existing measurements to be `NA` (this is only supported for `version` +#' classes with a `next_after` implementation); `"locf"` will fill in missing +#' version history with the last version of each observation carried forward +#' (LOCF), by leaving the update `$DT` alone (other `epi_archive` methods are +#' based on LOCF). Default is `"na"`. +#' @return An `epi_archive` +epix_fill_through_version2 <- function(epi_archive, fill_versions_end, + how = c("na", "locf")) { + assert_class(epi_archive, "epi_archive2") + cloned_epi_archive <- clone(epi_archive) + # Enclosing parentheses drop the invisibility flag. See description above of + # potential mutation and aliasing behavior. + (fill_through_version(cloned_epi_archive, fill_versions_end, how = how)) +} + +#' Merge two `epi_archive` objects +#' +#' Merges two `epi_archive`s that share a common `geo_value`, `time_value`, and +#' set of key columns. When they also share a common `versions_end`, +#' using `$as_of` on the result should be the same as using `$as_of` on `x` and +#' `y` individually, then performing a full join of the `DT`s on the non-version +#' key columns (potentially consolidating multiple warnings about clobberable +#' versions). If the `versions_end` values differ, the +#' `sync` parameter controls what is done. +#' +#' This function, [`epix_merge`], does not mutate its inputs and will not alias +#' either archive's `DT`, but may alias other fields; `x$merge` will overwrite +#' `x` with the result of the merge, reseating its `DT` and several other fields +#' (making them point to different objects), but avoiding mutation of the +#' contents of the old `DT` (only relevant if you have another reference to the +#' old `DT` in another object). +#' +#' @param x,y Two `epi_archive` objects to join together. +#' @param sync Optional; `"forbid"`, `"na"`, `"locf"`, or `"truncate"`; in the +#' case that `x$versions_end` doesn't match `y$versions_end`, what do we do?: +#' `"forbid"`: emit an error; "na": use `max(x$versions_end, y$versions_end)` +#' as the result's `versions_end`, but ensure that, if we request a snapshot +#' as of a version after `min(x$versions_end, y$versions_end)`, the +#' observation columns from the less up-to-date archive will be all NAs (i.e., +#' imagine there was an update immediately after its `versions_end` which +#' revised all observations to be `NA`); `"locf"`: use `max(x$versions_end, +#' y$versions_end)` as the result's `versions_end`, allowing the last version +#' of each observation to be carried forward to extrapolate unavailable +#' versions for the less up-to-date input archive (i.e., imagining that in the +#' less up-to-date archive's data set remained unchanged between its actual +#' `versions_end` and the other archive's `versions_end`); or `"truncate"`: +#' use `min(x$versions_end, y$versions_end)` as the result's `versions_end`, +#' and discard any rows containing update rows for later versions. +#' @param compactify Optional; `TRUE`, `FALSE`, or `NULL`; should the result be +#' compactified? See [`as_epi_archive`] for an explanation of what this means. +#' Default here is `TRUE`. +#' @return the resulting `epi_archive` +#' +#' @details In all cases, `additional_metadata` will be an empty list, and +#' `clobberable_versions_start` will be set to the earliest version that could +#' be clobbered in either input archive. +#' +#' @examples +#' # create two example epi_archive datasets +#' x <- archive_cases_dv_subset_2$DT %>% +#' dplyr::select(geo_value, time_value, version, case_rate_7d_av) %>% +#' as_epi_archive2(compactify = TRUE) +#' y <- archive_cases_dv_subset_2$DT %>% +#' dplyr::select(geo_value, time_value, version, percent_cli) %>% +#' as_epi_archive2(compactify = TRUE) +#' # merge results stored in a third object: +#' xy <- epix_merge2(x, y) +#' +#' @importFrom data.table key set setkeyv +#' @export +epix_merge2 <- function(x, y, + sync = c("forbid", "na", "locf", "truncate"), + compactify = TRUE) { + assert_class(x, "epi_archive2") + assert_class(y, "epi_archive2") + sync <- rlang::arg_match(sync) + + if (!identical(x$geo_type, y$geo_type)) { + cli_abort("`x` and `y` must have the same `$geo_type`") + } + + if (!identical(x$time_type, y$time_type)) { + cli_abort("`x` and `y` must have the same `$time_type`") + } + + if (length(x$additional_metadata) != 0L) { + cli_warn("x$additional_metadata won't appear in merge result", + class = "epiprocess__epix_merge_ignores_additional_metadata" + ) + } + if (length(y$additional_metadata) != 0L) { + cli_warn("y$additional_metadata won't appear in merge result", + class = "epiprocess__epix_merge_ignores_additional_metadata" + ) + } + result_additional_metadata <- list() + + result_clobberable_versions_start <- + if (all(is.na(c(x$clobberable_versions_start, y$clobberable_versions_start)))) { + NA # (any type of NA is fine here) + } else { + min_na_rm(c(x$clobberable_versions_start, y$clobberable_versions_start)) + } + + # The actual merge below may not succeed 100% of the time, so do this + # preprocessing using non-mutating (but potentially aliasing) functions. This + # approach potentially uses more memory, but won't leave behind a + # partially-mutated `x` on failure. + if (sync == "forbid") { + if (!identical(x$versions_end, y$versions_end)) { + cli_abort(paste( + "`x` and `y` were not equally up to date version-wise:", + "`x$versions_end` was not identical to `y$versions_end`;", + "either ensure that `x` and `y` are equally up to date before merging,", + "or specify how to deal with this using `sync`" + ), class = "epiprocess__epix_merge_unresolved_sync") + } else { + new_versions_end <- x$versions_end + x_DT <- x$DT + y_DT <- y$DT + } + } else if (sync %in% c("na", "locf")) { + new_versions_end <- max(x$versions_end, y$versions_end) + x_DT <- epix_fill_through_version2(x, new_versions_end, sync)$DT + y_DT <- epix_fill_through_version2(y, new_versions_end, sync)$DT + } else if (sync == "truncate") { + new_versions_end <- min(x$versions_end, y$versions_end) + x_DT <- x$DT[x[["DT"]][["version"]] <= new_versions_end, names(x$DT), with = FALSE] + y_DT <- y$DT[y[["DT"]][["version"]] <= new_versions_end, names(y$DT), with = FALSE] + } else { + cli_abort("unimplemented") + } + + # key(x_DT) should be the same as key(x$DT) and key(y_DT) should be the same + # as key(y$DT). Below, we only use {x,y}_DT in the code (making it easier to + # split the code into separate functions if we wish), but still refer to + # {x,y}$DT in the error messages (further relying on this assumption). + # + # Check&ensure that the above assumption; if it didn't already hold, we likely + # have a bug in the preprocessing, a weird/invalid archive as input, and/or a + # data.table version with different semantics (which may break other parts of + # our code). + x_DT_key_as_expected <- identical(key(x$DT), key(x_DT)) + y_DT_key_as_expected <- identical(key(y$DT), key(y_DT)) + if (!x_DT_key_as_expected || !y_DT_key_as_expected) { + cli_warn(" + `epiprocess` internal warning (please report): pre-processing for + epix_merge unexpectedly resulted in an intermediate data table (or + tables) with a different key than the corresponding input archive. + Manually setting intermediate data table keys to the expected values. + ", internal = TRUE) + setkeyv(x_DT, key(x$DT)) + setkeyv(y_DT, key(y$DT)) + } + # Without some sort of annotations of what various columns represent, we can't + # do something that makes sense when merging archives with mismatched keys. + # E.g., even if we assume extra keys represent demographic breakdowns, a + # sensible default treatment of count-type and rate-type value columns would + # differ. + if (!identical(sort(key(x_DT)), sort(key(y_DT)))) { + cli_abort(" + The archives must have the same set of key column names; if the + key columns represent the same things, just with different + names, please retry after manually renaming to match; if they + represent different things (e.g., x has an age breakdown + but y does not), please retry after processing them to share + the same key (e.g., by summarizing x to remove the age breakdown, + or by applying a static age breakdown to y). + ", class = "epiprocess__epix_merge_x_y_must_have_same_key_set") + } + # `by` cols = result (and each input's) `key` cols, and determine + # the row set, determined using a full join via `merge` + # + # non-`by` cols = "value"-ish cols, and are looked up with last + # version carried forward via rolling joins + by <- key(x_DT) # = some perm of key(y_DT) + if (!all(c("geo_value", "time_value", "version") %in% key(x_DT))) { + cli_abort('Invalid `by`; `by` is currently set to the common `key` of + the two archives, and is expected to contain + "geo_value", "time_value", and "version".', + class = "epiprocess__epi_archive_must_have_required_key_cols" + ) + } + if (length(by) < 1L || utils::tail(by, 1L) != "version") { + cli_abort('Invalid `by`; `by` is currently set to the common `key` of + the two archives, and is expected to have a "version" as + the last key col.', + class = "epiprocess__epi_archive_must_have_version_at_end_of_key" + ) + } + x_nonby_colnames <- setdiff(names(x_DT), by) + y_nonby_colnames <- setdiff(names(y_DT), by) + if (length(intersect(x_nonby_colnames, y_nonby_colnames)) != 0L) { + cli_abort(" + `x` and `y` DTs have overlapping non-by column names; + this is currently not supported; please manually fix up first: + any overlapping columns that can are key-like should be + incorporated into the key, and other columns should be renamed. + ", class = "epiprocess__epix_merge_x_y_must_not_have_overlapping_nonby_colnames") + } + x_by_vals <- x_DT[, by, with = FALSE] + if (anyDuplicated(x_by_vals) != 0L) { + cli_abort(" + The `by` columns must uniquely determine rows of `x$DT`; + the `by` is currently set to the common `key` of the two + archives, so this can be resolved by adding key-like columns + to `x`'s key (to get a unique key). + ", class = "epiprocess__epix_merge_by_cols_must_act_as_unique_key") + } + y_by_vals <- y_DT[, by, with = FALSE] + if (anyDuplicated(y_by_vals) != 0L) { + cli_abort(" + The `by` columns must uniquely determine rows of `y$DT`; + the `by` is currently set to the common `key` of the two + archives, so this can be resolved by adding key-like columns + to `y`'s key (to get a unique key). + ", class = "epiprocess__epix_merge_by_cols_must_act_as_unique_key") + } + result_DT <- merge(x_by_vals, y_by_vals, + by = by, + # We must have `all=TRUE` or we may skip updates + # from x and/or y and corrupt the history + all = TRUE, + # We don't want Cartesian products, but the + # by-is-unique-key check above already ensures + # this. (Note that `allow.cartesian=FALSE` doesn't + # actually catch all Cartesian products anyway.) + # Disable superfluous check: + allow.cartesian = TRUE + ) + set( + result_DT, , x_nonby_colnames, + x_DT[result_DT[, by, with = FALSE], x_nonby_colnames, + with = FALSE, + # It's good practice to specify `on`, and we must + # explicitly specify `on` if there's a potential key vs. + # by order mismatch (not possible currently for x + # with by = key(x$DT), but possible for y): + on = by, + # last version carried forward: + roll = TRUE, + # requesting non-version key that doesn't exist in the other archive, + # or before its first version, should result in NA + nomatch = NA, + # see note on `allow.cartesian` above; currently have a + # similar story here. + allow.cartesian = TRUE + ] + ) + set( + result_DT, , y_nonby_colnames, + y_DT[result_DT[, by, with = FALSE], y_nonby_colnames, + with = FALSE, + on = by, + roll = TRUE, + nomatch = NA, + allow.cartesian = TRUE + ] + ) + # The key could be unset in case of a key vs. by order mismatch as + # noted above. Ensure that we keep it: + setkeyv(result_DT, by) + + return(as_epi_archive2( + result_DT[], # clear data.table internal invisibility flag if set + geo_type = x$geo_type, + time_type = x$time_type, + other_keys = setdiff(key(result_DT), c("geo_value", "time_value", "version")), + additional_metadata = result_additional_metadata, + # It'd probably be better to pre-compactify before the merge, and might be + # guaranteed not to be necessary to compactify the merge result if the + # inputs are already compactified, but at time of writing we don't have + # compactify in its own method or field, and it seems like it should be + # pretty fast anyway. + compactify = compactify, + clobberable_versions_start = result_clobberable_versions_start, + versions_end = new_versions_end + )) +} + +# Helpers for `group_by`: + +#' Make non-testing mock to get [`dplyr::dplyr_col_modify`] input +#' +#' A workaround for `dplyr:::mutate_cols` not being exported and directly +#' applying test mock libraries likely being impossible (due to mocking another +#' package's S3 generic or method). +#' +#' Use solely with a single call to the [`dplyr::mutate`] function and then +#' `destructure_col_modify_recorder_df`; other applicable operations from +#' [dplyr::dplyr_extending] have not been implemented. +#' +#' @param parent_df the "parent class" data frame to wrap +#' @return a `col_modify_recorder_df` +#' +#' @noRd +new_col_modify_recorder_df <- function(parent_df) { + assert_class(parent_df, "data.frame") + `class<-`(parent_df, c("col_modify_recorder_df", class(parent_df))) +} + +#' Extract unchanged parent-class data frame from a `new_col_modify_recorder_df` +#' +#' @param col_modify_recorder_df an instance of a `col_modify_recorder_df` +#' @return named list with elements `unchanged_parent_df`, `cols`; `cols` is the +#' input to [`dplyr::dplyr_col_modify`] that this class was designed to record +#' +#' @noRd +destructure_col_modify_recorder_df <- function(col_modify_recorder_df) { + assert_class(col_modify_recorder_df, "col_modify_recorder_df") + list( + unchanged_parent_df = col_modify_recorder_df %>% + `attr<-`("epiprocess::col_modify_recorder_df::cols", NULL) %>% + `class<-`(setdiff(class(.), "col_modify_recorder_df")), + cols = attr(col_modify_recorder_df, + "epiprocess::col_modify_recorder_df::cols", + exact = TRUE + ) + ) +} + +#' `dplyr_col_modify` method that simply records the `cols` argument +#' +#' Must export S3 methods in R >= 4.0, even if they're only designed to be +#' package internals, and must import any corresponding upstream S3 generic +#' functions: +#' @importFrom dplyr dplyr_col_modify +#' @export +#' @noRd +dplyr_col_modify.col_modify_recorder_df <- function(data, cols) { + if (!is.null(attr(data, "epiprocess::col_modify_recorder_df::cols", exact = TRUE))) { + cli_abort("`col_modify_recorder_df` can only record `cols` once", + internal = TRUE + ) + } + attr(data, "epiprocess::col_modify_recorder_df::cols") <- cols + data +} + +#' A more detailed but restricted `mutate` for use in `group_by.epi_archive` +#' +#' More detailed: provides the names of the "requested" columns in addition to +#' the output expected from a regular `mutate` method. +#' +#' Restricted: doesn't allow replacing or removing key cols, where a sort is +#' potentially required at best and what the output key should be is unclear at +#' worst. (The originally expected restriction was that the `mutate` parameters +#' not present in `group_by` would not be recognized, but the current +#' implementation just lets `mutate` handle these even anyway, even if they're +#' not part of the regular `group_by` parameters; these arguments would have to +#' be passed by names with dot prefixes, so just hope that the user means to use +#' them here if provided.) +#' +#' This can introduce column-level aliasing in `data.table`s, which isn't really +#' intended in the `data.table` user model but we can make it part of our user +#' model (see +#' https://stackoverflow.com/questions/45925482/make-a-shallow-copy-in-data-table +#' and links). +#' +#' Don't export this without cleaning up language of "mutate" as in side effects +#' vs. "mutate" as in `dplyr::mutate`. +#' @noRd +epix_detailed_restricted_mutate2 <- function(.data, ...) { + # We don't want to directly use `dplyr::mutate` on the `$DT`, as: + # - `mutate` behavior, including the output class, changes depending on + # whether `dtplyr` < 1.3.0 is loaded and would require post-processing + # - behavior with `dtplyr` isn't fully compatible + # - it doesn't give the desired details, and `rlang::exprs_auto_name` does not + # appropriately handle the `= NULL` and `= ` tidyeval cases + # Instead: + # - Use `as.list` to get a shallow copy (undocumented, but apparently + # intended, behavior), then `as_tibble` (also shallow, given a list) to get + # back to something that will use `dplyr`'s included `mutate` method(s), + # then convert this using shallow operations into a `data.table`. + # - Use `col_modify_recorder_df` to get the desired details. + in_tbl <- tibble::as_tibble(as.list(.data$DT), .name_repair = "minimal") + col_modify_cols <- + destructure_col_modify_recorder_df( + mutate(new_col_modify_recorder_df(in_tbl), ...) + )[["cols"]] + invalidated_key_col_is <- + which(purrr::map_lgl(key(.data$DT), function(key_colname) { + key_colname %in% names(col_modify_cols) && + !rlang::is_reference(in_tbl[[key_colname]], col_modify_cols[[key_colname]]) + })) + if (length(invalidated_key_col_is) != 0L) { + rlang::abort(paste_lines(c( + "Key columns must not be replaced or removed.", + wrap_varnames(key(.data$DT)[invalidated_key_col_is], + initial = "Flagged key cols: " + ) + ))) + } else { + # Have `dplyr` do the `dplyr_col_modify`, keeping the column-level-aliasing + # and must-copy-on-write-if-refcount-more-than-1 model, obtaining a tibble, + # then convert it into a `data.table`. The key should still be valid + # (assuming that the user did not explicitly alter `key(.data$DT)` or the + # columns by reference somehow within `...` tidyeval-style computations, or + # trigger refcount-1 alterations due to still having >1 refcounts on the + # columns), set the "sorted" attribute accordingly to prevent attempted + # sorting (including potential extra copies) or sortedness checking, then + # `setDT` (rather than `as.data.table`, in order to prevent column copying + # to establish ownership according to `data.table`'s memory model). + out_DT <- dplyr::dplyr_col_modify(in_tbl, col_modify_cols) %>% + data.table::setattr("sorted", data.table::key(.data$DT)) %>% + data.table::setDT(key = key(.data$DT)) + out_archive <- clone(.data) + out_archive$DT <- out_DT + request_names <- names(col_modify_cols) + return(list( + archive = out_archive, + request_names = request_names + )) + # (We might also consider special-casing when `mutate` hands back something + # equivalent (in some sense) to the input (probably only encountered when + # we're dealing with `group_by`), and using just `$DT`, not a shallow copy, + # in the result, primarily in order to hedge against `as.list` or `setDT` + # changing their behavior and generating deep copies somehow. This could + # also prevent storage, and perhaps also generation, of shallow copies, but + # this seems unlikely to be a major gain unless it helps enable some + # in-place modifications of refcount-1 columns (although detecting this case + # seems to be common across `group_by` implementations; maybe there is + # something there).) + } +} + + +#' Slide a function over variables in an `epi_archive` or `grouped_epi_archive` +#' +#' Slides a given function over variables in an `epi_archive` object. This +#' behaves similarly to `epi_slide()`, with the key exception that it is +#' version-aware: the sliding computation at any given reference time t is +#' performed on **data that would have been available as of t**. See the +#' [archive +#' vignette](https://cmu-delphi.github.io/epiprocess/articles/archive.html) for +#' examples. +#' +#' @param x An [`epi_archive`] or [`grouped_epi_archive`] object. If ungrouped, +#' all data in `x` will be treated as part of a single data group. +#' @param f Function, formula, or missing; together with `...` specifies the +#' computation to slide. To "slide" means to apply a computation over a +#' sliding (a.k.a. "rolling") time window for each data group. The window is +#' determined by the `before` parameter described below. One time step is +#' typically one day or one week; see [`epi_slide`] details for more +#' explanation. If a function, `f` must take an `epi_df` with the same +#' column names as the archive's `DT`, minus the `version` column; followed +#' by a one-row tibble containing the values of the grouping variables for +#' the associated group; followed by a reference time value, usually as a +#' `Date` object; followed by any number of named arguments. If a formula, +#' `f` can operate directly on columns accessed via `.x$var` or `.$var`, as +#' in `~ mean (.x$var)` to compute a mean of a column `var` for each +#' group-`ref_time_value` combination. The group key can be accessed via +#' `.y` or `.group_key`, and the reference time value can be accessed via +#' `.z` or `.ref_time_value`. If `f` is missing, then `...` will specify the +#' computation. +#' @param ... Additional arguments to pass to the function or formula specified +#' via `f`. Alternatively, if `f` is missing, then `...` is interpreted as an +#' expression for tidy evaluation; in addition to referring to columns +#' directly by name, the expression has access to `.data` and `.env` pronouns +#' as in `dplyr` verbs, and can also refer to the `.group_key` and +#' `.ref_time_value`. See details of [`epi_slide`]. +#' @param before How far `before` each `ref_time_value` should the sliding +#' window extend? If provided, should be a single, non-NA, +#' [integer-compatible][vctrs::vec_cast] number of time steps. This window +#' endpoint is inclusive. For example, if `before = 7`, and one time step is +#' one day, then to produce a value for a `ref_time_value` of January 8, we +#' apply the given function or formula to data (for each group present) with +#' `time_value`s from January 1 onward, as they were reported on January 8. +#' For typical disease surveillance sources, this will not include any data +#' with a `time_value` of January 8, and, depending on the amount of reporting +#' latency, may not include January 7 or even earlier `time_value`s. (If +#' instead the archive were to hold nowcasts instead of regular surveillance +#' data, then we would indeed expect data for `time_value` January 8. If it +#' were to hold forecasts, then we would expect data for `time_value`s after +#' January 8, and the sliding window would extend as far after each +#' `ref_time_value` as needed to include all such `time_value`s.) +#' @param ref_time_values Reference time values / versions for sliding +#' computations; each element of this vector serves both as the anchor point +#' for the `time_value` window for the computation and the `max_version` +#' `as_of` which we fetch data in this window. If missing, then this will set +#' to a regularly-spaced sequence of values set to cover the range of +#' `version`s in the `DT` plus the `versions_end`; the spacing of values will +#' be guessed (using the GCD of the skips between values). +#' @param time_step Optional function used to define the meaning of one time +#' step, which if specified, overrides the default choice based on the +#' `time_value` column. This function must take a positive integer and return +#' an object of class `lubridate::period`. For example, we can use `time_step +#' = lubridate::hours` in order to set the time step to be one hour (this +#' would only be meaningful if `time_value` is of class `POSIXct`). +#' @param new_col_name String indicating the name of the new column that will +#' contain the derivative values. Default is "slide_value"; note that setting +#' `new_col_name` equal to an existing column name will overwrite this column. +#' @param as_list_col Should the slide results be held in a list column, or be +#' [unchopped][tidyr::unchop]/[unnested][tidyr::unnest]? Default is `FALSE`, +#' in which case a list object returned by `f` would be unnested (using +#' [`tidyr::unnest()`]), and, if the slide computations output data frames, +#' the names of the resulting columns are given by prepending `new_col_name` +#' to the names of the list elements. +#' @param names_sep String specifying the separator to use in `tidyr::unnest()` +#' when `as_list_col = FALSE`. Default is "_". Using `NULL` drops the prefix +#' from `new_col_name` entirely. +#' @param all_versions (Not the same as `all_rows` parameter of `epi_slide`.) If +#' `all_versions = TRUE`, then `f` will be passed the version history (all +#' `version <= ref_time_value`) for rows having `time_value` between +#' `ref_time_value - before` and `ref_time_value`. Otherwise, `f` will be +#' passed only the most recent `version` for every unique `time_value`. +#' Default is `FALSE`. +#' @return A tibble whose columns are: the grouping variables, `time_value`, +#' containing the reference time values for the slide computation, and a +#' column named according to the `new_col_name` argument, containing the slide +#' values. +#' +#' @details A few key distinctions between the current function and `epi_slide()`: +#' 1. In `f` functions for `epix_slide`, one should not assume that the input +#' data to contain any rows with `time_value` matching the computation's +#' `ref_time_value` (accessible via `attributes()$metadata$as_of`); for +#' typical epidemiological surveillance data, observations pertaining to a +#' particular time period (`time_value`) are first reported `as_of` some +#' instant after that time period has ended. +#' 2. `epix_slide()` doesn't accept an `after` argument; its windows extend +#' from `before` time steps before a given `ref_time_value` through the last +#' `time_value` available as of version `ref_time_value` (typically, this +#' won't include `ref_time_value` itself, as observations about a particular +#' time interval (e.g., day) are only published after that time interval +#' ends); `epi_slide` windows extend from `before` time steps before a +#' `ref_time_value` through `after` time steps after `ref_time_value`. +#' 3. The input class and columns are similar but different: `epix_slide` +#' (with the default `all_versions=FALSE`) keeps all columns and the +#' `epi_df`-ness of the first argument to each computation; `epi_slide` only +#' provides the grouping variables in the second input, and will convert the +#' first input into a regular tibble if the grouping variables include the +#' essential `geo_value` column. (With `all_versions=TRUE`, `epix_slide` will +#' will provide an `epi_archive` rather than an `epi-df` to each +#' computation.) +#' 4. The output class and columns are similar but different: `epix_slide()` +#' returns a tibble containing only the grouping variables, `time_value`, and +#' the new column(s) from the slide computations, whereas `epi_slide()` +#' returns an `epi_df` with all original variables plus the new columns from +#' the slide computations. (Both will mirror the grouping or ungroupedness of +#' their input, with one exception: `epi_archive`s can have trivial +#' (zero-variable) groupings, but these will be dropped in `epix_slide` +#' results as they are not supported by tibbles.) +#' 5. There are no size stability checks or element/row recycling to maintain +#' size stability in `epix_slide`, unlike in `epi_slide`. (`epix_slide` is +#' roughly analogous to [`dplyr::group_modify`], while `epi_slide` is roughly +#' analogous to `dplyr::mutate` followed by `dplyr::arrange`) This is detailed +#' in the "advanced" vignette. +#' 6. `all_rows` is not supported in `epix_slide`; since the slide +#' computations are allowed more flexibility in their outputs than in +#' `epi_slide`, we can't guess a good representation for missing computations +#' for excluded group-`ref_time_value` pairs. +#' 7. The `ref_time_values` default for `epix_slide` is based on making an +#' evenly-spaced sequence out of the `version`s in the `DT` plus the +#' `versions_end`, rather than the `time_value`s. +#' +#' Apart from the above distinctions, the interfaces between `epix_slide()` and +#' `epi_slide()` are the same. +#' +#' Furthermore, the current function can be considerably slower than +#' `epi_slide()`, for two reasons: (1) it must repeatedly fetch +#' properly-versioned snapshots from the data archive (via its `as_of()` +#' method), and (2) it performs a "manual" sliding of sorts, and does not +#' benefit from the highly efficient `slider` package. For this reason, it +#' should never be used in place of `epi_slide()`, and only used when +#' version-aware sliding is necessary (as it its purpose). +#' +#' Finally, this is simply a wrapper around the `slide()` method of the +#' `epi_archive` and `grouped_epi_archive` classes, so if `x` is an +#' object of either of these classes, then: +#' ``` +#' epix_slide(x, new_var = comp(old_var), before = 119) +#' ``` +#' is equivalent to: +#' ``` +#' x$slide(new_var = comp(old_var), before = 119) +#' ``` +#' +#' Mutation and aliasing: `epix_slide` and `$slide` will not perform in-place +#' mutation of the input archives on their own. In some edge cases the inputs it +#' feeds to the slide computations may alias parts of the input archive, so copy +#' the slide computation inputs if needed before using mutating operations like +#' `data.table`'s `:=` operator. Similarly, in some edge cases, the output of +#' the slide operation may alias parts of the input archive, so similarly, make +#' sure to clone and/or copy appropriately before using in-place mutation. +#' +#' @examples +#' library(dplyr) +#' +#' # Reference time points for which we want to compute slide values: +#' ref_time_values <- seq(as.Date("2020-06-01"), +#' as.Date("2020-06-15"), +#' by = "1 day" +#' ) +#' +#' # A simple (but not very useful) example (see the archive vignette for a more +#' # realistic one): +#' archive_cases_dv_subset_2 %>% +#' group_by(geo_value) %>% +#' epix_slide2( +#' f = ~ mean(.x$case_rate_7d_av), +#' before = 2, +#' ref_time_values = ref_time_values, +#' new_col_name = "case_rate_7d_av_recent_av" +#' ) %>% +#' ungroup() +#' # We requested time windows that started 2 days before the corresponding time +#' # values. The actual number of `time_value`s in each computation depends on +#' # the reporting latency of the signal and `time_value` range covered by the +#' # archive (2020-06-01 -- 2021-11-30 in this example). In this case, we have +#' # * 0 `time_value`s, for ref time 2020-06-01 --> the result is automatically +#' # discarded +#' # * 1 `time_value`, for ref time 2020-06-02 +#' # * 2 `time_value`s, for the rest of the results +#' # * never the 3 `time_value`s we would get from `epi_slide`, since, because +#' # of data latency, we'll never have an observation +#' # `time_value == ref_time_value` as of `ref_time_value`. +#' # The example below shows this type of behavior in more detail. +#' +#' # Examining characteristics of the data passed to each computation with +#' # `all_versions=FALSE`. +#' archive_cases_dv_subset_2 %>% +#' group_by(geo_value) %>% +#' epix_slide2( +#' function(x, gk, rtv) { +#' tibble( +#' time_range = if (nrow(x) == 0L) { +#' "0 `time_value`s" +#' } else { +#' sprintf("%s -- %s", min(x$time_value), max(x$time_value)) +#' }, +#' n = nrow(x), +#' class1 = class(x)[[1L]] +#' ) +#' }, +#' before = 5, all_versions = FALSE, +#' ref_time_values = ref_time_values, names_sep = NULL +#' ) %>% +#' ungroup() %>% +#' arrange(geo_value, time_value) +#' +#' # --- Advanced: --- +#' +#' # `epix_slide` with `all_versions=FALSE` (the default) applies a +#' # version-unaware computation to several versions of the data. We can also +#' # use `all_versions=TRUE` to apply a version-*aware* computation to several +#' # versions of the data, again looking at characteristics of the data passed +#' # to each computation. In this case, each computation should expect an +#' # `epi_archive` containing the relevant version data: +#' +#' archive_cases_dv_subset_2 %>% +#' group_by(geo_value) %>% +#' epix_slide2( +#' function(x, gk, rtv) { +#' tibble( +#' versions_start = if (nrow(x$DT) == 0L) { +#' "NA (0 rows)" +#' } else { +#' toString(min(x$DT$version)) +#' }, +#' versions_end = x$versions_end, +#' time_range = if (nrow(x$DT) == 0L) { +#' "0 `time_value`s" +#' } else { +#' sprintf("%s -- %s", min(x$DT$time_value), max(x$DT$time_value)) +#' }, +#' n = nrow(x$DT), +#' class1 = class(x)[[1L]] +#' ) +#' }, +#' before = 5, all_versions = TRUE, +#' ref_time_values = ref_time_values, names_sep = NULL +#' ) %>% +#' ungroup() %>% +#' # Focus on one geo_value so we can better see the columns above: +#' filter(geo_value == "ca") %>% +#' select(-geo_value) +#' +#' @importFrom rlang enquo !!! +#' @export +epix_slide2 <- function(x, f, ..., before, ref_time_values, + time_step, new_col_name = "slide_value", + as_list_col = FALSE, names_sep = "_", + all_versions = FALSE) { + if (!is_epi_archive2(x, grouped_okay = TRUE)) { + cli_abort("`x` must be of class `epi_archive` or `grouped_epi_archive`.") + } + return(slide(x, f, ..., + before = before, + ref_time_values = ref_time_values, + time_step = time_step, + new_col_name = new_col_name, + as_list_col = as_list_col, + names_sep = names_sep, + all_versions = all_versions + )) +} + + +#' Filter an `epi_archive` object to keep only older versions +#' +#' Generates a filtered `epi_archive` from an `epi_archive` object, keeping +#' only rows with `version` falling on or before a specified date. +#' +#' @param x An `epi_archive` object +#' @param max_version Time value specifying the max version to permit in the +#' filtered archive. That is, the output archive will comprise rows of the +#' current archive data having `version` less than or equal to the +#' specified `max_version` +#' @return An `epi_archive` object +#' +#' @export +epix_truncate_versions_after <- function(x, max_version) { + UseMethod("epix_truncate_versions_after") +} + +#' @export +epix_truncate_versions_after.epi_archive2 <- function(x, max_version) { + cloned_epi_archive <- clone(x) + return((truncate_versions_after(x, max_version))) + # ^ second set of parens drops invisibility +} diff --git a/R/sysdata.rda b/R/sysdata.rda index d100711d..a2c5abfd 100644 Binary files a/R/sysdata.rda and b/R/sysdata.rda differ diff --git a/man/as_epi_archive2.Rd b/man/as_epi_archive2.Rd new file mode 100644 index 00000000..bc3f5185 --- /dev/null +++ b/man/as_epi_archive2.Rd @@ -0,0 +1,142 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/archive_new.R +\name{as_epi_archive2} +\alias{as_epi_archive2} +\title{Convert to \code{epi_archive} format} +\usage{ +as_epi_archive2( + x, + geo_type, + time_type, + other_keys, + additional_metadata = list(), + compactify = NULL, + clobberable_versions_start = NA, + versions_end = max_version_with_row_in(x) +) +} +\arguments{ +\item{x}{A data frame, data table, or tibble, with columns \code{geo_value}, +\code{time_value}, \code{version}, and then any additional number of columns.} + +\item{geo_type}{Type for the geo values. If missing, then the function will +attempt to infer it from the geo values present; if this fails, then it +will be set to "custom".} + +\item{time_type}{Type for the time values. If missing, then the function will +attempt to infer it from the time values present; if this fails, then it +will be set to "custom".} + +\item{other_keys}{Character vector specifying the names of variables in \code{x} +that should be considered key variables (in the language of \code{data.table}) +apart from "geo_value", "time_value", and "version".} + +\item{additional_metadata}{List of additional metadata to attach to the +\code{epi_archive} object. The metadata will have \code{geo_type} and \code{time_type} +fields; named entries from the passed list or will be included as well.} + +\item{compactify}{Optional; Boolean or \code{NULL}: should we remove rows that are +considered redundant for the purposes of \code{epi_archive}'s built-in methods +such as \code{as_of}? As these methods use the last version of each observation +carried forward (LOCF) to interpolate between the version data provided, +rows that don't change these LOCF results can potentially be omitted to +save space. \code{TRUE} will remove these rows, \code{FALSE} will not, and missing or +\code{NULL} will remove these rows and issue a warning. Generally, this can be +set to \code{TRUE}, but if you directly inspect or edit the fields of the +\code{epi_archive} such as its \code{DT}, you will have to determine whether +\code{compactify=TRUE} will produce the desired results. If compactification +here is removing a large proportion of the rows, this may indicate a +potential for space, time, or bandwidth savings upstream the data pipeline, +e.g., when fetching, storing, or preparing the input data \code{x}} + +\item{clobberable_versions_start}{Optional; \code{length}-1; either a value of the +same \code{class} and \code{typeof} as \code{x$version}, or an \code{NA} of any \code{class} and +\code{typeof}: specifically, either (a) the earliest version that could be +subject to "clobbering" (being overwritten with different update data, but +using the \emph{same} version tag as the old update data), or (b) \code{NA}, to +indicate that no versions are clobberable. There are a variety of reasons +why versions could be clobberable under routine circumstances, such as (a) +today's version of one/all of the columns being published after initially +being filled with \code{NA} or LOCF, (b) a buggy version of today's data being +published but then fixed and republished later in the day, or (c) data +pipeline delays (e.g., publisher uploading, periodic scraping, database +syncing, periodic fetching, etc.) that make events (a) or (b) reflected +later in the day (or even on a different day) than expected; potential +causes vary between different data pipelines. The default value is \code{NA}, +which doesn't consider any versions to be clobberable. Another setting that +may be appropriate for some pipelines is \code{max_version_with_row_in(x)}.} + +\item{versions_end}{Optional; length-1, same \code{class} and \code{typeof} as +\code{x$version}: what is the last version we have observed? The default is +\code{max_version_with_row_in(x)}, but values greater than this could also be +valid, and would indicate that we observed additional versions of the data +beyond \code{max(x$version)}, but they all contained empty updates. (The default +value of \code{clobberable_versions_start} does not fully trust these empty +updates, and assumes that any version \verb{>= max(x$version)} could be +clobbered.) If \code{nrow(x) == 0}, then this argument is mandatory.} +} +\value{ +An \code{epi_archive} object. +} +\description{ +Converts a data frame, data table, or tibble into an \code{epi_archive} +object. See the \href{https://cmu-delphi.github.io/epiprocess/articles/archive.html}{archive vignette} for +examples. The parameter descriptions below are copied from there +} +\details{ +This simply a wrapper around the \code{new()} method of the \code{epi_archive} +class, so for example: + +\if{html}{\out{
}}\preformatted{x <- as_epi_archive(df, geo_type = "state", time_type = "day") +}\if{html}{\out{
}} + +would be equivalent to: + +\if{html}{\out{
}}\preformatted{x <- epi_archive$new(df, geo_type = "state", time_type = "day") +}\if{html}{\out{
}} +} +\examples{ +# Simple ex. with necessary keys +tib <- tibble::tibble( + geo_value = rep(c("ca", "hi"), each = 5), + time_value = rep(seq(as.Date("2020-01-01"), + by = 1, length.out = 5 + ), times = 2), + version = rep(seq(as.Date("2020-01-02"), + by = 1, length.out = 5 + ), times = 2), + value = rnorm(10, mean = 2, sd = 1) +) + +toy_epi_archive <- tib \%>\% as_epi_archive2( + geo_type = "state", + time_type = "day" +) +toy_epi_archive + +# Ex. with an additional key for county +df <- data.frame( + geo_value = c(replicate(2, "ca"), replicate(2, "fl")), + county = c(1, 3, 2, 5), + time_value = c( + "2020-06-01", + "2020-06-02", + "2020-06-01", + "2020-06-02" + ), + version = c( + "2020-06-02", + "2020-06-03", + "2020-06-02", + "2020-06-03" + ), + cases = c(1, 2, 3, 4), + cases_rate = c(0.01, 0.02, 0.01, 0.05) +) + +x <- df \%>\% as_epi_archive2( + geo_type = "state", + time_type = "day", + other_keys = "county" +) +} diff --git a/man/as_of.epi_archive2.Rd b/man/as_of.epi_archive2.Rd new file mode 100644 index 00000000..21a4cfc1 --- /dev/null +++ b/man/as_of.epi_archive2.Rd @@ -0,0 +1,33 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/archive_new.R +\name{as_of.epi_archive2} +\alias{as_of.epi_archive2} +\title{As of epi_archive} +\usage{ +\method{as_of}{epi_archive2}(epi_archive, max_version, min_time_value = -Inf, all_versions = FALSE) +} +\arguments{ +\item{epi_archive}{An \code{epi_archive} object} + +\item{max_version}{Version specifying the max version to permit in the +snapshot. That is, the snapshot will comprise the unique rows of the +current archive data that represent the most up-to-date signal values, as +of the specified \code{max_version} (and whose \code{time_value}s are at least +\code{min_time_value}).} + +\item{min_time_value}{Time value specifying the min \code{time_value} to permit in +the snapshot. Default is \code{-Inf}, which effectively means that there is no +minimum considered.} + +\item{all_versions}{Boolean; If \code{all_versions = TRUE}, then the output will be in +\code{epi_archive} format, and contain rows in the specified \code{time_value} range +having \code{version <= max_version}. The resulting object will cover a +potentially narrower \code{version} and \code{time_value} range than \code{x}, depending +on user-provided arguments. Otherwise, there will be one row in the output +for the \code{max_version} of each \code{time_value}. Default is \code{FALSE}.} +} +\description{ +Generates a snapshot in \code{epi_df} format as of a given version. +See the documentation for the wrapper function \code{\link[=epix_as_of]{epix_as_of()}} for +details. The parameter descriptions below are copied from there +} diff --git a/man/epi_archive.Rd b/man/epi_archive.Rd index 6a25b2af..86e21b89 100644 --- a/man/epi_archive.Rd +++ b/man/epi_archive.Rd @@ -1,9 +1,14 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/archive.R +% Please edit documentation in R/archive.R, R/archive_new.R \name{epi_archive} \alias{epi_archive} \title{\code{epi_archive} object} \description{ +An \code{epi_archive} is an R6 class which contains a data table +along with several relevant pieces of metadata. The data table can be seen +as the full archive (version history) for some signal variables of +interest. + An \code{epi_archive} is an R6 class which contains a data table along with several relevant pieces of metadata. The data table can be seen as the full archive (version history) for some signal variables of @@ -49,6 +54,56 @@ represent potential update data that we do not yet have access to; or in version in which it was first released, or if no version of that observation appears in the archive data at all. +\strong{A word of caution:} R6 objects, unlike most other objects in R, have +reference semantics. A primary consequence of this is that objects are not +copied when modified. You can read more about this in Hadley Wickham's +\href{https://adv-r.hadley.nz/r6.html#r6-semantics}{Advanced R} book. In order +to construct a modified archive while keeping the original intact, first +make a clone using the \verb{$clone} method, then overwrite the clone's \code{DT} +field with \code{data.table::copy(clone$DT)}, and finally perform the +modifications on the clone. + +epi archive + +An \code{epi_archive} is an R6 class which contains a data table \code{DT}, of +class \code{data.table} from the \code{data.table} package, with (at least) the +following columns: +\itemize{ +\item \code{geo_value}: the geographic value associated with each row of measurements. +\item \code{time_value}: the time value associated with each row of measurements. +\item \code{version}: the time value specifying the version for each row of +measurements. For example, if in a given row the \code{version} is January 15, +2022 and \code{time_value} is January 14, 2022, then this row contains the +measurements of the data for January 14, 2022 that were available one day +later. +} + +The data table \code{DT} has key variables \code{geo_value}, \code{time_value}, \code{version}, +as well as any others (these can be specified when instantiating the +\code{epi_archive} object via the \code{other_keys} argument, and/or set by operating +on \code{DT} directly). Refer to the documentation for \code{\link[=as_epi_archive]{as_epi_archive()}} for +information and examples of relevant parameter names for an \code{epi_archive} object. +Note that there can only be a single row per unique combination of +key variables, and thus the key variables are critical for figuring out how +to generate a snapshot of data from the archive, as of a given version. + +In general, the last version of each observation is carried forward (LOCF) to +fill in data between recorded versions, and between the last recorded +update and the \code{versions_end}. One consequence is that the \code{DT} +doesn't have to contain a full snapshot of every version (although this +generally works), but can instead contain only the rows that are new or +changed from the previous version (see \code{compactify}, which does this +automatically). Currently, deletions must be represented as revising a row +to a special state (e.g., making the entries \code{NA} or including a special +column that flags the data as removed and performing some kind of +post-processing), and the archive is unaware of what this state is. Note +that \code{NA}s \emph{can} be introduced by \code{epi_archive} methods for other reasons, +e.g., in \code{\link{epix_fill_through_version}} and \code{\link{epix_merge}}, if requested, to +represent potential update data that we do not yet have access to; or in +\code{\link{epix_merge}} to represent the "value" of an observation before the +version in which it was first released, or if no version of that +observation appears in the archive data at all. + \strong{A word of caution:} R6 objects, unlike most other objects in R, have reference semantics. A primary consequence of this is that objects are not copied when modified. You can read more about this in Hadley Wickham's @@ -60,6 +115,22 @@ modifications on the clone. } \section{Metadata}{ +The following pieces of metadata are included as fields in an \code{epi_archive} +object: +\itemize{ +\item \code{geo_type}: the type for the geo values. +\item \code{time_type}: the type for the time values. +\item \code{additional_metadata}: list of additional metadata for the data archive. +} + +Unlike an \code{epi_df} object, metadata for an \code{epi_archive} object \code{x} can be +accessed (and altered) directly, as in \code{x$geo_type} or \code{x$time_type}, +etc. Like an \code{epi_df} object, the \code{geo_type} and \code{time_type} fields in the +metadata of an \code{epi_archive} object are not currently used by any +downstream functions in the \code{epiprocess} package, and serve only as useful +bits of information to convey about the data set at hand. + + The following pieces of metadata are included as fields in an \code{epi_archive} object: \itemize{ @@ -78,6 +149,13 @@ bits of information to convey about the data set at hand. \section{Generating Snapshots}{ +An \code{epi_archive} object can be used to generate a snapshot of the data in +\code{epi_df} format, which represents the most up-to-date values of the signal +variables, as of the specified version. This is accomplished by calling the +\code{as_of()} method for an \code{epi_archive} object \code{x}. More details on this +method are documented in the wrapper function \code{\link[=epix_as_of]{epix_as_of()}}. + + An \code{epi_archive} object can be used to generate a snapshot of the data in \code{epi_df} format, which represents the most up-to-date values of the signal variables, as of the specified version. This is accomplished by calling the @@ -87,6 +165,16 @@ method are documented in the wrapper function \code{\link[=epix_as_of]{epix_as_o \section{Sliding Computations}{ +We can run a sliding computation over an \code{epi_archive} object, much like +\code{epi_slide()} does for an \code{epi_df} object. This is accomplished by calling +the \code{slide()} method for an \code{epi_archive} object, which works similarly to +the way \code{epi_slide()} works for an \code{epi_df} object, but with one key +difference: it is version-aware. That is, for an \code{epi_archive} object, the +sliding computation at any given reference time point t is performed on +\strong{data that would have been available as of t}. More details on \code{slide()} +are documented in the wrapper function \code{\link[=epix_slide]{epix_slide()}}. + + We can run a sliding computation over an \code{epi_archive} object, much like \code{epi_slide()} does for an \code{epi_df} object. This is accomplished by calling the \code{slide()} method for an \code{epi_archive} object, which works similarly to @@ -114,6 +202,22 @@ toy_epi_archive <- tib \%>\% epi_archive$new( time_type = "day" ) toy_epi_archive +tib <- tibble::tibble( + geo_value = rep(c("ca", "hi"), each = 5), + time_value = rep(seq(as.Date("2020-01-01"), + by = 1, length.out = 5 + ), times = 2), + version = rep(seq(as.Date("2020-01-02"), + by = 1, length.out = 5 + ), times = 2), + value = rnorm(10, mean = 2, sd = 1) +) + +toy_epi_archive <- tib \%>\% new_epi_archive2( + geo_type = "state", + time_type = "day" +) +toy_epi_archive } \section{Public fields}{ \if{html}{\out{
}} diff --git a/man/epix_as_of2.Rd b/man/epix_as_of2.Rd new file mode 100644 index 00000000..ac69e9a9 --- /dev/null +++ b/man/epix_as_of2.Rd @@ -0,0 +1,95 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/methods-epi_archive_new.R +\name{epix_as_of2} +\alias{epix_as_of2} +\title{Generate a snapshot from an \code{epi_archive} object} +\usage{ +epix_as_of2( + epi_archive, + max_version, + min_time_value = -Inf, + all_versions = FALSE +) +} +\arguments{ +\item{max_version}{Time value specifying the max version to permit in the +snapshot. That is, the snapshot will comprise the unique rows of the +current archive data that represent the most up-to-date signal values, as +of the specified \code{max_version} (and whose time values are at least +\code{min_time_value}.)} + +\item{min_time_value}{Time value specifying the min time value to permit in +the snapshot. Default is \code{-Inf}, which effectively means that there is no +minimum considered.} + +\item{all_versions}{If \code{all_versions = TRUE}, then the output will be in +\code{epi_archive} format, and contain rows in the specified \code{time_value} range +having \code{version <= max_version}. The resulting object will cover a +potentially narrower \code{version} and \code{time_value} range than \code{x}, depending +on user-provided arguments. Otherwise, there will be one row in the output +for the \code{max_version} of each \code{time_value}. Default is \code{FALSE}.} + +\item{x}{An \code{epi_archive} object} +} +\value{ +An \code{epi_df} object. +} +\description{ +Generates a snapshot in \code{epi_df} format from an \code{epi_archive} object, as of a +given version. See the \href{https://cmu-delphi.github.io/epiprocess/articles/archive.html}{archive vignette} for +examples. +} +\details{ +This is simply a wrapper around the \code{as_of()} method of the +\code{epi_archive} class, so if \code{x} is an \code{epi_archive} object, then: + +\if{html}{\out{
}}\preformatted{epix_as_of(x, max_version = v) +}\if{html}{\out{
}} + +is equivalent to: + +\if{html}{\out{
}}\preformatted{x$as_of(max_version = v) +}\if{html}{\out{
}} + +Mutation and aliasing: \code{epix_as_of} and \verb{$as_of} will not mutate the input +archives, but may in some edge cases alias parts of the inputs, so copy the +outputs if needed before using mutating operations like \code{data.table}'s \verb{:=} +operator. Currently, the only situation where there is potentially aliasing +is of the \code{DT} in edge cases with \code{all_versions = TRUE}, but this may change +in the future. +} +\examples{ +# warning message of data latency shown +epix_as_of2( + archive_cases_dv_subset_2, + max_version = max(archive_cases_dv_subset_2$DT$version) +) + +range(archive_cases_dv_subset_2$DT$version) # 2020-06-02 -- 2021-12-01 + +epix_as_of2( + archive_cases_dv_subset_2, + max_version = as.Date("2020-06-12") +) + +# When fetching a snapshot as of the latest version with update data in the +# archive, a warning is issued by default, as this update data might not yet +# be finalized (for example, if data versions are labeled with dates, these +# versions might be overwritten throughout the corresponding days with +# additional data or "hotfixes" of erroroneous data; when we build an archive +# based on database queries, the latest available update might still be +# subject to change, but previous versions should be finalized). We can +# muffle such warnings with the following pattern: +withCallingHandlers( + { + epix_as_of2( + archive_cases_dv_subset_2, + max_version = max(archive_cases_dv_subset_2$DT$version) + ) + }, + epiprocess__snapshot_as_of_clobberable_version = function(wrn) invokeRestart("muffleWarning") +) +# Since R 4.0, there is a `globalCallingHandlers` function that can be used +# to globally toggle these warnings. + +} diff --git a/man/epix_fill_through_version2.Rd b/man/epix_fill_through_version2.Rd new file mode 100644 index 00000000..7389388a --- /dev/null +++ b/man/epix_fill_through_version2.Rd @@ -0,0 +1,48 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/methods-epi_archive_new.R +\name{epix_fill_through_version2} +\alias{epix_fill_through_version2} +\title{\code{epi_archive} with unobserved history filled in (won't mutate, might alias)} +\usage{ +epix_fill_through_version2( + epi_archive, + fill_versions_end, + how = c("na", "locf") +) +} +\arguments{ +\item{fill_versions_end}{Length-1, same class&type as \code{x$version}: the +version through which to fill in missing version history; this will be the +result's \verb{$versions_end} unless it already had a later +\verb{$versions_end}.} + +\item{how}{Optional; \code{"na"} or \code{"locf"}: \code{"na"} will fill in any missing +required version history with \code{NA}s, by inserting (if necessary) an update +immediately after the current \verb{$versions_end} that revises all +existing measurements to be \code{NA} (this is only supported for \code{version} +classes with a \code{next_after} implementation); \code{"locf"} will fill in missing +version history with the last version of each observation carried forward +(LOCF), by leaving the update \verb{$DT} alone (other \code{epi_archive} methods are +based on LOCF). Default is \code{"na"}.} + +\item{x}{An \code{epi_archive}} +} +\value{ +An \code{epi_archive} +} +\description{ +Sometimes, due to upstream data pipeline issues, we have to work with a +version history that isn't completely up to date, but with functions that +expect archives that are completely up to date, or equally as up-to-date as +another archive. This function provides one way to approach such mismatches: +pretend that we've "observed" additional versions, filling in these versions +with NAs or extrapolated values. +} +\details{ +'\code{epix_fill_through_version} will not mutate its \code{x} argument, but its result +might alias fields of \code{x} (e.g., mutating the result's \code{DT} might mutate +\code{x$DT}). The R6 method variant, \code{x$fill_through_version}, will mutate \code{x} to +give the result, but might reseat its fields (e.g., references to the old +\code{x$DT} might not be updated by this function or subsequent operations on +\code{x}), and returns the updated \code{x} \link[base:invisible]{invisibly}. +} diff --git a/man/epix_merge2.Rd b/man/epix_merge2.Rd new file mode 100644 index 00000000..11d0aff5 --- /dev/null +++ b/man/epix_merge2.Rd @@ -0,0 +1,71 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/methods-epi_archive_new.R +\name{epix_merge2} +\alias{epix_merge2} +\title{Merge two \code{epi_archive} objects} +\usage{ +epix_merge2( + x, + y, + sync = c("forbid", "na", "locf", "truncate"), + compactify = TRUE +) +} +\arguments{ +\item{x, y}{Two \code{epi_archive} objects to join together.} + +\item{sync}{Optional; \code{"forbid"}, \code{"na"}, \code{"locf"}, or \code{"truncate"}; in the +case that \code{x$versions_end} doesn't match \code{y$versions_end}, what do we do?: +\code{"forbid"}: emit an error; "na": use \code{max(x$versions_end, y$versions_end)} +as the result's \code{versions_end}, but ensure that, if we request a snapshot +as of a version after \code{min(x$versions_end, y$versions_end)}, the +observation columns from the less up-to-date archive will be all NAs (i.e., +imagine there was an update immediately after its \code{versions_end} which +revised all observations to be \code{NA}); \code{"locf"}: use \code{max(x$versions_end, y$versions_end)} as the result's \code{versions_end}, allowing the last version +of each observation to be carried forward to extrapolate unavailable +versions for the less up-to-date input archive (i.e., imagining that in the +less up-to-date archive's data set remained unchanged between its actual +\code{versions_end} and the other archive's \code{versions_end}); or \code{"truncate"}: +use \code{min(x$versions_end, y$versions_end)} as the result's \code{versions_end}, +and discard any rows containing update rows for later versions.} + +\item{compactify}{Optional; \code{TRUE}, \code{FALSE}, or \code{NULL}; should the result be +compactified? See \code{\link{as_epi_archive}} for an explanation of what this means. +Default here is \code{TRUE}.} +} +\value{ +the resulting \code{epi_archive} +} +\description{ +Merges two \code{epi_archive}s that share a common \code{geo_value}, \code{time_value}, and +set of key columns. When they also share a common \code{versions_end}, +using \verb{$as_of} on the result should be the same as using \verb{$as_of} on \code{x} and +\code{y} individually, then performing a full join of the \code{DT}s on the non-version +key columns (potentially consolidating multiple warnings about clobberable +versions). If the \code{versions_end} values differ, the +\code{sync} parameter controls what is done. +} +\details{ +This function, \code{\link{epix_merge}}, does not mutate its inputs and will not alias +either archive's \code{DT}, but may alias other fields; \code{x$merge} will overwrite +\code{x} with the result of the merge, reseating its \code{DT} and several other fields +(making them point to different objects), but avoiding mutation of the +contents of the old \code{DT} (only relevant if you have another reference to the +old \code{DT} in another object). + +In all cases, \code{additional_metadata} will be an empty list, and +\code{clobberable_versions_start} will be set to the earliest version that could +be clobbered in either input archive. +} +\examples{ +# create two example epi_archive datasets +x <- archive_cases_dv_subset_2$DT \%>\% + dplyr::select(geo_value, time_value, version, case_rate_7d_av) \%>\% + as_epi_archive2(compactify = TRUE) +y <- archive_cases_dv_subset_2$DT \%>\% + dplyr::select(geo_value, time_value, version, percent_cli) \%>\% + as_epi_archive2(compactify = TRUE) +# merge results stored in a third object: +xy <- epix_merge2(x, y) + +} diff --git a/man/epix_slide2.Rd b/man/epix_slide2.Rd new file mode 100644 index 00000000..8d822bc0 --- /dev/null +++ b/man/epix_slide2.Rd @@ -0,0 +1,283 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/methods-epi_archive_new.R +\name{epix_slide2} +\alias{epix_slide2} +\title{Slide a function over variables in an \code{epi_archive} or \code{grouped_epi_archive}} +\usage{ +epix_slide2( + x, + f, + ..., + before, + ref_time_values, + time_step, + new_col_name = "slide_value", + as_list_col = FALSE, + names_sep = "_", + all_versions = FALSE +) +} +\arguments{ +\item{x}{An \code{\link{epi_archive}} or \code{\link{grouped_epi_archive}} object. If ungrouped, +all data in \code{x} will be treated as part of a single data group.} + +\item{f}{Function, formula, or missing; together with \code{...} specifies the +computation to slide. To "slide" means to apply a computation over a +sliding (a.k.a. "rolling") time window for each data group. The window is +determined by the \code{before} parameter described below. One time step is +typically one day or one week; see \code{\link{epi_slide}} details for more +explanation. If a function, \code{f} must take an \code{epi_df} with the same +column names as the archive's \code{DT}, minus the \code{version} column; followed +by a one-row tibble containing the values of the grouping variables for +the associated group; followed by a reference time value, usually as a +\code{Date} object; followed by any number of named arguments. If a formula, +\code{f} can operate directly on columns accessed via \code{.x$var} or \code{.$var}, as +in \code{~ mean (.x$var)} to compute a mean of a column \code{var} for each +group-\code{ref_time_value} combination. The group key can be accessed via +\code{.y} or \code{.group_key}, and the reference time value can be accessed via +\code{.z} or \code{.ref_time_value}. If \code{f} is missing, then \code{...} will specify the +computation.} + +\item{...}{Additional arguments to pass to the function or formula specified +via \code{f}. Alternatively, if \code{f} is missing, then \code{...} is interpreted as an +expression for tidy evaluation; in addition to referring to columns +directly by name, the expression has access to \code{.data} and \code{.env} pronouns +as in \code{dplyr} verbs, and can also refer to the \code{.group_key} and +\code{.ref_time_value}. See details of \code{\link{epi_slide}}.} + +\item{before}{How far \code{before} each \code{ref_time_value} should the sliding +window extend? If provided, should be a single, non-NA, +\link[vctrs:vec_cast]{integer-compatible} number of time steps. This window +endpoint is inclusive. For example, if \code{before = 7}, and one time step is +one day, then to produce a value for a \code{ref_time_value} of January 8, we +apply the given function or formula to data (for each group present) with +\code{time_value}s from January 1 onward, as they were reported on January 8. +For typical disease surveillance sources, this will not include any data +with a \code{time_value} of January 8, and, depending on the amount of reporting +latency, may not include January 7 or even earlier \code{time_value}s. (If +instead the archive were to hold nowcasts instead of regular surveillance +data, then we would indeed expect data for \code{time_value} January 8. If it +were to hold forecasts, then we would expect data for \code{time_value}s after +January 8, and the sliding window would extend as far after each +\code{ref_time_value} as needed to include all such \code{time_value}s.)} + +\item{ref_time_values}{Reference time values / versions for sliding +computations; each element of this vector serves both as the anchor point +for the \code{time_value} window for the computation and the \code{max_version} +\code{as_of} which we fetch data in this window. If missing, then this will set +to a regularly-spaced sequence of values set to cover the range of +\code{version}s in the \code{DT} plus the \code{versions_end}; the spacing of values will +be guessed (using the GCD of the skips between values).} + +\item{time_step}{Optional function used to define the meaning of one time +step, which if specified, overrides the default choice based on the +\code{time_value} column. This function must take a positive integer and return +an object of class \code{lubridate::period}. For example, we can use \code{time_step = lubridate::hours} in order to set the time step to be one hour (this +would only be meaningful if \code{time_value} is of class \code{POSIXct}).} + +\item{new_col_name}{String indicating the name of the new column that will +contain the derivative values. Default is "slide_value"; note that setting +\code{new_col_name} equal to an existing column name will overwrite this column.} + +\item{as_list_col}{Should the slide results be held in a list column, or be +\link[tidyr:chop]{unchopped}/\link[tidyr:unnest]{unnested}? Default is \code{FALSE}, +in which case a list object returned by \code{f} would be unnested (using +\code{\link[tidyr:unnest]{tidyr::unnest()}}), and, if the slide computations output data frames, +the names of the resulting columns are given by prepending \code{new_col_name} +to the names of the list elements.} + +\item{names_sep}{String specifying the separator to use in \code{tidyr::unnest()} +when \code{as_list_col = FALSE}. Default is "_". Using \code{NULL} drops the prefix +from \code{new_col_name} entirely.} + +\item{all_versions}{(Not the same as \code{all_rows} parameter of \code{epi_slide}.) If +\code{all_versions = TRUE}, then \code{f} will be passed the version history (all +\code{version <= ref_time_value}) for rows having \code{time_value} between +\code{ref_time_value - before} and \code{ref_time_value}. Otherwise, \code{f} will be +passed only the most recent \code{version} for every unique \code{time_value}. +Default is \code{FALSE}.} +} +\value{ +A tibble whose columns are: the grouping variables, \code{time_value}, +containing the reference time values for the slide computation, and a +column named according to the \code{new_col_name} argument, containing the slide +values. +} +\description{ +Slides a given function over variables in an \code{epi_archive} object. This +behaves similarly to \code{epi_slide()}, with the key exception that it is +version-aware: the sliding computation at any given reference time t is +performed on \strong{data that would have been available as of t}. See the +\href{https://cmu-delphi.github.io/epiprocess/articles/archive.html}{archive vignette} for +examples. +} +\details{ +A few key distinctions between the current function and \code{epi_slide()}: +\enumerate{ +\item In \code{f} functions for \code{epix_slide}, one should not assume that the input +data to contain any rows with \code{time_value} matching the computation's +\code{ref_time_value} (accessible via \verb{attributes()$metadata$as_of}); for +typical epidemiological surveillance data, observations pertaining to a +particular time period (\code{time_value}) are first reported \code{as_of} some +instant after that time period has ended. +\item \code{epix_slide()} doesn't accept an \code{after} argument; its windows extend +from \code{before} time steps before a given \code{ref_time_value} through the last +\code{time_value} available as of version \code{ref_time_value} (typically, this +won't include \code{ref_time_value} itself, as observations about a particular +time interval (e.g., day) are only published after that time interval +ends); \code{epi_slide} windows extend from \code{before} time steps before a +\code{ref_time_value} through \code{after} time steps after \code{ref_time_value}. +\item The input class and columns are similar but different: \code{epix_slide} +(with the default \code{all_versions=FALSE}) keeps all columns and the +\code{epi_df}-ness of the first argument to each computation; \code{epi_slide} only +provides the grouping variables in the second input, and will convert the +first input into a regular tibble if the grouping variables include the +essential \code{geo_value} column. (With \code{all_versions=TRUE}, \code{epix_slide} will +will provide an \code{epi_archive} rather than an \code{epi-df} to each +computation.) +\item The output class and columns are similar but different: \code{epix_slide()} +returns a tibble containing only the grouping variables, \code{time_value}, and +the new column(s) from the slide computations, whereas \code{epi_slide()} +returns an \code{epi_df} with all original variables plus the new columns from +the slide computations. (Both will mirror the grouping or ungroupedness of +their input, with one exception: \code{epi_archive}s can have trivial +(zero-variable) groupings, but these will be dropped in \code{epix_slide} +results as they are not supported by tibbles.) +\item There are no size stability checks or element/row recycling to maintain +size stability in \code{epix_slide}, unlike in \code{epi_slide}. (\code{epix_slide} is +roughly analogous to \code{\link[dplyr:group_map]{dplyr::group_modify}}, while \code{epi_slide} is roughly +analogous to \code{dplyr::mutate} followed by \code{dplyr::arrange}) This is detailed +in the "advanced" vignette. +\item \code{all_rows} is not supported in \code{epix_slide}; since the slide +computations are allowed more flexibility in their outputs than in +\code{epi_slide}, we can't guess a good representation for missing computations +for excluded group-\code{ref_time_value} pairs. +\item The \code{ref_time_values} default for \code{epix_slide} is based on making an +evenly-spaced sequence out of the \code{version}s in the \code{DT} plus the +\code{versions_end}, rather than the \code{time_value}s. +} + +Apart from the above distinctions, the interfaces between \code{epix_slide()} and +\code{epi_slide()} are the same. + +Furthermore, the current function can be considerably slower than +\code{epi_slide()}, for two reasons: (1) it must repeatedly fetch +properly-versioned snapshots from the data archive (via its \code{as_of()} +method), and (2) it performs a "manual" sliding of sorts, and does not +benefit from the highly efficient \code{slider} package. For this reason, it +should never be used in place of \code{epi_slide()}, and only used when +version-aware sliding is necessary (as it its purpose). + +Finally, this is simply a wrapper around the \code{slide()} method of the +\code{epi_archive} and \code{grouped_epi_archive} classes, so if \code{x} is an +object of either of these classes, then: + +\if{html}{\out{
}}\preformatted{epix_slide(x, new_var = comp(old_var), before = 119) +}\if{html}{\out{
}} + +is equivalent to: + +\if{html}{\out{
}}\preformatted{x$slide(new_var = comp(old_var), before = 119) +}\if{html}{\out{
}} + +Mutation and aliasing: \code{epix_slide} and \verb{$slide} will not perform in-place +mutation of the input archives on their own. In some edge cases the inputs it +feeds to the slide computations may alias parts of the input archive, so copy +the slide computation inputs if needed before using mutating operations like +\code{data.table}'s \verb{:=} operator. Similarly, in some edge cases, the output of +the slide operation may alias parts of the input archive, so similarly, make +sure to clone and/or copy appropriately before using in-place mutation. +} +\examples{ +library(dplyr) + +# Reference time points for which we want to compute slide values: +ref_time_values <- seq(as.Date("2020-06-01"), + as.Date("2020-06-15"), + by = "1 day" +) + +# A simple (but not very useful) example (see the archive vignette for a more +# realistic one): +archive_cases_dv_subset_2 \%>\% + group_by(geo_value) \%>\% + epix_slide2( + f = ~ mean(.x$case_rate_7d_av), + before = 2, + ref_time_values = ref_time_values, + new_col_name = "case_rate_7d_av_recent_av" + ) \%>\% + ungroup() +# We requested time windows that started 2 days before the corresponding time +# values. The actual number of `time_value`s in each computation depends on +# the reporting latency of the signal and `time_value` range covered by the +# archive (2020-06-01 -- 2021-11-30 in this example). In this case, we have +# * 0 `time_value`s, for ref time 2020-06-01 --> the result is automatically +# discarded +# * 1 `time_value`, for ref time 2020-06-02 +# * 2 `time_value`s, for the rest of the results +# * never the 3 `time_value`s we would get from `epi_slide`, since, because +# of data latency, we'll never have an observation +# `time_value == ref_time_value` as of `ref_time_value`. +# The example below shows this type of behavior in more detail. + +# Examining characteristics of the data passed to each computation with +# `all_versions=FALSE`. +archive_cases_dv_subset_2 \%>\% + group_by(geo_value) \%>\% + epix_slide2( + function(x, gk, rtv) { + tibble( + time_range = if (nrow(x) == 0L) { + "0 `time_value`s" + } else { + sprintf("\%s -- \%s", min(x$time_value), max(x$time_value)) + }, + n = nrow(x), + class1 = class(x)[[1L]] + ) + }, + before = 5, all_versions = FALSE, + ref_time_values = ref_time_values, names_sep = NULL + ) \%>\% + ungroup() \%>\% + arrange(geo_value, time_value) + +# --- Advanced: --- + +# `epix_slide` with `all_versions=FALSE` (the default) applies a +# version-unaware computation to several versions of the data. We can also +# use `all_versions=TRUE` to apply a version-*aware* computation to several +# versions of the data, again looking at characteristics of the data passed +# to each computation. In this case, each computation should expect an +# `epi_archive` containing the relevant version data: + +archive_cases_dv_subset_2 \%>\% + group_by(geo_value) \%>\% + epix_slide2( + function(x, gk, rtv) { + tibble( + versions_start = if (nrow(x$DT) == 0L) { + "NA (0 rows)" + } else { + toString(min(x$DT$version)) + }, + versions_end = x$versions_end, + time_range = if (nrow(x$DT) == 0L) { + "0 `time_value`s" + } else { + sprintf("\%s -- \%s", min(x$DT$time_value), max(x$DT$time_value)) + }, + n = nrow(x$DT), + class1 = class(x)[[1L]] + ) + }, + before = 5, all_versions = TRUE, + ref_time_values = ref_time_values, names_sep = NULL + ) \%>\% + ungroup() \%>\% + # Focus on one geo_value so we can better see the columns above: + filter(geo_value == "ca") \%>\% + select(-geo_value) + +} diff --git a/man/epix_truncate_versions_after.Rd b/man/epix_truncate_versions_after.Rd index 8f741418..f30be07f 100644 --- a/man/epix_truncate_versions_after.Rd +++ b/man/epix_truncate_versions_after.Rd @@ -1,9 +1,12 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/methods-epi_archive.R +% Please edit documentation in R/methods-epi_archive.R, +% R/methods-epi_archive_new.R \name{epix_truncate_versions_after} \alias{epix_truncate_versions_after} \title{Filter an \code{epi_archive} object to keep only older versions} \usage{ +epix_truncate_versions_after(x, max_version) + epix_truncate_versions_after(x, max_version) } \arguments{ @@ -15,9 +18,14 @@ current archive data having \code{version} less than or equal to the specified \code{max_version}} } \value{ +An \code{epi_archive} object + An \code{epi_archive} object } \description{ +Generates a filtered \code{epi_archive} from an \code{epi_archive} object, keeping +only rows with \code{version} falling on or before a specified date. + Generates a filtered \code{epi_archive} from an \code{epi_archive} object, keeping only rows with \code{version} falling on or before a specified date. } diff --git a/man/epix_truncate_versions_after.grouped_epi_archive2.Rd b/man/epix_truncate_versions_after.grouped_epi_archive2.Rd new file mode 100644 index 00000000..5fba48fb --- /dev/null +++ b/man/epix_truncate_versions_after.grouped_epi_archive2.Rd @@ -0,0 +1,11 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/grouped_archive_new.R +\name{epix_truncate_versions_after.grouped_epi_archive2} +\alias{epix_truncate_versions_after.grouped_epi_archive2} +\title{Truncate versions after a given version, grouped} +\usage{ +\method{epix_truncate_versions_after}{grouped_epi_archive2}(grouped_epi_archive, max_version) +} +\description{ +Truncate versions after a given version, grouped +} diff --git a/man/fill_through_version.epi_archive2.Rd b/man/fill_through_version.epi_archive2.Rd new file mode 100644 index 00000000..48afb864 --- /dev/null +++ b/man/fill_through_version.epi_archive2.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/archive_new.R +\name{fill_through_version.epi_archive2} +\alias{fill_through_version.epi_archive2} +\title{Fill through version} +\usage{ +\method{fill_through_version}{epi_archive2}(epi_archive, fill_versions_end, how = c("na", "locf")) +} +\arguments{ +\item{epi_archive}{an \code{epi_archive} object} + +\item{fill_versions_end}{as in \code{\link{epix_fill_through_version}}} + +\item{how}{as in \code{\link{epix_fill_through_version}}} +} +\description{ +Fill in unobserved history using requested scheme by mutating +the given object and potentially reseating its fields. See +\code{\link{epix_fill_through_version}}, which doesn't mutate the input archive but +might alias its fields. +} diff --git a/man/group_by.epi_archive.Rd b/man/group_by.epi_archive.Rd index 5e867bf3..f157e834 100644 --- a/man/group_by.epi_archive.Rd +++ b/man/group_by.epi_archive.Rd @@ -1,8 +1,14 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/methods-epi_archive.R, R/grouped_epi_archive.R +% Please edit documentation in R/methods-epi_archive.R, R/grouped_archive_new.R, +% R/grouped_epi_archive.R \name{group_by.epi_archive} \alias{group_by.epi_archive} \alias{grouped_epi_archive} +\alias{group_by.grouped_epi_archive2} +\alias{group_by_drop_default.grouped_epi_archive2} +\alias{groups.grouped_epi_archive2} +\alias{ungroup.grouped_epi_archive2} +\alias{is_grouped_epi_archive2} \alias{group_by.grouped_epi_archive} \alias{groups.grouped_epi_archive} \alias{ungroup.grouped_epi_archive} @@ -12,6 +18,21 @@ \usage{ \method{group_by}{epi_archive}(.data, ..., .add = FALSE, .drop = dplyr::group_by_drop_default(.data)) +\method{group_by}{grouped_epi_archive2}( + grouped_epi_archive, + ..., + .add = FALSE, + .drop = dplyr::group_by_drop_default(grouped_epi_archive) +) + +\method{group_by_drop_default}{grouped_epi_archive2}(grouped_epi_archive) + +\method{groups}{grouped_epi_archive2}(grouped_epi_archive) + +\method{ungroup}{grouped_epi_archive2}(grouped_epi_archive, ...) + +is_grouped_epi_archive2(x) + \method{group_by}{grouped_epi_archive}(.data, ..., .add = FALSE, .drop = dplyr::group_by_drop_default(.data)) \method{groups}{grouped_epi_archive}(x) diff --git a/man/group_by.epi_archive2.Rd b/man/group_by.epi_archive2.Rd new file mode 100644 index 00000000..fa9040c3 --- /dev/null +++ b/man/group_by.epi_archive2.Rd @@ -0,0 +1,147 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/archive_new.R +\name{group_by.epi_archive2} +\alias{group_by.epi_archive2} +\alias{grouped_epi_archive} +\title{\code{group_by} and related methods for \code{epi_archive}, \code{grouped_epi_archive}} +\usage{ +\method{group_by}{epi_archive2}( + epi_archive, + ..., + .add = FALSE, + .drop = dplyr::group_by_drop_default(epi_archive) +) +} +\arguments{ +\item{...}{Similar to \code{\link[dplyr:group_by]{dplyr::group_by}} (see "Details:" for edge cases); +\itemize{ +\item For \code{group_by}: unquoted variable name(s) or other +\link[dplyr:dplyr_data_masking]{"data masking"} expression(s). It's possible to +use \code{\link[dplyr:mutate]{dplyr::mutate}}-like syntax here to calculate new columns on which to +perform grouping, but note that, if you are regrouping an already-grouped +\code{.data} object, the calculations will be carried out ignoring such grouping +(same as \link[dplyr:group_by]{in dplyr}). +\item For \code{ungroup}: either +\itemize{ +\item empty, in order to remove the grouping and output an \code{epi_archive}; or +\item variable name(s) or other \link[dplyr:dplyr_tidy_select]{"tidy-select"} +expression(s), in order to remove the matching variables from the list of +grouping variables, and output another \code{grouped_epi_archive}. +} +}} + +\item{.add}{Boolean. If \code{FALSE}, the default, the output will be grouped by +the variable selection from \code{...} only; if \code{TRUE}, the output will be +grouped by the current grouping variables plus the variable selection from +\code{...}.} + +\item{.drop}{As described in \code{\link[dplyr:group_by]{dplyr::group_by}}; determines treatment of +factor columns.} + +\item{.data}{An \code{epi_archive} or \code{grouped_epi_archive}} + +\item{x}{For \code{groups} or \code{ungroup}: a \code{grouped_epi_archive}; for +\code{is_grouped_epi_archive}: any object} + +\item{.tbl}{(For \code{group_by_drop_default}:) an \code{epi_archive} or +\code{grouped_epi_archive} (\code{epi_archive} dispatches to the S3 default method; +\code{grouped_epi_archive} dispatches its own S3 method)} +} +\description{ +\code{group_by} and related methods for \code{epi_archive}, \code{grouped_epi_archive} +} +\details{ +To match \code{dplyr}, \code{group_by} allows "data masking" (also referred to as +"tidy evaluation") expressions \code{...}, not just column names, in a way similar +to \code{mutate}. Note that replacing or removing key columns with these +expressions is disabled. + +\code{archive \%>\% group_by()} and other expressions that group or regroup by zero +columns (indicating that all rows should be treated as part of one large +group) will output a \code{grouped_epi_archive}, in order to enable the use of +\code{grouped_epi_archive} methods on the result. This is in slight contrast to +the same operations on tibbles and grouped tibbles, which will \emph{not} output a +\code{grouped_df} in these circumstances. + +Using \code{group_by} with \code{.add=FALSE} to override the existing grouping is +disabled; instead, \code{ungroup} first then \code{group_by}. + +Mutation and aliasing: \code{group_by} tries to use a shallow copy of the \code{DT}, +introducing column-level aliasing between its input and its result. This +doesn't follow the general model for most \code{data.table} operations, which +seems to be that, given an nonaliased (i.e., unique) pointer to a +\code{data.table} object, its pointers to its columns should also be nonaliased. +If you mutate any of the columns of either the input or result, first ensure +that it is fine if columns of the other are also mutated, but do not rely on +such behavior to occur. Additionally, never perform mutation on the key +columns at all (except for strictly increasing transformations), as this will +invalidate sortedness assumptions about the rows. + +\code{group_by_drop_default} on (ungrouped) \code{epi_archive}s is expected to dispatch +to \code{group_by_drop_default.default} (but there is a dedicated method for +\code{grouped_epi_archive}s). +} +\examples{ + +grouped_archive <- archive_cases_dv_subset_2 \%>\% group_by(geo_value) + +# `print` for metadata and method listing: +grouped_archive \%>\% print() + +# The primary use for grouping is to perform a grouped `epix_slide`: + +archive_cases_dv_subset_2 \%>\% + group_by(geo_value) \%>\% + epix_slide2( + f = ~ mean(.x$case_rate_7d_av), + before = 2, + ref_time_values = as.Date("2020-06-11") + 0:2, + new_col_name = "case_rate_3d_av" + ) \%>\% + ungroup() + +# ----------------------------------------------------------------- + +# Advanced: some other features of dplyr grouping are implemented: + +library(dplyr) +toy_archive <- + tribble( + ~geo_value, ~age_group, ~time_value, ~version, ~value, + "us", "adult", "2000-01-01", "2000-01-02", 121, + "us", "pediatric", "2000-01-02", "2000-01-03", 5, # (addition) + "us", "adult", "2000-01-01", "2000-01-03", 125, # (revision) + "us", "adult", "2000-01-02", "2000-01-03", 130 # (addition) + ) \%>\% + mutate( + age_group = ordered(age_group, c("pediatric", "adult")), + time_value = as.Date(time_value), + version = as.Date(version) + ) \%>\% + as_epi_archive2(other_keys = "age_group") + +# The following are equivalent: +toy_archive \%>\% group_by(geo_value, age_group) +toy_archive \%>\% + group_by(geo_value) \%>\% + group_by(age_group, .add = TRUE) +grouping_cols <- c("geo_value", "age_group") +toy_archive \%>\% group_by(across(all_of(grouping_cols))) + +# And these are equivalent: +toy_archive \%>\% group_by(geo_value) +toy_archive \%>\% + group_by(geo_value, age_group) \%>\% + ungroup(age_group) + +# To get the grouping variable names as a `list` of `name`s (a.k.a. symbols): +toy_archive \%>\% + group_by(geo_value) \%>\% + groups() + +toy_archive \%>\% + group_by(geo_value, age_group, .drop = FALSE) \%>\% + epix_slide2(f = ~ sum(.x$value), before = 20) \%>\% + ungroup() + +} diff --git a/man/is_epi_archive2.Rd b/man/is_epi_archive2.Rd new file mode 100644 index 00000000..df258d3e --- /dev/null +++ b/man/is_epi_archive2.Rd @@ -0,0 +1,35 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/archive_new.R +\name{is_epi_archive2} +\alias{is_epi_archive2} +\title{Test for \code{epi_archive} format} +\usage{ +is_epi_archive2(x, grouped_okay = FALSE) +} +\arguments{ +\item{x}{An object.} + +\item{grouped_okay}{Optional; Boolean; should a \code{grouped_epi_archive} also +count? Default is \code{FALSE}.} +} +\value{ +\code{TRUE} if the object inherits from \code{epi_archive}. +} +\description{ +Test for \code{epi_archive} format +} +\examples{ +is_epi_archive2(jhu_csse_daily_subset) # FALSE (this is an epi_df, not epi_archive) +is_epi_archive2(archive_cases_dv_subset_2) # TRUE + +# By default, grouped_epi_archives don't count as epi_archives, as they may +# support a different set of operations from regular `epi_archives`. This +# behavior can be controlled by `grouped_okay`. +grouped_archive <- archive_cases_dv_subset_2 \%>\% group_by(geo_value) +is_epi_archive2(grouped_archive) # FALSE +is_epi_archive2(grouped_archive, grouped_okay = TRUE) # TRUE + +} +\seealso{ +\code{\link{is_grouped_epi_archive}} +} diff --git a/man/max_version_with_row_in.Rd b/man/max_version_with_row_in.Rd index cca554fa..6f0d35b3 100644 --- a/man/max_version_with_row_in.Rd +++ b/man/max_version_with_row_in.Rd @@ -1,18 +1,25 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/archive.R +% Please edit documentation in R/archive.R, R/archive_new.R \name{max_version_with_row_in} \alias{max_version_with_row_in} \title{\code{max(x$version)}, with error if \code{x} has 0 rows} \usage{ +max_version_with_row_in(x) + max_version_with_row_in(x) } \arguments{ \item{x}{\code{x} argument of \code{\link{as_epi_archive}}} } \value{ +\code{max(x$version)} if it has any rows; raises error if it has 0 rows or +an \code{NA} version value + \code{max(x$version)} if it has any rows; raises error if it has 0 rows or an \code{NA} version value } \description{ +Exported to make defaults more easily copyable. + Exported to make defaults more easily copyable. } diff --git a/man/merge_epi_archive2.Rd b/man/merge_epi_archive2.Rd new file mode 100644 index 00000000..dd1e671e --- /dev/null +++ b/man/merge_epi_archive2.Rd @@ -0,0 +1,30 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/archive_new.R +\name{merge_epi_archive2} +\alias{merge_epi_archive2} +\title{Merge epi archive} +\usage{ +merge_epi_archive2( + x, + y, + sync = c("forbid", "na", "locf", "truncate"), + compactify = TRUE +) +} +\arguments{ +\item{x}{as in \code{\link{epix_merge}}} + +\item{y}{as in \code{\link{epix_merge}}} + +\item{sync}{as in \code{\link{epix_merge}}} + +\item{compactify}{as in \code{\link{epix_merge}}} +} +\description{ +Merges another \code{epi_archive} with the current one, mutating the +current one by reseating its \code{DT} and several other fields, but avoiding +mutation of the old \code{DT}; returns the current archive +\link[base:invisible]{invisibly}. See \code{\link{epix_merge}} for a full description +of the non-R6-method version, which does not mutate either archive, and +does not alias either archive's \code{DT}.a +} diff --git a/man/new_epi_archive2.Rd b/man/new_epi_archive2.Rd new file mode 100644 index 00000000..52141190 --- /dev/null +++ b/man/new_epi_archive2.Rd @@ -0,0 +1,69 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/archive_new.R +\name{new_epi_archive2} +\alias{new_epi_archive2} +\title{New epi archive} +\usage{ +new_epi_archive2( + x, + geo_type = NULL, + time_type = NULL, + other_keys = NULL, + additional_metadata = NULL, + compactify = NULL, + clobberable_versions_start = NA, + versions_end = NULL +) +} +\arguments{ +\item{x}{A data.frame, data.table, or tibble, with columns \code{geo_value}, +\code{time_value}, \code{version}, and then any additional number of columns.} + +\item{geo_type}{Type for the geo values. If missing, then the function will +attempt to infer it from the geo values present; if this fails, then it +will be set to "custom".} + +\item{time_type}{Type for the time values. If missing, then the function will +attempt to infer it from the time values present; if this fails, then it +will be set to "custom".} + +\item{other_keys}{Character vector specifying the names of variables in \code{x} +that should be considered key variables (in the language of \code{data.table}) +apart from "geo_value", "time_value", and "version".} + +\item{additional_metadata}{List of additional metadata to attach to the +\code{epi_archive} object. The metadata will have \code{geo_type} and \code{time_type} +fields; named entries from the passed list or will be included as well.} + +\item{compactify}{Optional; Boolean or \code{NULL}: should we remove rows that are +considered redundant for the purposes of \code{epi_archive}'s built-in methods +such as \code{as_of}? As these methods use the last version of each observation +carried forward (LOCF) to interpolate between the version data provided, +rows that don't change these LOCF results can potentially be omitted to +save space while maintaining the same behavior (with the help of the +\code{clobberable_versions_start} and \code{versions_end} fields in some edge cases). +\code{TRUE} will remove these rows, \code{FALSE} will not, and missing or \code{NULL} will +remove these rows and issue a warning. Generally, this can be set to +\code{TRUE}, but if you directly inspect or edit the fields of the \code{epi_archive} +such as its \code{DT}, or rely on redundant updates to achieve a certain +behavior of the \code{ref_time_values} default in \code{epix_slide}, you will have to +determine whether \code{compactify=TRUE} will produce the desired results. If +compactification here is removing a large proportion of the rows, this may +indicate a potential for space, time, or bandwidth savings upstream the +data pipeline, e.g., by avoiding fetching, storing, or processing these +rows of \code{x}.} + +\item{clobberable_versions_start}{Optional; as in \code{\link{as_epi_archive}}} + +\item{versions_end}{Optional; as in \code{\link{as_epi_archive}}} +} +\value{ +An \code{epi_archive} object. +} +\description{ +Creates a new \code{epi_archive} object. +} +\details{ +Refer to the documentation for \code{\link[=as_epi_archive]{as_epi_archive()}} for more information +and examples of parameter names. +} diff --git a/man/next_after.Rd b/man/next_after.Rd index 5170e8d9..82fd3ebb 100644 --- a/man/next_after.Rd +++ b/man/next_after.Rd @@ -1,17 +1,23 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/archive.R +% Please edit documentation in R/archive.R, R/archive_new.R \name{next_after} \alias{next_after} \title{Get the next possible value greater than \code{x} of the same type} \usage{ +next_after(x) + next_after(x) } \arguments{ \item{x}{the starting "value"(s)} } \value{ +same class, typeof, and length as \code{x} + same class, typeof, and length as \code{x} } \description{ +Get the next possible value greater than \code{x} of the same type + Get the next possible value greater than \code{x} of the same type } diff --git a/man/print.epi_archive2.Rd b/man/print.epi_archive2.Rd new file mode 100644 index 00000000..0105c47e --- /dev/null +++ b/man/print.epi_archive2.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/archive_new.R +\name{print.epi_archive2} +\alias{print.epi_archive2} +\title{Print information about an \code{epi_archive} object} +\usage{ +\method{print}{epi_archive2}(epi_archive, class = TRUE, methods = TRUE) +} +\arguments{ +\item{class}{Boolean; whether to print the class label header} + +\item{methods}{Boolean; whether to print all available methods of +the archive} +} +\description{ +Print information about an \code{epi_archive} object +} diff --git a/man/slide.epi_archive2.Rd b/man/slide.epi_archive2.Rd new file mode 100644 index 00000000..54db5636 --- /dev/null +++ b/man/slide.epi_archive2.Rd @@ -0,0 +1,101 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/archive_new.R +\name{slide.epi_archive2} +\alias{slide.epi_archive2} +\title{Slide over epi archive} +\usage{ +\method{slide}{epi_archive2}( + epi_archive, + f, + ..., + before, + ref_time_values, + time_step, + new_col_name = "slide_value", + as_list_col = FALSE, + names_sep = "_", + all_versions = FALSE +) +} +\arguments{ +\item{f}{Function, formula, or missing; together with \code{...} specifies the +computation to slide. To "slide" means to apply a computation over a +sliding (a.k.a. "rolling") time window for each data group. The window is +determined by the \code{before} parameter described below. One time step is +typically one day or one week; see \code{\link{epi_slide}} details for more +explanation. If a function, \code{f} must take an \code{epi_df} with the same +column names as the archive's \code{DT}, minus the \code{version} column; followed +by a one-row tibble containing the values of the grouping variables for +the associated group; followed by a reference time value, usually as a +\code{Date} object; followed by any number of named arguments. If a formula, +\code{f} can operate directly on columns accessed via \code{.x$var} or \code{.$var}, as +in \code{~ mean (.x$var)} to compute a mean of a column \code{var} for each +group-\code{ref_time_value} combination. The group key can be accessed via +\code{.y} or \code{.group_key}, and the reference time value can be accessed via +\code{.z} or \code{.ref_time_value}. If \code{f} is missing, then \code{...} will specify the +computation.} + +\item{...}{Additional arguments to pass to the function or formula specified +via \code{f}. Alternatively, if \code{f} is missing, then \code{...} is interpreted as an +expression for tidy evaluation; in addition to referring to columns +directly by name, the expression has access to \code{.data} and \code{.env} pronouns +as in \code{dplyr} verbs, and can also refer to the \code{.group_key} and +\code{.ref_time_value}. See details of \code{\link{epi_slide}}.} + +\item{before}{How far \code{before} each \code{ref_time_value} should the sliding +window extend? If provided, should be a single, non-NA, +\link[vctrs:vec_cast]{integer-compatible} number of time steps. This window +endpoint is inclusive. For example, if \code{before = 7}, and one time step is +one day, then to produce a value for a \code{ref_time_value} of January 8, we +apply the given function or formula to data (for each group present) with +\code{time_value}s from January 1 onward, as they were reported on January 8. +For typical disease surveillance sources, this will not include any data +with a \code{time_value} of January 8, and, depending on the amount of reporting +latency, may not include January 7 or even earlier \code{time_value}s. (If +instead the archive were to hold nowcasts instead of regular surveillance +data, then we would indeed expect data for \code{time_value} January 8. If it +were to hold forecasts, then we would expect data for \code{time_value}s after +January 8, and the sliding window would extend as far after each +\code{ref_time_value} as needed to include all such \code{time_value}s.)} + +\item{ref_time_values}{Reference time values / versions for sliding +computations; each element of this vector serves both as the anchor point +for the \code{time_value} window for the computation and the \code{max_version} +\code{as_of} which we fetch data in this window. If missing, then this will set +to a regularly-spaced sequence of values set to cover the range of +\code{version}s in the \code{DT} plus the \code{versions_end}; the spacing of values will +be guessed (using the GCD of the skips between values).} + +\item{time_step}{Optional function used to define the meaning of one time +step, which if specified, overrides the default choice based on the +\code{time_value} column. This function must take a positive integer and return +an object of class \code{lubridate::period}. For example, we can use \code{time_step = lubridate::hours} in order to set the time step to be one hour (this +would only be meaningful if \code{time_value} is of class \code{POSIXct}).} + +\item{new_col_name}{String indicating the name of the new column that will +contain the derivative values. Default is "slide_value"; note that setting +\code{new_col_name} equal to an existing column name will overwrite this column.} + +\item{as_list_col}{Should the slide results be held in a list column, or be +\link[tidyr:chop]{unchopped}/\link[tidyr:unnest]{unnested}? Default is \code{FALSE}, +in which case a list object returned by \code{f} would be unnested (using +\code{\link[tidyr:unnest]{tidyr::unnest()}}), and, if the slide computations output data frames, +the names of the resulting columns are given by prepending \code{new_col_name} +to the names of the list elements.} + +\item{names_sep}{String specifying the separator to use in \code{tidyr::unnest()} +when \code{as_list_col = FALSE}. Default is "_". Using \code{NULL} drops the prefix +from \code{new_col_name} entirely.} + +\item{all_versions}{(Not the same as \code{all_rows} parameter of \code{epi_slide}.) If +\code{all_versions = TRUE}, then \code{f} will be passed the version history (all +\code{version <= ref_time_value}) for rows having \code{time_value} between +\code{ref_time_value - before} and \code{ref_time_value}. Otherwise, \code{f} will be +passed only the most recent \code{version} for every unique \code{time_value}. +Default is \code{FALSE}.} +} +\description{ +Slides a given function over variables in an \code{epi_archive} +object. See the documentation for the wrapper function \code{\link[=epix_slide]{epix_slide()}} for +details. The parameter descriptions below are copied from there +} diff --git a/man/slide.grouped_epi_archive2.Rd b/man/slide.grouped_epi_archive2.Rd new file mode 100644 index 00000000..b5aac24c --- /dev/null +++ b/man/slide.grouped_epi_archive2.Rd @@ -0,0 +1,24 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/grouped_archive_new.R +\name{slide.grouped_epi_archive2} +\alias{slide.grouped_epi_archive2} +\title{Slide over grouped epi archive} +\usage{ +\method{slide}{grouped_epi_archive2}( + grouped_epi_archive, + f, + ..., + before, + ref_time_values, + time_step, + new_col_name = "slide_value", + as_list_col = FALSE, + names_sep = "_", + all_versions = FALSE +) +} +\description{ +Slides a given function over variables in a \code{grouped_epi_archive} +object. See the documentation for the wrapper function \code{\link[=epix_slide]{epix_slide()}} for +details. +} diff --git a/man/truncate_versions_after.epi_archive2.Rd b/man/truncate_versions_after.epi_archive2.Rd new file mode 100644 index 00000000..08ae40d4 --- /dev/null +++ b/man/truncate_versions_after.epi_archive2.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/archive_new.R +\name{truncate_versions_after.epi_archive2} +\alias{truncate_versions_after.epi_archive2} +\title{Truncate versions after} +\usage{ +\method{truncate_versions_after}{epi_archive2}(epi_archive, max_version) +} +\arguments{ +\item{epi_archive}{as in \code{\link{epix_truncate_versions_after}}} + +\item{max_version}{as in \code{\link{epix_truncate_versions_after}}} +} +\description{ +Filter to keep only older versions, mutating the archive by +potentially reseating but not mutating some fields. \code{DT} is likely, but not +guaranteed, to be copied. Returns the mutated archive +\link[base:invisible]{invisibly}. +} diff --git a/man/truncate_versions_after.grouped_epi_archive2.Rd b/man/truncate_versions_after.grouped_epi_archive2.Rd new file mode 100644 index 00000000..7c25950f --- /dev/null +++ b/man/truncate_versions_after.grouped_epi_archive2.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/grouped_archive_new.R +\name{truncate_versions_after.grouped_epi_archive2} +\alias{truncate_versions_after.grouped_epi_archive2} +\title{Truncate versions after a given version, grouped} +\usage{ +\method{truncate_versions_after}{grouped_epi_archive2}(grouped_epi_archive, max_version) +} +\arguments{ +\item{max_version}{as in \code{\link{epix_truncate_versions_after}}} + +\item{x}{as in \code{\link{epix_truncate_versions_after}}} +} +\description{ +Filter to keep only older versions by mutating the underlying +\code{epi_archive} using \verb{$truncate_versions_after}. Returns the mutated +\code{grouped_epi_archive} \link[base:invisible]{invisibly}. +} diff --git a/tests/testthat/test-archive_new.R b/tests/testthat/test-archive_new.R new file mode 100644 index 00000000..98f708d7 --- /dev/null +++ b/tests/testthat/test-archive_new.R @@ -0,0 +1,173 @@ +library(dplyr) + +test_that("first input must be a data.frame", { + expect_error(as_epi_archive2(c(1, 2, 3), compactify = FALSE), + regexp = "Must be of type 'data.frame'." + ) +}) + +dt <- archive_cases_dv_subset_2$DT + +test_that("data.frame must contain geo_value, time_value and version columns", { + expect_error(as_epi_archive2(select(dt, -geo_value), compactify = FALSE), + regexp = "Columns `geo_value`, `time_value`, and `version` must be present in `x`." + ) + expect_error(as_epi_archive2(select(dt, -time_value), compactify = FALSE), + regexp = "Columns `geo_value`, `time_value`, and `version` must be present in `x`." + ) + expect_error(as_epi_archive2(select(dt, -version), compactify = FALSE), + regexp = "Columns `geo_value`, `time_value`, and `version` must be present in `x`." + ) +}) + +test_that("other_keys can only contain names of the data.frame columns", { + expect_error(as_epi_archive2(dt, other_keys = "xyz", compactify = FALSE), + regexp = "`other_keys` must be contained in the column names of `x`." + ) + expect_error(as_epi_archive2(dt, other_keys = "percent_cli", compactify = FALSE), NA) +}) + +test_that("other_keys cannot contain names geo_value, time_value or version", { + expect_error(as_epi_archive2(dt, other_keys = "geo_value", compactify = FALSE), + regexp = "`other_keys` cannot contain \"geo_value\", \"time_value\", or \"version\"." + ) + expect_error(as_epi_archive2(dt, other_keys = "time_value", compactify = FALSE), + regexp = "`other_keys` cannot contain \"geo_value\", \"time_value\", or \"version\"." + ) + expect_error(as_epi_archive2(dt, other_keys = "version", compactify = FALSE), + regexp = "`other_keys` cannot contain \"geo_value\", \"time_value\", or \"version\"." + ) +}) + +test_that("Warning thrown when other_metadata contains overlapping names with geo_type or time_type fields", { + expect_warning(as_epi_archive2(dt, additional_metadata = list(geo_type = 1), compactify = FALSE), + regexp = "`additional_metadata` names overlap with existing metadata fields \"geo_type\", \"time_type\"." + ) + expect_warning(as_epi_archive2(dt, additional_metadata = list(time_type = 1), compactify = FALSE), + regexp = "`additional_metadata` names overlap with existing metadata fields \"geo_type\", \"time_type\"." + ) +}) + +test_that("epi_archives are correctly instantiated with a variety of data types", { + # Data frame + df <- data.frame( + geo_value = "ca", + time_value = as.Date("2020-01-01"), + version = as.Date("2020-01-01") + 0:19, + value = 1:20 + ) + + ea1 <- as_epi_archive2(df, compactify = FALSE) + expect_equal(key(ea1$DT), c("geo_value", "time_value", "version")) + expect_equal(ea1$additional_metadata, list()) + + ea2 <- as_epi_archive2(df, other_keys = "value", additional_metadata = list(value = df$value), compactify = FALSE) + expect_equal(key(ea2$DT), c("geo_value", "time_value", "value", "version")) + expect_equal(ea2$additional_metadata, list(value = df$value)) + + # Tibble + tib <- tibble::tibble(df, code = "x") + + ea3 <- as_epi_archive2(tib, compactify = FALSE) + expect_equal(key(ea3$DT), c("geo_value", "time_value", "version")) + expect_equal(ea3$additional_metadata, list()) + + ea4 <- as_epi_archive2(tib, other_keys = "code", additional_metadata = list(value = df$value), compactify = FALSE) + expect_equal(key(ea4$DT), c("geo_value", "time_value", "code", "version")) + expect_equal(ea4$additional_metadata, list(value = df$value)) + + # Keyed data.table + kdt <- data.table::data.table( + geo_value = "ca", + time_value = as.Date("2020-01-01"), + version = as.Date("2020-01-01") + 0:19, + value = 1:20, + code = "CA", + key = "code" + ) + + ea5 <- as_epi_archive2(kdt, compactify = FALSE) + # Key from data.table isn't absorbed when as_epi_archive2 is used + expect_equal(key(ea5$DT), c("geo_value", "time_value", "version")) + expect_equal(ea5$additional_metadata, list()) + + ea6 <- as_epi_archive2(kdt, other_keys = "value", additional_metadata = list(value = df$value), compactify = FALSE) + # Mismatched keys, but the one from as_epi_archive2 overrides + expect_equal(key(ea6$DT), c("geo_value", "time_value", "value", "version")) + expect_equal(ea6$additional_metadata, list(value = df$value)) + + # Unkeyed data.table + udt <- data.table::data.table( + geo_value = "ca", + time_value = as.Date("2020-01-01"), + version = as.Date("2020-01-01") + 0:19, + value = 1:20, + code = "CA" + ) + + ea7 <- as_epi_archive2(udt, compactify = FALSE) + expect_equal(key(ea7$DT), c("geo_value", "time_value", "version")) + expect_equal(ea7$additional_metadata, list()) + + ea8 <- as_epi_archive2(udt, other_keys = "code", additional_metadata = list(value = df$value), compactify = FALSE) + expect_equal(key(ea8$DT), c("geo_value", "time_value", "code", "version")) + expect_equal(ea8$additional_metadata, list(value = df$value)) + + # epi_df + edf1 <- jhu_csse_daily_subset %>% + select(geo_value, time_value, cases) %>% + mutate(version = max(time_value), code = "USA") + + ea9 <- as_epi_archive2(edf1, compactify = FALSE) + expect_equal(key(ea9$DT), c("geo_value", "time_value", "version")) + expect_equal(ea9$additional_metadata, list()) + + ea10 <- as_epi_archive2(edf1, other_keys = "code", additional_metadata = list(value = df$value), compactify = FALSE) + expect_equal(key(ea10$DT), c("geo_value", "time_value", "code", "version")) + expect_equal(ea10$additional_metadata, list(value = df$value)) + + # Keyed epi_df + edf2 <- data.frame( + geo_value = "al", + time_value = rep(as.Date("2020-01-01") + 0:9, 2), + version = c( + rep(as.Date("2020-01-25"), 10), + rep(as.Date("2020-01-26"), 10) + ), + cases = 1:20, + misc = "USA" + ) %>% + as_epi_df(additional_metadata = list(other_keys = "misc")) + + ea11 <- as_epi_archive2(edf2, compactify = FALSE) + expect_equal(key(ea11$DT), c("geo_value", "time_value", "version")) + expect_equal(ea11$additional_metadata, list()) + + ea12 <- as_epi_archive2(edf2, other_keys = "misc", additional_metadata = list(value = df$misc), compactify = FALSE) + expect_equal(key(ea12$DT), c("geo_value", "time_value", "misc", "version")) + expect_equal(ea12$additional_metadata, list(value = df$misc)) +}) + +test_that("`epi_archive` rejects nonunique keys", { + toy_update_tbl <- + tibble::tribble( + ~geo_value, ~age_group, ~time_value, ~version, ~value, + "us", "adult", "2000-01-01", "2000-01-02", 121, + "us", "adult", "2000-01-01", "2000-01-03", 125, # (revision) + "us", "adult", "2000-01-02", "2000-01-03", 130, + "us", "pediatric", "2000-01-01", "2000-01-02", 5 + ) %>% + mutate( + age_group = ordered(age_group, c("pediatric", "adult")), + time_value = as.Date(time_value), + version = as.Date(version) + ) + expect_error( + as_epi_archive2(toy_update_tbl), + class = "epiprocess__epi_archive_requires_unique_key" + ) + expect_error( + regexp = NA, + as_epi_archive2(toy_update_tbl, other_keys = "age_group"), + ) +}) diff --git a/tests/testthat/test-compactify.R b/tests/testthat/test-compactify.R index 4400c94a..58e97884 100644 --- a/tests/testthat/test-compactify.R +++ b/tests/testthat/test-compactify.R @@ -2,7 +2,7 @@ library(epiprocess) library(data.table) library(dplyr) -dt <- archive_cases_dv_subset$DT +dt <- archive_cases_dv_subset_2$DT dt <- filter(dt, geo_value == "ca") %>% filter(version <= "2020-06-15") %>% select(-case_rate_7d_av) diff --git a/tests/testthat/test-compactify_new.R b/tests/testthat/test-compactify_new.R new file mode 100644 index 00000000..cd53913d --- /dev/null +++ b/tests/testthat/test-compactify_new.R @@ -0,0 +1,110 @@ +library(epiprocess) +library(data.table) +library(dplyr) + +dt <- archive_cases_dv_subset_2$DT +dt <- filter(dt, geo_value == "ca") %>% + filter(version <= "2020-06-15") %>% + select(-case_rate_7d_av) + +test_that("Input for compactify must be NULL or a boolean", { + expect_error(as_epi_archive2(dt, compactify = "no")) +}) + +dt$percent_cli <- c(1:80) +dt$case_rate <- c(1:80) + +row_replace <- function(dt, row, x, y) { + # (This way of "replacing" elements appears to use copy-on-write even though + # we are working with a data.table.) + dt[row, 4] <- x + dt[row, 5] <- y + dt +} + +# Note that compactify is working on version-wise LOCF (last version of each +# observation carried forward) + +# Rows 1 should not be eliminated even if NA +dt <- row_replace(dt, 1, NA, NA) # Not LOCF + +# NOTE! We are assuming that there are no NA's in geo_value, time_value, +# and version. Even though compactify may erroneously remove the first row +# if it has all NA's, we are not testing this behaviour for now as this dataset +# has problems beyond the scope of this test + +# Rows 11 and 12 correspond to different time_values +dt <- row_replace(dt, 12, 11, 11) # Not LOCF + +# Rows 20 and 21 only differ in version +dt <- row_replace(dt, 21, 20, 20) # LOCF + +# Rows 21 and 22 only differ in version +dt <- row_replace(dt, 22, 20, 20) # LOCF + +# Row 39 comprises the first NA's +dt <- row_replace(dt, 39, NA, NA) # Not LOCF + +# Row 40 has two NA's, just like its lag, row 39 +dt <- row_replace(dt, 40, NA, NA) # LOCF + +# Row 62's values already exist in row 15, but row 15 is not a preceding row +dt <- row_replace(dt, 62, 15, 15) # Not LOCF + +# Row 73 only has one value carried over +dt <- row_replace(dt, 74, 73, 74) # Not LOCF + +dt_true <- as_tibble(as_epi_archive2(dt, compactify = TRUE)$DT) +dt_false <- as_tibble(as_epi_archive2(dt, compactify = FALSE)$DT) +dt_null <- suppressWarnings(as_tibble(as_epi_archive2(dt, compactify = NULL)$DT)) + +test_that("Warning for LOCF with compactify as NULL", { + expect_warning(as_epi_archive2(dt, compactify = NULL)) +}) + +test_that("No warning when there is no LOCF", { + expect_warning(as_epi_archive2(dt[1:5], compactify = NULL), NA) +}) + +test_that("LOCF values are ignored with compactify=FALSE", { + expect_identical(nrow(dt), nrow(dt_false)) +}) + +test_that("LOCF values are taken out with compactify=TRUE", { + dt_test <- as_tibble(as_epi_archive2(dt[-c(21, 22, 40), ], compactify = FALSE)$DT) + + expect_identical(dt_true, dt_null) + expect_identical(dt_null, dt_test) +}) + +test_that("as_of produces the same results with compactify=TRUE as with compactify=FALSE", { + ea_true <- as_epi_archive2(dt, compactify = TRUE) + ea_false <- as_epi_archive2(dt, compactify = FALSE) + + # Row 22, an LOCF row corresponding to the latest version, is omitted in + # ea_true + latest_version <- max(ea_false$DT$version) + as_of_true <- as_of(ea_true, latest_version) + as_of_false <- as_of(ea_false, latest_version) + + expect_identical(as_of_true, as_of_false) +}) + +test_that("compactify does not alter the default clobberable and observed version bounds", { + x <- tibble::tibble( + geo_value = "geo1", + time_value = as.Date("2000-01-01"), + version = as.Date("2000-01-01") + 1:5, + value = 42L + ) + ea_true <- as_epi_archive2(x, compactify = TRUE) + ea_false <- as_epi_archive2(x, compactify = FALSE) + # We say that we base the bounds on the user's `x` arg. We might mess up or + # change our minds and base things on the `DT` field (or a temporary `DT` + # variable, post-compactify) instead. Check that this test would trigger + # in that case: + expect_true(max(ea_true$DT$version) != max(ea_false$DT$version)) + # The actual test: + expect_identical(ea_true$clobberable_versions_start, ea_false$clobberable_versions_start) + expect_identical(ea_true$versions_end, ea_false$versions_end) +}) diff --git a/tests/testthat/test-epix_fill_through_version_new.R b/tests/testthat/test-epix_fill_through_version_new.R new file mode 100644 index 00000000..2b76a851 --- /dev/null +++ b/tests/testthat/test-epix_fill_through_version_new.R @@ -0,0 +1,109 @@ +test_that("epix_fill_through_version2 mirrors input when it is sufficiently up to date", { + ea_orig <- as_epi_archive2(data.table::data.table( + geo_value = "g1", time_value = as.Date("2020-01-01"), + version = 1:5, value = 1:5 + )) + some_earlier_observed_version <- 2L + ea_trivial_fill_na1 <- epix_fill_through_version2(ea_orig, some_earlier_observed_version, "na") + ea_trivial_fill_na2 <- epix_fill_through_version2(ea_orig, ea_orig$versions_end, "na") + ea_trivial_fill_locf <- epix_fill_through_version2(ea_orig, some_earlier_observed_version, "locf") + # Below, we want R6 objects to be compared based on contents rather than + # addresses. We appear to get this with `expect_identical` in `testthat` + # edition 3, which is based on `waldo::compare` rather than `base::identical`; + # `waldo::compare` in waldo >=0.3.1 appears (as of 0.4.0) to compare R6 + # objects by contents rather than address (in a way that is tested but maybe + # not guaranteed via user docs). Use `testthat::local_edition` to ensure we + # use testthat edition 3 here (use `testthat::` to prevent ambiguity with + # `readr`). + testthat::local_edition(3) + expect_identical(ea_orig, ea_trivial_fill_na1) + expect_identical(ea_orig, ea_trivial_fill_na2) + expect_identical(ea_orig, ea_trivial_fill_locf) +}) + +test_that("epix_fill_through_version2 can extend observed versions, gives expected `as_of`s", { + ea_orig <- as_epi_archive2(data.table::data.table( + geo_value = "g1", + time_value = as.Date("2020-01-01") + c(rep(0L, 5L), 1L), + version = c(1:5, 2L), + value = 1:6 + )) + first_unobserved_version <- 6L + later_unobserved_version <- 10L + ea_fill_na <- epix_fill_through_version2(ea_orig, later_unobserved_version, "na") + ea_fill_locf <- epix_fill_through_version2(ea_orig, later_unobserved_version, "locf") + + # We use testthat edition 3 features here, passing `ignore_attr` to + # `waldo::compare`. Ensure we are using edition 3: + testthat::local_edition(3) + withCallingHandlers( + { + expect_identical(ea_fill_na$versions_end, later_unobserved_version) + expect_identical(tibble::as_tibble(as_of(ea_fill_na, first_unobserved_version)), + tibble::tibble(geo_value = "g1", time_value = as.Date("2020-01-01") + 0:1, value = rep(NA_integer_, 2L)), + ignore_attr = TRUE + ) + expect_identical(ea_fill_locf$versions_end, later_unobserved_version) + expect_identical( + as_of(ea_fill_locf, first_unobserved_version), + as_of(ea_fill_locf, ea_orig$versions_end) %>% + { + attr(., "metadata")$as_of <- first_unobserved_version + . + } + ) + }, + epiprocess__snapshot_as_of_clobberable_version = function(wrn) invokeRestart("muffleWarning") + ) +}) + +test_that("epix_fill_through_version2 does not mutate x", { + for (ea_orig in list( + # vanilla case + as_epi_archive2(data.table::data.table( + geo_value = "g1", time_value = as.Date("2020-01-01"), + version = 1:5, value = 1:5 + )), + # data.table unique yielding original DT by reference special case (maybe + # having only 1 row is the trigger? having no revisions of initial values + # doesn't seem sufficient to trigger) + as_epi_archive2(tibble::tibble(geo_value = 1L, time_value = 1L, version = 1L, value = 10L)) + )) { + # We want to perform a strict comparison of the contents of `ea_orig` before + # and `ea_orig` after. `clone` + `expect_identical` based on waldo would + # sort of work, but we might want something stricter. `as.list` + + # `identical` plus a check of the DT seems to do the trick. + ea_orig_before_as_list <- as.list(ea_orig) + ea_orig_DT_before_copy <- data.table::copy(ea_orig$DT) + some_unobserved_version <- 8L + # + ea_fill_na <- epix_fill_through_version2(ea_orig, some_unobserved_version, "na") + ea_orig_after_as_list <- as.list(ea_orig) + # use identical, not expect_identical, for the R6-as-list test; latter isn't as strict + expect_true(identical(ea_orig_before_as_list, ea_orig_after_as_list)) + expect_identical(ea_orig_DT_before_copy, ea_orig$DT) + # + ea_fill_locf <- epix_fill_through_version2(ea_orig, some_unobserved_version, "locf") + ea_orig_after_as_list <- as.list(ea_orig) + expect_true(identical(ea_orig_before_as_list, ea_orig_after_as_list)) + expect_identical(ea_orig_DT_before_copy, ea_orig$DT) + } +}) + +test_that("epix_fill_through_version return with expected visibility", { + ea <- as_epi_archive(data.table::data.table( + geo_value = "g1", time_value = as.Date("2020-01-01"), + version = 1:5, value = 1:5 + )) + expect_true(withVisible(epix_fill_through_version(ea, 10L, "na"))[["visible"]]) +}) + +test_that("epix_fill_through_version2 returns same key & doesn't mutate old DT or its key", { + ea <- as_epi_archive2(tibble::tibble(geo_value = 1L, time_value = 1L, version = 1L, value = 10L)) + old_DT <- ea$DT + old_DT_copy <- data.table::copy(old_DT) + old_key <- data.table::key(ea$DT) + expect_identical(data.table::key(epix_fill_through_version2(ea, 5L, "na")$DT), old_key) + expect_identical(data.table::key(epix_fill_through_version2(ea, 5L, "locf")$DT), old_key) + expect_identical(data.table::key(ea$DT), old_key) +}) diff --git a/tests/testthat/test-epix_merge_new.R b/tests/testthat/test-epix_merge_new.R new file mode 100644 index 00000000..10041dbb --- /dev/null +++ b/tests/testthat/test-epix_merge_new.R @@ -0,0 +1,226 @@ +test_that("epix_merge requires forbids on invalid `y`", { + ea <- archive_cases_dv_subset_2 %>% + clone() + expect_error(epix_merge2(ea, data.frame(x = 1))) +}) + +test_that("epix_merge merges and carries forward updates properly", { + x <- as_epi_archive2( + data.table::as.data.table( + tibble::tribble( + ~geo_value, ~time_value, ~version, ~x_value, + # same version set for x and y + "g1", 1L, 1:3, paste0("XA", 1:3), + # versions of x surround those of y + this measurement has + # max update version beyond some others + "g1", 2L, 1:5, paste0("XB", 1:5), + # mirror case + "g1", 3L, 2L, paste0("XC", 2L), + # x has 1 version, y has 0 + "g1", 4L, 1L, paste0("XD", 1L), + # non-NA values that should be carried forward + # (version-wise LOCF) in other versions, plus NAs that + # should (similarly) be carried forward as NA (latter + # wouldn't work with an ordinary merge + post-processing + # with `data.table::nafill`) + "g1", 6L, c(1L, 3L, 5L), paste0("XE", c(1L, NA, 5L)) + ) %>% + tidyr::unchop(c(version, x_value)) %>% + dplyr::mutate(dplyr::across(c(x_value), ~ dplyr::if_else(grepl("NA", .x), NA_character_, .x))) + ) + ) + y <- as_epi_archive2( + data.table::as.data.table( + tibble::tribble( + ~geo_value, ~time_value, ~version, ~y_value, + "g1", 1L, 1:3, paste0("YA", 1:3), + "g1", 2L, 2L, paste0("YB", 2L), + "g1", 3L, 1:5, paste0("YC", 1:5), + "g1", 5L, 1L, paste0("YD", 1L), + "g1", 6L, 1:5, paste0("YE", 1:5), + ) %>% + tidyr::unchop(c(version, y_value)) %>% + dplyr::mutate(dplyr::across(c(y_value), ~ dplyr::if_else(grepl("NA", .x), NA_character_, .x))) + ) + ) + xy <- epix_merge2(x, y) + xy_expected <- as_epi_archive2( + data.table::as.data.table( + tibble::tribble( + ~geo_value, ~time_value, ~version, ~x_value, ~y_value, + "g1", 1L, 1:3, paste0("XA", 1:3), paste0("YA", 1:3), + "g1", 2L, 1:5, paste0("XB", 1:5), paste0("YB", c(NA, 2L, 2L, 2L, 2L)), + "g1", 3L, 1:5, paste0("XC", c(NA, 2L, 2L, 2L, 2L)), paste0("YC", 1:5), + "g1", 4L, 1L, paste0("XD", 1L), paste0("YD", NA), + "g1", 5L, 1L, paste0("XD", NA), paste0("YD", 1L), + "g1", 6L, 1:5, paste0("XE", c(1L, 1L, NA, NA, 5L)), paste0("YE", 1:5), + ) %>% + tidyr::unchop(c(version, x_value, y_value)) %>% + dplyr::mutate(dplyr::across(c(x_value, y_value), ~ dplyr::if_else(grepl("NA", .x), NA_character_, .x))) + ) + ) + # We rely on testthat edition 3 expect_identical using waldo, not identical. See + # test-epix_fill_through_version.R comments for details. + testthat::local_edition(3) + expect_identical(xy, xy_expected) +}) + +test_that("epix_merge forbids and warns on metadata and naming issues", { + expect_error( + epix_merge2( + as_epi_archive2(tibble::tibble(geo_value = "tx", time_value = 1L, version = 1L, x_value = 1L)), + as_epi_archive2(tibble::tibble(geo_value = "us", time_value = 1L, version = 5L, y_value = 2L)) + ), + regexp = "must have the same.*geo_type" + ) + expect_error( + epix_merge2( + as_epi_archive2(tibble::tibble(geo_value = "pa", time_value = 1L, version = 1L, x_value = 1L)), + as_epi_archive2(tibble::tibble(geo_value = "pa", time_value = as.Date("2020-01-01"), version = 5L, y_value = 2L)) + ), + regexp = "must have the same.*time_type" + ) + expect_error( + epix_merge2( + as_epi_archive2(tibble::tibble(geo_value = 1L, time_value = 1L, version = 1L, value = 1L)), + as_epi_archive2(tibble::tibble(geo_value = 1L, time_value = 1L, version = 1L, value = 2L)) + ), + regexp = "overlapping.*names" + ) + expect_warning( + epix_merge2( + as_epi_archive2(tibble::tibble(geo_value = 1L, time_value = 1L, version = 1L, x_value = 1L), + additional_metadata = list("updates_fetched" = lubridate::ymd_hms("2022-05-01 16:00:00", tz = "UTC")) + ), + as_epi_archive2(tibble::tibble(geo_value = 1L, time_value = 1L, version = 1L, y_value = 2L)) + ), + regexp = "x\\$additional_metadata", + class = "epiprocess__epix_merge_ignores_additional_metadata" + ) + expect_warning( + epix_merge2( + as_epi_archive2(tibble::tibble(geo_value = 1L, time_value = 1L, version = 1L, x_value = 1L)), + as_epi_archive2(tibble::tibble(geo_value = 1L, time_value = 1L, version = 1L, y_value = 2L), + additional_metadata = list("updates_fetched" = lubridate::ymd_hms("2022-05-01 16:00:00", tz = "UTC")) + ) + ), + regexp = "y\\$additional_metadata", + class = "epiprocess__epix_merge_ignores_additional_metadata" + ) +}) + +# use `local` to prevent accidentally using the x, y, xy bindings here +# elsewhere, while allowing reuse across a couple tests +local({ + x <- as_epi_archive2(tibble::tibble(geo_value = 1L, time_value = 1L, version = 1L, x_value = 1L), + clobberable_versions_start = 1L, versions_end = 10L + ) + y <- as_epi_archive2(tibble::tibble(geo_value = 1L, time_value = 1L, version = 1L, y_value = 2L), + clobberable_versions_start = 3L, versions_end = 10L + ) + xy <- epix_merge2(x, y) + test_that("epix_merge considers partially-clobberable row to be clobberable", { + expect_identical(xy$clobberable_versions_start, 1L) + }) + test_that("epix_merge result uses versions_end metadata not max version val", { + expect_identical(xy$versions_end, 10L) + }) +}) + +local({ + x <- as_epi_archive2( + tibble::tibble(geo_value = 1L, time_value = 1L, version = 1L, x_value = 10L), + clobberable_versions_start = 1L, + versions_end = 3L + ) + y <- as_epi_archive2( + tibble::tibble(geo_value = 1L, time_value = 1L, version = 5L, y_value = 20L), + clobberable_versions_start = 1L + ) + test_that('epix_merge forbids on sync default or "forbid"', { + expect_error(epix_merge2(x, y), + class = "epiprocess__epix_merge_unresolved_sync" + ) + expect_error(epix_merge2(x, y, sync = "forbid"), + class = "epiprocess__epix_merge_unresolved_sync" + ) + }) + test_that('epix_merge sync="na" works', { + expect_equal( + epix_merge2(x, y, sync = "na"), + as_epi_archive2(tibble::tribble( + ~geo_value, ~time_value, ~version, ~x_value, ~y_value, + 1L, 1L, 1L, 10L, NA_integer_, # x updated, y not observed yet + 1L, 1L, 4L, NA_integer_, NA_integer_, # NA-ing out x, y not observed yet + 1L, 1L, 5L, NA_integer_, 20L, # x still NA, y updated + # (we should not have a y vals -> NA update here; version 5 should be + # the `versions_end` of the result) + ), clobberable_versions_start = 1L) + ) + }) + test_that('epix_merge sync="locf" works', { + expect_equal( + epix_merge2(x, y, sync = "locf"), + as_epi_archive2(tibble::tribble( + ~geo_value, ~time_value, ~version, ~x_value, ~y_value, + 1L, 1L, 1L, 10L, NA_integer_, # x updated, y not observed yet + 1L, 1L, 5L, 10L, 20L, # x LOCF'd, y updated + ), clobberable_versions_start = 1L) + ) + }) + test_that('epix_merge sync="truncate" works', { + expect_equal( + epix_merge2(x, y, sync = "truncate"), + as_epi_archive2(tibble::tribble( + ~geo_value, ~time_value, ~version, ~x_value, ~y_value, + 1L, 1L, 1L, 10L, NA_integer_, # x updated, y not observed yet + # y's update beyond x's last update has been truncated + ), clobberable_versions_start = 1L, versions_end = 3L) + ) + }) + x_no_conflict <- as_epi_archive2(tibble::tibble(geo_value = 1L, time_value = 1L, version = 1L, x_value = 10L)) + y_no_conflict <- as_epi_archive2(tibble::tibble(geo_value = 1L, time_value = 1L, version = 1L, y_value = 20L)) + xy_no_conflict_expected <- as_epi_archive2(tibble::tribble( + ~geo_value, ~time_value, ~version, ~x_value, ~y_value, + 1L, 1L, 1L, 10L, 20L, # x updated, y not observed yet + )) + test_that('epix_merge sync="forbid" on no-conflict works', { + expect_equal( + epix_merge2(x_no_conflict, y_no_conflict, sync = "forbid"), + xy_no_conflict_expected + ) + }) + test_that('epix_merge sync="na" on no-conflict works', { + # This test is the main reason for these no-conflict tests. We want to make + # sure that we don't add an unnecessary NA-ing-out version beyond a common + # versions_end. + expect_equal( + epix_merge2(x_no_conflict, y_no_conflict, sync = "na"), + xy_no_conflict_expected + ) + }) + test_that('epix_merge sync="locf" on no-conflict works', { + expect_equal( + epix_merge2(x_no_conflict, y_no_conflict, sync = "locf"), + xy_no_conflict_expected + ) + }) + test_that('epix_merge sync="truncate" on no-conflict works', { + expect_equal( + epix_merge2(x_no_conflict, y_no_conflict, sync = "truncate"), + xy_no_conflict_expected + ) + }) +}) + + +test_that('epix_merge sync="na" balks if do not know next_after', { + expect_error( + epix_merge2( + as_epi_archive2(tibble::tibble(geo_value = 1L, time_value = 1L, version = as.POSIXct(as.Date("2020-01-01")), x_value = 10L)), + as_epi_archive2(tibble::tibble(geo_value = 1L, time_value = 1L, version = as.POSIXct(as.Date("2020-01-02")), y_value = 20L)), + sync = "na" + ), + regexp = "no applicable method.*next_after" + ) +}) diff --git a/tests/testthat/test-epix_slide_new.R b/tests/testthat/test-epix_slide_new.R new file mode 100644 index 00000000..49ef5e41 --- /dev/null +++ b/tests/testthat/test-epix_slide_new.R @@ -0,0 +1,810 @@ +library(dplyr) + +test_that("epix_slide2 only works on an epi_archive", { + expect_error(epix_slide2(data.frame(x = 1))) +}) + +x <- tibble::tribble( + ~version, ~time_value, ~binary, + 4, c(1:3), 2^(1:3), + 5, c(1:2, 4), 2^(4:6), + 6, c(1:2, 4:5), 2^(7:10), + 7, 2:6, 2^(11:15) +) %>% + tidyr::unnest(c(time_value, binary)) + +xx <- bind_cols(geo_value = rep("x", 15), x) %>% + as_epi_archive2() + +test_that("epix_slide2 works as intended", { + xx1 <- xx %>% + group_by(.data$geo_value) %>% + epix_slide2( + f = ~ sum(.x$binary), + before = 2, + new_col_name = "sum_binary" + ) + + xx2 <- tibble( + geo_value = rep("x", 4), + time_value = c(4, 5, 6, 7), + sum_binary = c( + 2^3 + 2^2, + 2^6 + 2^3, + 2^10 + 2^9, + 2^15 + 2^14 + ) + ) %>% + group_by(geo_value) + + expect_identical(xx1, xx2) # * + + xx3 <- xx %>% + group_by( + dplyr::across(dplyr::all_of("geo_value")) + ) %>% + slide( + f = ~ sum(.x$binary), + before = 2, + new_col_name = "sum_binary" + ) + + expect_identical(xx1, xx3) # This and * imply xx2 and xx3 are identical + + # function interface + xx4 <- xx %>% + group_by(.data$geo_value) %>% + epix_slide2(f = function(x, gk, rtv) { + tibble::tibble(sum_binary = sum(x$binary)) + }, before = 2, names_sep = NULL) + + expect_identical(xx1, xx4) + + # tidyeval interface + xx5 <- xx %>% + group_by(.data$geo_value) %>% + epix_slide2( + sum_binary = sum(binary), + before = 2 + ) + + expect_identical(xx1, xx5) +}) + +test_that("epix_slide2 works as intended with `as_list_col=TRUE`", { + xx_dfrow1 <- xx %>% + group_by(.data$geo_value) %>% + epix_slide2( + f = ~ data.frame(bin_sum = sum(.x$binary)), + before = 2, + as_list_col = TRUE + ) + + xx_dfrow2 <- tibble( + geo_value = rep("x", 4), + time_value = c(4, 5, 6, 7), + slide_value = + c( + 2^3 + 2^2, + 2^6 + 2^3, + 2^10 + 2^9, + 2^15 + 2^14 + ) %>% + purrr::map(~ data.frame(bin_sum = .x)) + ) %>% + group_by(geo_value) + + expect_identical(xx_dfrow1, xx_dfrow2) # * + + xx_dfrow3 <- xx %>% + group_by(dplyr::across(dplyr::all_of("geo_value"))) %>% + slide( + f = ~ data.frame(bin_sum = sum(.x$binary)), + before = 2, + as_list_col = TRUE + ) + + expect_identical(xx_dfrow1, xx_dfrow3) # This and * Imply xx_dfrow2 and xx_dfrow3 are identical + + xx_df1 <- xx %>% + group_by(.data$geo_value) %>% + epix_slide2( + f = ~ data.frame(bin = .x$binary), + before = 2, + as_list_col = TRUE + ) + + xx_df2 <- tibble( + geo_value = rep("x", 4), + time_value = c(4, 5, 6, 7), + slide_value = + list( + c(2^3, 2^2), + c(2^6, 2^3), + c(2^10, 2^9), + c(2^15, 2^14) + ) %>% + purrr::map(~ data.frame(bin = rev(.x))) + ) %>% + group_by(geo_value) + + expect_identical(xx_df1, xx_df2) + + xx_scalar1 <- xx %>% + group_by(.data$geo_value) %>% + epix_slide2( + f = ~ sum(.x$binary), + before = 2, + as_list_col = TRUE + ) + + xx_scalar2 <- tibble( + geo_value = rep("x", 4), + time_value = c(4, 5, 6, 7), + slide_value = + list( + 2^3 + 2^2, + 2^6 + 2^3, + 2^10 + 2^9, + 2^15 + 2^14 + ) + ) %>% + group_by(geo_value) + + expect_identical(xx_scalar1, xx_scalar2) + + xx_vec1 <- xx %>% + group_by(.data$geo_value) %>% + epix_slide2( + f = ~ .x$binary, + before = 2, + as_list_col = TRUE + ) + + xx_vec2 <- tibble( + geo_value = rep("x", 4), + time_value = c(4, 5, 6, 7), + slide_value = + list( + c(2^3, 2^2), + c(2^6, 2^3), + c(2^10, 2^9), + c(2^15, 2^14) + ) %>% + purrr::map(rev) + ) %>% + group_by(geo_value) + + expect_identical(xx_vec1, xx_vec2) +}) + +test_that("epix_slide2 `before` validation works", { + expect_error( + slide(xx, f = ~ sum(.x$binary)), + "`before` is required" + ) + expect_error( + slide(xx, f = ~ sum(.x$binary), before = NA), + "Assertion on 'before' failed: May not be NA" + ) + expect_error( + slide(xx, f = ~ sum(.x$binary), before = -1), + "Assertion on 'before' failed: Element 1 is not >= 0" + ) + expect_error(slide(xx, f = ~ sum(.x$binary), before = 1.5), + regexp = "before", + class = "vctrs_error_incompatible_type" + ) + # We might want to allow this at some point (issue #219): + expect_error(slide(xx, f = ~ sum(.x$binary), before = Inf), + regexp = "before", + class = "vctrs_error_incompatible_type" + ) + # (wrapper shouldn't introduce a value:) + expect_error(epix_slide2(xx, f = ~ sum(.x$binary)), "`before` is required") + # These `before` values should be accepted: + expect_error( + slide(xx, f = ~ sum(.x$binary), before = 0), + NA + ) + expect_error( + slide(xx, f = ~ sum(.x$binary), before = 2L), + NA + ) + expect_error( + slide(xx, f = ~ sum(.x$binary), before = 365000), + NA + ) +}) + +test_that("quosure passing issue in epix_slide2 is resolved + other potential issues", { + # (First part adapted from @examples) + time_values <- seq(as.Date("2020-06-01"), + as.Date("2020-06-02"), + by = "1 day" + ) + # We only have one non-version, non-time key in the example archive. Add + # another so that we don't accidentally pass tests due to accidentally + # matching the default grouping. + ea <- as_epi_archive2( + archive_cases_dv_subset$DT %>% + dplyr::mutate(modulus = seq_len(nrow(.)) %% 5L), + other_keys = "modulus", + compactify = TRUE + ) + reference_by_modulus <- ea %>% + group_by(modulus) %>% + epix_slide2( + f = ~ mean(.x$case_rate_7d_av), + before = 2, + ref_time_values = time_values, + new_col_name = "case_rate_3d_av" + ) + reference_by_neither <- ea %>% + group_by() %>% + epix_slide2( + f = ~ mean(.x$case_rate_7d_av), + before = 2, + ref_time_values = time_values, + new_col_name = "case_rate_3d_av" + ) + # test the passing-something-that-must-be-enquosed behavior: + # + # (S3 group_by behavior for this case is the `reference_by_modulus`) + expect_identical( + ea %>% group_by(modulus) %>% slide( + f = ~ mean(.x$case_rate_7d_av), + before = 2, + ref_time_values = time_values, + new_col_name = "case_rate_3d_av" + ), + reference_by_modulus + ) + # test the .data pronoun behavior: + expect_identical( + epix_slide2( + x = ea %>% group_by(.data$modulus), + f = ~ mean(.x$case_rate_7d_av), + before = 2, + ref_time_values = time_values, + new_col_name = "case_rate_3d_av" + ), + reference_by_modulus + ) + expect_identical( + ea %>% group_by(.data$modulus) %>% slide( + f = ~ mean(.x$case_rate_7d_av), + before = 2, + ref_time_values = time_values, + new_col_name = "case_rate_3d_av" + ), + reference_by_modulus + ) + # test the passing across-all-of-string-literal behavior: + expect_identical( + epix_slide2( + x = ea %>% group_by(dplyr::across(all_of("modulus"))), + f = ~ mean(.x$case_rate_7d_av), + before = 2, + ref_time_values = time_values, + new_col_name = "case_rate_3d_av" + ), + reference_by_modulus + ) + expect_identical( + ea %>% group_by(across(all_of("modulus"))) %>% slide( + f = ~ mean(.x$case_rate_7d_av), + before = 2, + ref_time_values = time_values, + new_col_name = "case_rate_3d_av" + ), + reference_by_modulus + ) + # test the passing-across-all-of-string-var behavior: + my_group_by <- "modulus" + expect_identical( + epix_slide2( + x = ea %>% group_by(dplyr::across(tidyselect::all_of(my_group_by))), + f = ~ mean(.x$case_rate_7d_av), + before = 2, + ref_time_values = time_values, + new_col_name = "case_rate_3d_av" + ), + reference_by_modulus + ) + expect_identical( + ea %>% group_by(dplyr::across(tidyselect::all_of(my_group_by))) %>% slide( + f = ~ mean(.x$case_rate_7d_av), + before = 2, + ref_time_values = time_values, + new_col_name = "case_rate_3d_av" + ), + reference_by_modulus + ) + # test the default behavior (default in this case should just be grouping by neither): + expect_identical( + epix_slide2( + x = ea, + f = ~ mean(.x$case_rate_7d_av), + before = 2, + ref_time_values = time_values, + new_col_name = "case_rate_3d_av" + ), + reference_by_neither + ) + expect_identical( + ea %>% slide( + f = ~ mean(.x$case_rate_7d_av), + before = 2, + ref_time_values = time_values, + new_col_name = "case_rate_3d_av" + ), + reference_by_neither + ) +}) + +ea <- tibble::tribble( + ~version, ~time_value, ~binary, + 2, 1:1, 2^(1:1), + 3, 1:2, 2^(2:1), + 4, 1:3, 2^(3:1), + 5, 1:4, 2^(4:1), + 6, 1:5, 2^(5:1), + 7, 1:6, 2^(6:1) +) %>% + tidyr::unnest(c(time_value, binary)) %>% + mutate(geo_value = "x") %>% + as_epi_archive2() + +test_that("epix_slide2 with all_versions option has access to all older versions", { + library(data.table) + # Make sure we're using testthat edition 3, where `expect_identical` doesn't + # actually mean `base::identical` but something more content-based using + # `waldo` package: + testthat::local_edition(3) + + slide_fn <- function(x, gk, rtv) { + return(tibble( + n_versions = length(unique(x$DT$version)), + n_row = nrow(x$DT), + dt_class1 = class(x$DT)[[1L]], + dt_key = list(key(x$DT)) + )) + } + + ea_orig_mirror <- ea %>% clone(deep = TRUE) + ea_orig_mirror$DT <- copy(ea_orig_mirror$DT) + + result1 <- ea %>% + group_by() %>% + epix_slide2( + f = slide_fn, + before = 10^3, + names_sep = NULL, + all_versions = TRUE + ) + + expect_true(inherits(result1, "tbl_df")) + + result2 <- tibble::tribble( + ~time_value, ~n_versions, ~n_row, ~dt_class1, ~dt_key, + 2, 1L, sum(1:1), "data.table", key(ea$DT), + 3, 2L, sum(1:2), "data.table", key(ea$DT), + 4, 3L, sum(1:3), "data.table", key(ea$DT), + 5, 4L, sum(1:4), "data.table", key(ea$DT), + 6, 5L, sum(1:5), "data.table", key(ea$DT), + 7, 6L, sum(1:6), "data.table", key(ea$DT), + ) + + expect_identical(result1, result2) # * + + result3 <- ea %>% + group_by() %>% + slide( + f = slide_fn, + before = 10^3, + names_sep = NULL, + all_versions = TRUE + ) + + expect_identical(result1, result3) # This and * Imply result2 and result3 are identical + + # formula interface + result4 <- ea %>% + group_by() %>% + epix_slide2( + f = ~ slide_fn(.x, .y), + before = 10^3, + names_sep = NULL, + all_versions = TRUE + ) + + expect_identical(result1, result4) # This and * Imply result2 and result4 are identical + + # tidyeval interface + result5 <- ea %>% + group_by() %>% + epix_slide2( + data = slide_fn( + .x, + stop("slide_fn doesn't use group key, no need to prepare it") + ), + before = 10^3, + names_sep = NULL, + all_versions = TRUE + ) + + expect_identical(result1, result5) # This and * Imply result2 and result5 are identical + expect_identical(ea, ea_orig_mirror) # We shouldn't have mutated ea +}) + +test_that("as_of and epix_slide2 with long enough window are compatible", { + library(data.table) + testthat::local_edition(3) + + # For all_versions = FALSE: + + f1 <- function(x, gk, rtv) { + tibble( + diff_mean = mean(diff(x$binary)) + ) + } + ref_time_value1 <- 5 + + expect_identical( + ea %>% as_of(ref_time_value1) %>% f1() %>% mutate(time_value = ref_time_value1, .before = 1L), + ea %>% slide(f1, before = 1000L, ref_time_values = ref_time_value1, names_sep = NULL) + ) + + # For all_versions = TRUE: + + f2 <- function(x, gk, rtv) { + x %>% + # extract time&version-lag-1 data: + epix_slide2( + function(subx, subgk, rtv) { + tibble(data = list( + subx %>% + filter(time_value == attr(subx, "metadata")$as_of - 1) %>% + rename(real_time_value = time_value, lag1 = binary) + )) + }, + before = 1, names_sep = NULL + ) %>% + # assess as nowcast: + unnest(data) %>% + inner_join(x %>% as_of(x$versions_end), by = setdiff(key(x$DT), c("version"))) %>% + summarize(mean_abs_delta = mean(abs(binary - lag1))) + } + ref_time_value2 <- 5 + + expect_identical( + ea %>% as_of(ref_time_value2, all_versions = TRUE) %>% f2() %>% mutate(time_value = ref_time_value2, .before = 1L), + ea %>% slide(f2, before = 1000L, ref_time_values = ref_time_value2, all_versions = TRUE, names_sep = NULL) + ) + + # Test the same sort of thing when grouping by geo in an archive with multiple geos. + ea_multigeo <- ea %>% clone() + ea_multigeo$DT <- rbind( + ea_multigeo$DT, + copy(ea_multigeo$DT)[, geo_value := "y"][, binary := -binary][] + ) + setkeyv(ea_multigeo$DT, key(ea$DT)) + + expect_identical( + ea_multigeo %>% + group_by(geo_value) %>% + epix_slide2(f2, before = 1000L, ref_time_values = ref_time_value2, all_versions = TRUE, names_sep = NULL) %>% + filter(geo_value == "x"), + ea %>% # using `ea` here is like filtering `ea_multigeo` to `geo_value=="x"` + epix_as_of2(ref_time_value2, all_versions = TRUE) %>% + f2() %>% + transmute(geo_value = "x", time_value = ref_time_value2, mean_abs_delta) %>% + group_by(geo_value) + ) +}) + +test_that("epix_slide2 `f` is passed an ungrouped `epi_archive` when `all_versions=TRUE`", { + slide_fn <- function(x, gk, rtv) { + expect_true(is_epi_archive2(x)) + return(NA) + } + + ea %>% + group_by() %>% + epix_slide2( + f = slide_fn, + before = 1, + ref_time_values = 5, + new_col_name = "out", + all_versions = TRUE + ) +}) + +test_that("epix_slide2 with all_versions option works as intended", { + xx1 <- xx %>% + group_by(.data$geo_value) %>% + epix_slide2( + f = ~ sum(.x$DT$binary), + before = 2, + new_col_name = "sum_binary", + all_versions = TRUE + ) + + xx2 <- tibble( + geo_value = rep("x", 4), + time_value = c(4, 5, 6, 7), + sum_binary = c( + 2^3 + 2^2, + 2^6 + 2^3, + 2^10 + 2^9 + 2^6, + 2^15 + 2^14 + 2^10 + ) + ) %>% + group_by(geo_value) + + expect_identical(xx1, xx2) # * + + xx3 <- xx %>% + group_by(dplyr::across(dplyr::all_of("geo_value"))) %>% + slide( + f = ~ sum(.x$DT$binary), + before = 2, + new_col_name = "sum_binary", + all_versions = TRUE + ) + + expect_identical(xx1, xx3) # This and * Imply xx2 and xx3 are identical +}) + +# XXX currently, we're using a stopgap measure of having `epix_slide2` always +# output a (grouped/ungrouped) tibble while we think about the class, columns, +# and attributes of `epix_slide2` output more carefully. We might bring this test +# back depending on the decisions there: +# +# test_that("`epix_slide2` uses `versions_end` as a resulting `epi_df`'s `as_of`", { +# ea_updated_stale = ea$clone() +# ea_updated_stale$versions_end <- ea_updated_stale$versions_end + 3 # (dbl) +# # +# expect_identical( +# ea_updated_stale %>% +# group_by(geo_value) %>% +# epix_slide2(~ slice_head(.x, n = 1L), before = 10L) %>% +# ungroup() %>% +# attr("metadata") %>% +# .$as_of, +# 10 +# ) +# }) + +test_that("epix_slide2 works with 0-row computation outputs", { + epix_slide_empty <- function(ea, ...) { + ea %>% + epix_slide2(before = 5L, ..., function(x, gk, rtv) { + tibble::tibble() + }) + } + expect_identical( + ea %>% + epix_slide_empty(), + tibble::tibble( + time_value = ea$DT$version[integer(0)] + ) + ) + expect_identical( + ea %>% + group_by(geo_value) %>% + epix_slide_empty(), + tibble::tibble( + geo_value = ea$DT$geo_value[integer(0)], + time_value = ea$DT$version[integer(0)] + ) %>% + # new_epi_df(geo_type = ea$geo_type, time_type = ea$time_type, + # as_of = ea$versions_end) %>% + group_by(geo_value) + ) + # with `all_versions=TRUE`, we have something similar but never get an + # `epi_df`: + expect_identical( + ea %>% + epix_slide_empty(all_versions = TRUE), + tibble::tibble( + time_value = ea$DT$version[integer(0)] + ) + ) + expect_identical( + ea %>% + group_by(geo_value) %>% + epix_slide_empty(all_versions = TRUE), + tibble::tibble( + geo_value = ea$DT$geo_value[integer(0)], + time_value = ea$DT$version[integer(0)] + ) %>% + group_by(geo_value) + ) +}) + +# test_that("epix_slide grouped by geo can produce `epi_df` output", { +# # This is a characterization test. Not sure we actually want this behavior; +# # https://github.com/cmu-delphi/epiprocess/pull/290#issuecomment-1489099157 +# expect_identical( +# ea %>% +# group_by(geo_value) %>% +# epix_slide(before = 5L, function(x,g) { +# tibble::tibble(value = 42) +# }, names_sep = NULL), +# tibble::tibble( +# geo_value = "x", +# time_value = epix_slide_ref_time_values_default(ea), +# value = 42 +# ) %>% +# new_epi_df(as_of = ea$versions_end) +# ) +# }) + +test_that("epix_slide alerts if the provided f doesn't take enough args", { + f_xgt <- function(x, g, t) dplyr::tibble(value = mean(x$binary), count = length(x$binary)) + # If `regexp` is NA, asserts that there should be no errors/messages. + expect_error(epix_slide2(xx, f = f_xgt, before = 2L), regexp = NA) + expect_warning(epix_slide2(xx, f = f_xgt, before = 2L), regexp = NA) + + f_x_dots <- function(x, ...) dplyr::tibble(value = mean(x$binary), count = length(x$binary)) + expect_warning(epix_slide2(xx, f_x_dots, before = 2L), + class = "epiprocess__assert_sufficient_f_args__mandatory_f_args_passed_to_f_dots" + ) +}) + +test_that("epix_slide2 computation via formula can use ref_time_value", { + xx_ref <- tibble( + geo_value = rep("x", 4), + time_value = c(4, 5, 6, 7), + slide_value = c(4, 5, 6, 7) + ) %>% + group_by(geo_value) + + xx1 <- xx %>% + group_by(.data$geo_value) %>% + epix_slide2( + f = ~.ref_time_value, + before = 2 + ) + + expect_identical(xx1, xx_ref) + + xx2 <- xx %>% + group_by(.data$geo_value) %>% + epix_slide2( + f = ~.z, + before = 2 + ) + + expect_identical(xx2, xx_ref) + + xx3 <- xx %>% + group_by(.data$geo_value) %>% + epix_slide2( + f = ~..3, + before = 2 + ) + + expect_identical(xx3, xx_ref) +}) + +test_that("epix_slide2 computation via function can use ref_time_value", { + xx_ref <- tibble( + geo_value = rep("x", 4), + time_value = c(4, 5, 6, 7), + slide_value = c(4, 5, 6, 7) + ) %>% + group_by(geo_value) + + xx1 <- xx %>% + group_by(.data$geo_value) %>% + epix_slide2( + f = function(x, g, t) t, + before = 2 + ) + + expect_identical(xx1, xx_ref) +}) + +test_that("epix_slide2 computation via dots can use ref_time_value and group", { + # ref_time_value + xx_ref <- tibble( + geo_value = rep("x", 4), + time_value = c(4, 5, 6, 7), + slide_value = c(4, 5, 6, 7) + ) %>% + group_by(geo_value) + + xx1 <- xx %>% + group_by(.data$geo_value) %>% + epix_slide2( + before = 2, + slide_value = .ref_time_value + ) + + expect_identical(xx1, xx_ref) + + # group_key + xx_ref <- tibble( + geo_value = rep("x", 4), + time_value = c(4, 5, 6, 7), + slide_value = "x" + ) %>% + group_by(geo_value) + + # Use group_key column + xx3 <- xx %>% + group_by(.data$geo_value) %>% + epix_slide2( + before = 2, + slide_value = .group_key$geo_value + ) + + expect_identical(xx3, xx_ref) + + # Use entire group_key object + expect_error( + xx %>% + group_by(.data$geo_value) %>% + epix_slide2( + before = 2, + slide_value = nrow(.group_key) + ), + NA + ) +}) + +test_that("epix_slide2 computation via dots outputs the same result using col names and the data var", { + xx_ref <- xx %>% + group_by(.data$geo_value) %>% + epix_slide2( + before = 2, + sum_binary = sum(time_value) + ) + + xx1 <- xx %>% + group_by(.data$geo_value) %>% + epix_slide2( + before = 2, + sum_binary = sum(.x$time_value) + ) + + expect_identical(xx1, xx_ref) + + xx2 <- xx %>% + group_by(.data$geo_value) %>% + epix_slide2( + before = 2, + sum_binary = sum(.data$time_value) + ) + + expect_identical(xx2, xx_ref) +}) + +test_that("`epix_slide2` doesn't decay date output", { + expect_true( + xx$DT %>% + as_tibble() %>% + mutate(across(c(time_value, version), ~ as.Date("2000-01-01") + .x - 1L)) %>% + as_epi_archive2() %>% + epix_slide2(before = 5L, ~ attr(.x, "metadata")$as_of) %>% + `[[`("slide_value") %>% + inherits("Date") + ) +}) + +test_that("`epix_slide2` can access objects inside of helper functions", { + helper <- function(archive_haystack, time_value_needle) { + archive_haystack %>% epix_slide2(has_needle = time_value_needle %in% time_value, before = 365000L) + } + expect_error( + helper(archive_cases_dv_subset_2, as.Date("2021-01-01")), + NA + ) + expect_error( + helper(xx, 3L), + NA + ) +}) diff --git a/tests/testthat/test-grouped_epi_archive_new.R b/tests/testthat/test-grouped_epi_archive_new.R new file mode 100644 index 00000000..8f0133b9 --- /dev/null +++ b/tests/testthat/test-grouped_epi_archive_new.R @@ -0,0 +1,104 @@ +test_that("Grouping, regrouping, and ungrouping archives works as intended", { + # From an example: + library(dplyr) + toy_archive <- + tribble( + ~geo_value, ~age_group, ~time_value, ~version, ~value, + "us", "adult", "2000-01-01", "2000-01-02", 121, + "us", "pediatric", "2000-01-02", "2000-01-03", 5, # (addition) + "us", "adult", "2000-01-01", "2000-01-03", 125, # (revision) + "us", "adult", "2000-01-02", "2000-01-03", 130 # (addition) + ) %>% + mutate( + age_group = ordered(age_group, c("pediatric", "adult")), + time_value = as.Date(time_value), + version = as.Date(version) + ) %>% + as_epi_archive2(other_keys = "age_group") + + # Ensure that we're using testthat edition 3's idea of "identical", which is + # not as strict as `identical`: + testthat::local_edition(3) + + # Test equivalency claims in example: + by_both_keys <- toy_archive %>% group_by(geo_value, age_group) + expect_identical( + by_both_keys, + toy_archive %>% group_by(geo_value) %>% group_by(age_group, .add = TRUE) + ) + grouping_cols <- c("geo_value", "age_group") + expect_identical( + by_both_keys, + toy_archive %>% group_by(across(all_of(grouping_cols))) + ) + + expect_identical( + toy_archive %>% group_by(geo_value), + toy_archive %>% group_by(geo_value, age_group) %>% ungroup(age_group) + ) + + # Test `.drop` behavior: + expect_error(toy_archive %>% group_by(.drop = "bogus"), + regexp = "Must be of type 'logical', not 'character'" + ) + expect_warning(toy_archive %>% group_by(.drop = FALSE), + class = "epiprocess__group_by_epi_archive__drop_FALSE_no_factors" + ) + expect_warning(toy_archive %>% group_by(geo_value, .drop = FALSE), + class = "epiprocess__group_by_epi_archive__drop_FALSE_no_factors" + ) + expect_warning( + grouped_factor_then_nonfactor <- + toy_archive %>% group_by(age_group, geo_value, .drop = FALSE), + class = "epiprocess__group_by_epi_archive__drop_FALSE_nonfactor_after_factor" + ) + expect_identical( + grouped_factor_then_nonfactor %>% + epix_slide2(before = 10, s = sum(value)), + tibble::tribble( + ~age_group, ~geo_value, ~time_value, ~s, + "pediatric", NA_character_, "2000-01-02", 0, + "adult", "us", "2000-01-02", 121, + "pediatric", "us", "2000-01-03", 5, + "adult", "us", "2000-01-03", 255 + ) %>% + mutate( + age_group = ordered(age_group, c("pediatric", "adult")), + time_value = as.Date(time_value) + ) %>% + # # See + # # https://github.com/cmu-delphi/epiprocess/pull/290#issuecomment-1489099157 + # # and + # # https://github.com/cmu-delphi/epiprocess/pull/311#issuecomment-1535149256 + # # for why this is commented out, pending some design + # # decisions. + # # + # as_epi_df(geo_type = "nation", # bug; want "custom" from NA; issue #242 + # as_of = as.Date("2000-01-03"), + # additional_metadata = list(other_keys = "age_group")) %>% + # # put back in expected order; see issue #166: + # select(age_group, geo_value, time_value, s) %>% + group_by(age_group, geo_value, .drop = FALSE) + ) + expect_identical( + toy_archive %>% + group_by(geo_value, age_group, .drop = FALSE) %>% + epix_slide2(before = 10, s = sum(value)), + tibble::tribble( + ~geo_value, ~age_group, ~time_value, ~s, + "us", "pediatric", "2000-01-02", 0, + "us", "adult", "2000-01-02", 121, + "us", "pediatric", "2000-01-03", 5, + "us", "adult", "2000-01-03", 255 + ) %>% + mutate( + age_group = ordered(age_group, c("pediatric", "adult")), + time_value = as.Date(time_value) + ) %>% + # as_epi_df(as_of = as.Date("2000-01-03"), + # additional_metadata = list(other_keys = "age_group")) %>% + # # put back in expected order; see issue #166: + # select(geo_value, age_group, time_value, s) %>% + group_by(geo_value, age_group, .drop = FALSE) + ) +}) diff --git a/tests/testthat/test-methods-epi_archive_new.R b/tests/testthat/test-methods-epi_archive_new.R new file mode 100644 index 00000000..eb2c14be --- /dev/null +++ b/tests/testthat/test-methods-epi_archive_new.R @@ -0,0 +1,136 @@ +library(dplyr) + +ea <- archive_cases_dv_subset_2 %>% + clone() + +ea2_data <- tibble::tribble( + ~geo_value, ~time_value, ~version, ~cases, + "ca", "2020-06-01", "2020-06-01", 1, + "ca", "2020-06-01", "2020-06-02", 2, + # + "ca", "2020-06-02", "2020-06-02", 0, + "ca", "2020-06-02", "2020-06-03", 1, + "ca", "2020-06-02", "2020-06-04", 2, + # + "ca", "2020-06-03", "2020-06-03", 1, + # + "ca", "2020-06-04", "2020-06-04", 4, +) %>% + dplyr::mutate(dplyr::across(c(time_value, version), as.Date)) + +# epix_as_of tests +test_that("epix_as_of behaves identically to as_of method", { + expect_identical( + epix_as_of2(ea, max_version = min(ea$DT$version)), + ea %>% as_of(max_version = min(ea$DT$version)) + ) +}) + +test_that("Errors are thrown due to bad as_of inputs", { + # max_version cannot be of string class rather than date class + expect_error(ea %>% as_of("2020-01-01")) + # max_version cannot be later than latest version + expect_error(ea %>% as_of(as.Date("2025-01-01"))) + # max_version cannot be a vector + expect_error(ea %>% as_of(c(as.Date("2020-01-01"), as.Date("2020-01-02")))) +}) + +test_that("Warning against max_version being clobberable", { + # none by default + expect_warning(regexp = NA, ea %>% as_of(max_version = max(ea$DT$version))) + expect_warning(regexp = NA, ea %>% as_of(max_version = min(ea$DT$version))) + # but with `clobberable_versions_start` non-`NA`, yes + ea_with_clobberable <- ea %>% clone() + ea_with_clobberable$clobberable_versions_start <- max(ea_with_clobberable$DT$version) + expect_warning(ea_with_clobberable %>% as_of(max_version = max(ea$DT$version))) + expect_warning(regexp = NA, ea_with_clobberable %>% as_of(max_version = min(ea$DT$version))) +}) + +test_that("as_of properly grabs the data and doesn't mutate key", { + d <- as.Date("2020-06-01") + + ea2 <- ea2_data %>% + as_epi_archive2() + + old_key <- data.table::key(ea2$DT) + + edf_as_of <- ea2 %>% + epix_as_of2(max_version = as.Date("2020-06-03")) + + edf_expected <- as_epi_df(tibble( + geo_value = "ca", + time_value = d + 0:2, + cases = c(2, 1, 1) + ), as_of = as.Date("2020-06-03")) + + expect_equal(edf_as_of, edf_expected, ignore_attr = c(".internal.selfref", "sorted")) + expect_equal(data.table::key(ea2$DT), old_key) +}) + +test_that("Errors are thrown due to bad epix_truncate_versions_after inputs", { + # x must be an archive + expect_error(epix_truncate_versions_after(data.frame(), as.Date("2020-01-01"))) + # max_version cannot be of string class rather than date class + expect_error(epix_truncate_versions_after(ea, "2020-01-01")) + # max_version cannot be a vector + expect_error(epix_truncate_versions_after(ea, c(as.Date("2020-01-01"), as.Date("2020-01-02")))) + # max_version cannot be missing + expect_error(epix_truncate_versions_after(ea, as.Date(NA))) + # max_version cannot be after latest version in archive + expect_error(epix_truncate_versions_after(ea, as.Date("2025-01-01"))) +}) + +test_that("epix_truncate_version_after properly grabs the data and doesn't mutate key", { + ea2 <- ea2_data %>% + as_epi_archive2() + + old_key <- data.table::key(ea2$DT) + + ea_as_of <- ea2 %>% + epix_truncate_versions_after(max_version = as.Date("2020-06-02")) + + ea_expected <- ea2_data[1:3, ] %>% + as_epi_archive2() + + expect_equal(ea_as_of, ea_expected, ignore_attr = c(".internal.selfref", "sorted")) + expect_equal(data.table::key(ea2$DT), old_key) +}) + +test_that("epix_truncate_version_after doesn't filter if max_verion at latest version", { + ea2 <- ea2_data %>% + as_epi_archive2() + + ea_expected <- ea2 %>% clone() + + ea_as_of <- ea2 %>% + epix_truncate_versions_after(max_version = as.Date("2020-06-04")) + expect_equal(ea_as_of, ea_expected, ignore_attr = c(".internal.selfref", "sorted")) +}) + +test_that("epix_truncate_version_after returns the same grouping type as input epi_archive", { + ea2 <- ea2_data %>% + as_epi_archive2() + + ea_as_of <- ea2 %>% + epix_truncate_versions_after(max_version = as.Date("2020-06-04")) + expect_true(is_epi_archive2(ea_as_of, grouped_okay = FALSE)) + + ea2_grouped <- ea2 %>% group_by(geo_value) + + ea_as_of <- ea2_grouped %>% + epix_truncate_versions_after(max_version = as.Date("2020-06-04")) + expect_true(is_grouped_epi_archive2(ea_as_of)) +}) + + +test_that("epix_truncate_version_after returns the same groups as input grouped_epi_archive", { + ea2 <- ea2_data %>% + as_epi_archive2() + ea2 <- ea2 %>% group_by(geo_value) + + ea_expected <- ea2 %>% clone() + + ea_as_of <- ea2 %>% + epix_truncate_versions_after(max_version = as.Date("2020-06-04")) + expect_equal(ea_as_of %>% groups(), ea_expected %>% groups()) +}) diff --git a/vignettes/advanced.Rmd b/vignettes/advanced.Rmd index d4fad3e7..c010c1f3 100644 --- a/vignettes/advanced.Rmd +++ b/vignettes/advanced.Rmd @@ -111,17 +111,17 @@ edf %>% edf %>% # pretend that observations about time_value t are reported in version t (nowcasts) mutate(version = time_value) %>% - as_epi_archive() %>% + as_epi_archive2() %>% group_by(geo_value) %>% - epix_slide(x_2dav = mean(x), before = 1, ref_time_values = as.Date("2020-06-02")) %>% + epix_slide2(x_2dav = mean(x), before = 1, ref_time_values = as.Date("2020-06-02")) %>% ungroup() edf %>% # pretend that observations about time_value t are reported in version t (nowcasts) mutate(version = time_value) %>% - as_epi_archive() %>% + as_epi_archive2() %>% group_by(geo_value) %>% - epix_slide(~ mean(.x$x), before = 1, ref_time_values = as.Date("2020-06-02")) %>% + epix_slide2(~ mean(.x$x), before = 1, ref_time_values = as.Date("2020-06-02")) %>% ungroup() ``` @@ -219,9 +219,9 @@ edf %>% edf %>% mutate(version = time_value) %>% - as_epi_archive() %>% + as_epi_archive2() %>% group_by(geo_value) %>% - epix_slide( + epix_slide2( a = data.frame(x_2dav = mean(x), x_2dma = mad(x)), ref_time_values = as.Date("2020-06-02"), before = 1, as_list_col = FALSE, names_sep = NULL @@ -317,16 +317,17 @@ x <- y1 %>% version = issue, percent_cli = value ) %>% - as_epi_archive(compactify = FALSE) + as_epi_archive2(compactify = FALSE) # mutating merge operation: -x$merge( +x <- epix_merge2( + x, y2 %>% select(geo_value, time_value, version = issue, case_rate_7d_av = value ) %>% - as_epi_archive(compactify = FALSE), + as_epi_archive2(compactify = FALSE), sync = "locf", compactify = FALSE ) @@ -337,9 +338,9 @@ library(data.table) library(ggplot2) theme_set(theme_bw()) -x <- archive_cases_dv_subset$DT %>% +x <- archive_cases_dv_subset_2$DT %>% filter(geo_value %in% c("ca", "fl")) %>% - as_epi_archive(compactify = FALSE) + as_epi_archive2(compactify = FALSE) ``` Next, we extend the ARX function to handle multiple geo values, since in the @@ -457,7 +458,7 @@ data. ```{r, message = FALSE, warning = FALSE, fig.width = 9, fig.height = 6} # Latest snapshot of data, and forecast dates -x_latest <- epix_as_of(x, max_version = max(x$DT$version)) +x_latest <- epix_as_of2(x, max_version = max(x$DT$version)) fc_time_values <- seq(as.Date("2020-08-01"), as.Date("2021-11-30"), by = "1 month" @@ -467,7 +468,7 @@ fc_time_values <- seq(as.Date("2020-08-01"), k_week_ahead <- function(x, ahead = 7, as_of = TRUE) { if (as_of) { x %>% - epix_slide( + epix_slide2( fc = prob_arx(.data$percent_cli, .data$case_rate_7d_av, .data$geo_value, .data$time_value, args = prob_arx_args(ahead = ahead) ), diff --git a/vignettes/archive.Rmd b/vignettes/archive.Rmd index fdb0e3c6..0b57d639 100644 --- a/vignettes/archive.Rmd +++ b/vignettes/archive.Rmd @@ -76,16 +76,16 @@ the compactify vignette. ```{r, eval=FALSE} x <- dv %>% select(geo_value, time_value, version = issue, percent_cli = value) %>% - as_epi_archive(compactify = TRUE) + as_epi_archive2(compactify = TRUE) class(x) print(x) ``` ```{r, echo=FALSE, message=FALSE, warning=FALSE} -x <- archive_cases_dv_subset$DT %>% +x <- archive_cases_dv_subset_2$DT %>% select(geo_value, time_value, version, percent_cli) %>% - as_epi_archive(compactify = TRUE) + as_epi_archive2(compactify = TRUE) class(x) print(x) @@ -154,7 +154,7 @@ function `epix_as_of()` since this is likely a more familiar interface for users not familiar with R6 (or object-oriented programming). ```{r} -x_snapshot <- epix_as_of(x, max_version = as.Date("2021-06-01")) +x_snapshot <- epix_as_of2(x, max_version = as.Date("2021-06-01")) class(x_snapshot) head(x_snapshot) max(x_snapshot$time_value) @@ -174,7 +174,7 @@ this case, since updates to the current version may still come in at a later point in time, due to various reasons, such as synchronization issues. ```{r} -x_latest <- epix_as_of(x, max_version = max(x$DT$version)) +x_latest <- epix_as_of2(x, max_version = max(x$DT$version)) ``` Below, we pull several snapshots from the archive, spaced one month apart. We @@ -188,7 +188,7 @@ theme_set(theme_bw()) self_max <- max(x$DT$version) versions <- seq(as.Date("2020-06-01"), self_max - 1, by = "1 month") snapshots <- map_dfr(versions, function(v) { - epix_as_of(x, max_version = v) %>% mutate(version = v) + epix_as_of2(x, max_version = v) %>% mutate(version = v) }) %>% bind_rows( x_latest %>% mutate(version = self_max) @@ -258,15 +258,15 @@ y <- pub_covidcast( issues = epirange(20200601, 20211201) ) %>% select(geo_value, time_value, version = issue, case_rate_7d_av = value) %>% - as_epi_archive(compactify = TRUE) + as_epi_archive2(compactify = TRUE) -x$merge(y, sync = "locf", compactify = FALSE) +x <- epix_merge2(x, y, sync = "locf", compactify = TRUE) print(x) head(x$DT) ``` ```{r, echo=FALSE, message=FALSE, warning=FALSE} -x <- archive_cases_dv_subset +x <- archive_cases_dv_subset_2 print(x) head(x$DT) ``` @@ -362,7 +362,7 @@ fc_time_values <- seq(as.Date("2020-08-01"), z <- x %>% group_by(geo_value) %>% - epix_slide( + epix_slide2( fc = prob_arx(x = percent_cli, y = case_rate_7d_av), before = 119, ref_time_values = fc_time_values ) %>% @@ -389,14 +389,14 @@ points in time and forecast horizons. The former comes from using `epi_slide()` to the latest snapshot of the data `x_latest`. ```{r, message = FALSE, warning = FALSE, fig.width = 9, fig.height = 6} -x_latest <- epix_as_of(x, max_version = max(x$DT$version)) +x_latest <- epix_as_of2(x, max_version = max(x$DT$version)) # Simple function to produce forecasts k weeks ahead k_week_ahead <- function(x, ahead = 7, as_of = TRUE) { if (as_of) { x %>% group_by(.data$geo_value) %>% - epix_slide( + epix_slide2( fc = prob_arx(.data$percent_cli, .data$case_rate_7d_av, ahead = ahead), before = 119, ref_time_values = fc_time_values ) %>% diff --git a/vignettes/compactify.Rmd b/vignettes/compactify.Rmd index cad065e7..0b68c73b 100644 --- a/vignettes/compactify.Rmd +++ b/vignettes/compactify.Rmd @@ -32,10 +32,10 @@ from the second from the third value included. library(epiprocess) library(dplyr) -dt <- archive_cases_dv_subset$DT +dt <- archive_cases_dv_subset_2$DT -locf_omitted <- as_epi_archive(dt) -locf_included <- as_epi_archive(dt, compactify = FALSE) +locf_omitted <- as_epi_archive2(dt) +locf_included <- as_epi_archive2(dt, compactify = FALSE) head(locf_omitted$DT) head(locf_included$DT) @@ -48,8 +48,8 @@ LOCF-redundant values can mar the performance of dataset operations. As the colu ```{r} dt2 <- select(dt, -percent_cli) -locf_included_2 <- as_epi_archive(dt2, compactify = FALSE) -locf_omitted_2 <- as_epi_archive(dt2, compactify = TRUE) +locf_included_2 <- as_epi_archive2(dt2, compactify = FALSE) +locf_omitted_2 <- as_epi_archive2(dt2, compactify = TRUE) ``` In this example, a huge proportion of the original version update data were