diff --git a/R/archive.R b/R/archive.R index f7a98526..94420898 100644 --- a/R/archive.R +++ b/R/archive.R @@ -16,7 +16,7 @@ #' @details An `epi_archive` is an R6 class which contains a data table `DT`, of #' class `data.table` from the `data.table` package, with (at least) the #' following columns: -#' +#' #' * `geo_value`: the geographic value associated with each row of measurements. #' * `time_value`: the time value associated with each row of measurements. #' * `version`: the time value specifying the version for each row of @@ -31,7 +31,7 @@ #' on `DT` directly). There can only be a single row per unique combination of #' key variables, and thus the key variables are critical for figuring out how #' to generate a snapshot of data from the archive, as of a given version. -#' +#' #' In general, last observation carried forward (LOCF) is used to data in #' between recorded versions. Currently, deletions must be represented as #' revising a row to a special state (e.g., making the entries `NA` or @@ -43,7 +43,7 @@ #' reference semantics. A primary consequence of this is that objects are not #' copied when modified. You can read more about this in Hadley Wickham's #' [Advanced R](https://adv-r.hadley.nz/r6.html#r6-semantics) book. -#' +#' #' @section Metadata: #' The following pieces of metadata are included as fields in an `epi_archive` #' object: @@ -75,7 +75,7 @@ #' sliding computation at any given reference time point t is performed on #' **data that would have been available as of t**. More details on `slide()` #' are documented in the wrapper function `epix_slide()`. -#' +#' #' @importFrom R6 R6Class #' @export epi_archive = @@ -89,7 +89,7 @@ epi_archive = additional_metadata = NULL, #' @description Creates a new `epi_archive` object. #' @param x A data frame, data table, or tibble, with columns `geo_value`, -#' `time_value`, `version`, and then any additional number of columns. +#' `time_value`, `version`, and then any additional number of columns. #' @param geo_type Type for the geo values. If missing, then the function will #' attempt to infer it from the geo values present; if this fails, then it #' will be set to "custom". @@ -102,15 +102,24 @@ epi_archive = #' @param additional_metadata List of additional metadata to attach to the #' `epi_archive` object. The metadata will have `geo_type` and `time_type` #' fields; named entries from the passed list or will be included as well. +#' @param compactify Optional, Boolean: should we remove rows that are +#' considered redundant for the purposes of `epi_archive`'s built-in methods +#' such as `$as_of`? As these methods use the last (version of an) +#' observation carried forward (LOCF) to interpolate between the version data +#' provided, rows that won't change these LOCF results can potentially be +#' omitted to save space. Generally, this can be set to `TRUE`, but if you +#' directly inspect or edit the fields of the `epi_archive` such as the `$DT`, +#' you will have to determine whether `compactify=TRUE` will still produce +#' equivalent results. #' @return An `epi_archive` object. #' @importFrom data.table as.data.table key setkeyv initialize = function(x, geo_type, time_type, other_keys, - additional_metadata) { + additional_metadata, compactify) { # Check that we have a data frame if (!is.data.frame(x)) { Abort("`x` must be a data frame.") } - + # Check that we have geo_value, time_value, version columns if (!("geo_value" %in% names(x))) { Abort("`x` must contain a `geo_value` column.") @@ -121,7 +130,7 @@ epi_archive = if (!("version" %in% names(x))) { Abort("`x` must contain a `version` column.") } - + # If geo type is missing, then try to guess it if (missing(geo_type)) { geo_type = guess_geo_type(x$geo_value) @@ -131,7 +140,7 @@ epi_archive = if (missing(time_type)) { time_type = guess_time_type(x$time_value) } - + # Finish off with small checks on keys variables and metadata if (missing(other_keys)) other_keys = NULL if (missing(additional_metadata)) additional_metadata = list() @@ -145,14 +154,75 @@ epi_archive = c("geo_type", "time_type"))) { Warn("`additional_metadata` names overlap with existing metadata fields \"geo_type\", \"time_type\".") } - + # Finish off with compactify + if (missing(compactify)) { + compactify = NULL + } else if (!rlang::is_bool(compactify) && + !rlang::is_null(compactify)) { + Abort("compactify must be boolean or null.") + } # Create the data table; if x was an un-keyed data.table itself, # then the call to as.data.table() will fail to set keys, so we # need to check this, then do it manually if needed key_vars = c("geo_value", "time_value", other_keys, "version") DT = as.data.table(x, key = key_vars) if (!identical(key_vars, key(DT))) setkeyv(DT, cols = key_vars) - + + # Checks to see if a value in a vector is LOCF + is_locf <- function(vec) { + dplyr::if_else(!is.na(vec) & !is.na(dplyr::lag(vec)), + vec == lag(vec), + is.na(vec) & is.na(dplyr::lag(vec))) + } + + # LOCF is defined by a row where all values except for the version + # differ from their respective lag values + + # Checks for LOCF's in a data frame + rm_locf <- function(df) { + dplyr::filter(df,if_any(c(everything(),-version),~ !is_locf(.))) + } + + # Keeps LOCF values, such as to be printed + keep_locf <- function(df) { + dplyr::filter(df,if_all(c(everything(),-version),~ is_locf(.))) + } + + # Runs compactify on data frame + if (is.null(compactify) || compactify == TRUE) { + elim = keep_locf(DT) + DT = rm_locf(DT) + } else { + # Create empty data frame for nrow(elim) to be 0 + elim = tibble() + } + + # Warns about redundant rows + if (is.null(compactify) && nrow(elim) > 0) { + warning_intro <- paste("LOCF rows found;", + "these have been removed: \n") + + # elim size capped at 6 + len <- nrow(elim) + elim <- elim[1:min(6,len),] + + warning_data <- paste(collapse="\n",capture.output(print(elim))) + + warning_message <- paste(warning_intro,warning_data) + if (len > 6) { + warning_message <- paste0(warning_message,"\n", + "Only the first 6 LOCF rows are ", + "printed. There are more than 6 LOCF", + " rows.") + } + + warning_message <- paste0(warning_message,"\n", + "To disable warning but still remove ", + "LOCF rows, set compactify=FALSE.") + + rlang::warn(warning_message) + } + # Instantiate all self variables self$DT = DT self$geo_type = geo_type @@ -163,8 +233,8 @@ epi_archive = cat("An `epi_archive` object, with metadata:\n") cat(sprintf("* %-9s = %s\n", "geo_type", self$geo_type)) cat(sprintf("* %-9s = %s\n", "time_type", self$time_type)) - if (!is.null(self$additional_metadata)) { - sapply(self$additional_metadata, function(m) { + if (!is.null(self$additional_metadata)) { + sapply(self$additional_metadata, function(m) { cat(sprintf("* %-9s = %s\n", names(m), m)) }) } @@ -178,12 +248,18 @@ epi_archive = cat(sprintf("* %-14s = %s\n", "max version", max(self$DT$version))) cat("----------\n") - cat(sprintf("Data archive (stored in DT field): %i x %i\n", + cat(sprintf("Data archive (stored in DT field): %i x %i\n", nrow(self$DT), ncol(self$DT))) cat("----------\n") - cat(sprintf("Public methods: %s", + cat(sprintf("Columns in DT: %s\n", paste(ifelse(length( + colnames(self$DT)) <= 4, paste(colnames(self$DT), collapse = ", "), + paste(paste(colnames(self$DT)[1:4], collapse = ", "), "and", + length(colnames(self$DT)[5:length(colnames(self$DT))]), "more columns"))))) + cat("----------\n") + cat(sprintf("Public methods: %s\n", paste(names(epi_archive$public_methods), - collapse = ", "))) + collapse = ", ")),"\n") + }, ##### #' @description Generates a snapshot in `epi_df` format as of a given version. @@ -195,7 +271,7 @@ epi_archive = other_keys = setdiff(key(self$DT), c("geo_value", "time_value", "version")) if (length(other_keys) == 0) other_keys = NULL - + # Check a few things on max_version if (!identical(class(max_version), class(self$DT$version))) { Abort("`max_version` and `DT$version` must have same class.") @@ -209,17 +285,17 @@ epi_archive = if (max_version == self_max) { Warn("Getting data as of the latest version possible. For a variety of reasons, it is possible that we only have a preliminary picture of this version (e.g., the upstream source has updated it but we have not seen it due to latency in synchronization). Thus, the snapshot that we produce here might not be reproducible at a later time (e.g., when the archive has caught up in terms of synchronization).") } - + # Filter by version and return return( - # Make sure to use data.table ways of filtering and selecting + # Make sure to use data.table ways of filtering and selecting self$DT[between(time_value, min_time_value, max_version) & version <= max_version, ] %>% unique(by = c("geo_value", "time_value", other_keys), fromLast = TRUE) %>% - tibble::as_tibble() %>% + tibble::as_tibble() %>% dplyr::select(-.data$version) %>% as_epi_df(geo_type = self$geo_type, time_type = self$time_type, @@ -227,7 +303,7 @@ epi_archive = additional_metadata = c(self$additional_metadata, other_keys = other_keys)) ) - }, + }, ##### #' @description Merges another `data.table` with the current one, and allows for #' a post-filling of `NA` values by last observation carried forward (LOCF). @@ -236,7 +312,7 @@ epi_archive = merge = function(y, ..., locf = TRUE, nan = NA) { # Check we have a `data.table` object if (!(inherits(y, "data.table") || inherits(y, "epi_archive"))) { - Abort("`y` must be of class `data.table` or `epi_archive`.") + Abort("`y` must be of class `data.table` or `epi_archive`.") } # Use the data.table merge function, carrying through ... args @@ -251,25 +327,25 @@ epi_archive = # Important: use nafill and not setnafill because the latter # returns the entire data frame by reference, and the former can - # be set to act on particular columns by reference using := + # be set to act on particular columns by reference using := self$DT[, - (cols) := nafill(.SD, type = "locf", nan = nan), - .SDcols = cols, + (cols) := nafill(.SD, type = "locf", nan = nan), + .SDcols = cols, by = by] } - }, + }, ##### #' @description Slides a given function over variables in an `epi_archive` #' object. See the documentation for the wrapper function `epix_as_of()` for -#' details. +#' details. #' @importFrom data.table key #' @importFrom rlang !! !!! enquo enquos is_quosure sym syms - slide = function(f, ..., n = 7, group_by, ref_time_values, + slide = function(f, ..., n = 7, group_by, ref_time_values, time_step, new_col_name = "slide_value", as_list_col = FALSE, names_sep = "_", - all_rows = FALSE) { + all_rows = FALSE) { # If missing, then set ref time values to be everything; else make - # sure we intersect with observed time values + # sure we intersect with observed time values if (missing(ref_time_values)) { ref_time_values = unique(self$DT$time_value) } @@ -277,16 +353,16 @@ epi_archive = ref_time_values = ref_time_values[ref_time_values %in% unique(self$DT$time_value)] } - - # If a custom time step is specified, then redefine units + + # If a custom time step is specified, then redefine units before_num = n-1 if (!missing(time_step)) before_num = time_step(n-1) - + # What to group by? If missing, set according to internal keys if (missing(group_by)) { group_by = setdiff(key(self$DT), c("time_value", "version")) } - + # Symbolize column name, defuse grouping variables. We have to do # the middle step here which is a bit complicated (unfortunately) # since the function epix_slide() could have called the current one, @@ -298,20 +374,20 @@ epi_archive = # Key variable names, apart from time value and version key_vars = setdiff(key(self$DT), c("time_value", "version")) - + # Computation for one group, one time value comp_one_grp = function(.data_group, - f, ..., + f, ..., time_value, key_vars, new_col) { - # Carry out the specified computation + # Carry out the specified computation comp_value = f(.data_group, ...) # Count the number of appearances of the reference time value. # Note: ideally, we want to directly count occurrences of the ref # time value but due to latency, this will often not appear in the - # data group. So we count the number of unique key values, outside + # data group. So we count the number of unique key values, outside # of the time value column count = sum(!duplicated(.data_group[, key_vars])) @@ -345,23 +421,23 @@ epi_archive = else { Abort("The slide computation must return an atomic vector or a data frame.") } - + # Note that we've already recycled comp value to make size stable, # so tibble() will just recycle time value appropriately - return(tibble::tibble(time_value = time_value, + return(tibble::tibble(time_value = time_value, !!new_col := comp_value)) } - + # If f is not missing, then just go ahead, slide by group if (!missing(f)) { if (rlang::is_formula(f)) f = rlang::as_function(f) - + x = purrr::map_dfr(ref_time_values, function(t) { self$as_of(t, min_time_value = t - before_num) %>% - tibble::as_tibble() %>% + tibble::as_tibble() %>% dplyr::group_by(!!!group_by) %>% dplyr::group_modify(comp_one_grp, - f = f, ..., + f = f, ..., time_value = t, key_vars = key_vars, new_col = new_col, @@ -379,14 +455,14 @@ epi_archive = if (length(quos) > 1) { Abort("If `f` is missing then only a single computation can be specified via `...`.") } - + quo = quos[[1]] f = function(x, quo, ...) rlang::eval_tidy(quo, x) new_col = sym(names(rlang::quos_auto_name(quos))) x = purrr::map_dfr(ref_time_values, function(t) { self$as_of(t, min_time_value = t - before_num) %>% - tibble::as_tibble() %>% + tibble::as_tibble() %>% dplyr::group_by(!!!group_by) %>% dplyr::group_modify(comp_one_grp, f = f, quo = quo, @@ -397,12 +473,12 @@ epi_archive = dplyr::ungroup() }) } - + # Unnest if we need to if (!as_list_col) { x = tidyr::unnest(x, !!new_col, names_sep = names_sep) } - + # Join to get all rows, if we need to, then return if (all_rows) { cols = c(as.character(group_by), "time_value") @@ -413,7 +489,7 @@ epi_archive = } ) ) - + #' Convert to `epi_archive` format #' #' Converts a data frame, data table, or tibble into an `epi_archive` @@ -435,6 +511,9 @@ epi_archive = #' @param additional_metadata List of additional metadata to attach to the #' `epi_archive` object. The metadata will have `geo_type` and `time_type` #' fields; named entries from the passed list or will be included as well. +#' @param compactify By default, removes LOCF rows and warns the user about +#' them. Optionally, one can input a Boolean: TRUE eliminates LOCF rows, +#' while FALSE keeps them. #' @return An `epi_archive` object. #' #' @details This simply a wrapper around the `new()` method of the `epi_archive` @@ -466,15 +545,16 @@ epi_archive = #' time_type = "day", #' other_keys = "county") as_epi_archive = function(x, geo_type, time_type, other_keys, - additional_metadata = list()) { - epi_archive$new(x, geo_type, time_type, other_keys, additional_metadata) + additional_metadata = list(),compactify = NULL) { + epi_archive$new(x, geo_type, time_type, other_keys, additional_metadata, + compactify) } #' Test for `epi_archive` format #' #' @param x An object. #' @return `TRUE` if the object inherits from `epi_archive`. -#' +#' #' @export #' @examples #' is_epi_archive(jhu_csse_daily_subset) # FALSE (this is an epi_df, not epi_archive) diff --git a/R/slide.R b/R/slide.R index 5847b130..2323107a 100644 --- a/R/slide.R +++ b/R/slide.R @@ -84,7 +84,27 @@ #' tidy evaluation (first example, above), then the name for the new column is #' inferred from the given expression and overrides any name passed explicitly #' through the `new_col_name` argument. -#' +#' +#' When `f` is a named function with arguments, if a tibble with an unnamed +#' grouping variable is passed in as the method argument to `f`, include a +#' parameter for the grouping-variable in `function()` just prior to +#' specifying the method to prevent that from being overridden. For example: +#' ``` +#' # Construct an tibble with an unnamed grouping variable +#' edf = bind_rows(tibble(geo_value = "ak", time_value = as.Date("2020-01-01") +#' + 1:10, x1=1:10, y=1:10 + rnorm(10L))) %>% +#' as_epi_df() +#' +#' # Now, include a row parameter for the grouping variable in the tibble, +#' # which we denote as g, just prior to method = "qr" +#' # Note that if g was not included below, then the method = "qr" would be +#' # overridden, as described above +#' edf %>% +#' group_by(geo_value) %>% +#' epi_slide(function(x, g, method="qr", ...) tibble(model=list( +#' lm(y ~ x1, x, method=method))), n=7L) +#' ``` +#' #' @importFrom lubridate days weeks #' @importFrom rlang .data .env !! enquo enquos sym #' @export @@ -121,7 +141,7 @@ epi_slide = function(x, f, ..., n = 7, ref_time_values, # intersect with observed time values if (missing(ref_time_values)) { ref_time_values = unique(x$time_value) - } + } else { ref_time_values = ref_time_values[ref_time_values %in% unique(x$time_value)] @@ -164,6 +184,10 @@ epi_slide = function(x, f, ..., n = 7, ref_time_values, time_range = range(unique(x$time_value)) starts = in_range(ref_time_values - before_num, time_range) stops = in_range(ref_time_values + after_num, time_range) + + if( length(starts) == 0 || length(stops) == 0 ) { + Abort("The starting and/or stopping times for sliding are out of bounds with respect to the range of times in your data. Check your settings for ref_time_values and align (and before, if specified).") + } # Symbolize new column name new_col = sym(new_col_name) diff --git a/man/as_epi_archive.Rd b/man/as_epi_archive.Rd index a95550be..ba4f5c38 100644 --- a/man/as_epi_archive.Rd +++ b/man/as_epi_archive.Rd @@ -9,7 +9,8 @@ as_epi_archive( geo_type, time_type, other_keys, - additional_metadata = list() + additional_metadata = list(), + compactify = NULL ) } \arguments{ @@ -31,6 +32,10 @@ apart from "geo_value", "time_value", and "version".} \item{additional_metadata}{List of additional metadata to attach to the \code{epi_archive} object. The metadata will have \code{geo_type} and \code{time_type} fields; named entries from the passed list or will be included as well.} + +\item{compactify}{By default, removes LOCF rows and warns the user about +them. Optionally, one can input a Boolean: TRUE eliminates LOCF rows, +while FALSE keeps them.} } \value{ An \code{epi_archive} object. @@ -42,11 +47,15 @@ examples. } \details{ This simply a wrapper around the \code{new()} method of the \code{epi_archive} -class, so for example:\preformatted{x <- as_epi_archive(df, geo_type = "state", time_type = "day") -} +class, so for example: -would be equivalent to:\preformatted{x <- epi_archive$new(df, geo_type = "state", time_type = "day") -} +\if{html}{\out{
}}\preformatted{x <- as_epi_archive(df, geo_type = "state", time_type = "day") +}\if{html}{\out{
}} + +would be equivalent to: + +\if{html}{\out{
}}\preformatted{x <- epi_archive$new(df, geo_type = "state", time_type = "day") +}\if{html}{\out{
}} } \examples{ df <- data.frame (geo_value = c(replicate(2, "ca"), replicate(2, "fl")), diff --git a/man/epi_archive.Rd b/man/epi_archive.Rd index fff4e714..161f4a85 100644 --- a/man/epi_archive.Rd +++ b/man/epi_archive.Rd @@ -84,21 +84,28 @@ are documented in the wrapper function \code{epix_slide()}. \section{Methods}{ \subsection{Public methods}{ \itemize{ -\item \href{#method-new}{\code{epi_archive$new()}} -\item \href{#method-print}{\code{epi_archive$print()}} -\item \href{#method-as_of}{\code{epi_archive$as_of()}} -\item \href{#method-merge}{\code{epi_archive$merge()}} -\item \href{#method-slide}{\code{epi_archive$slide()}} -\item \href{#method-clone}{\code{epi_archive$clone()}} +\item \href{#method-epi_archive-new}{\code{epi_archive$new()}} +\item \href{#method-epi_archive-print}{\code{epi_archive$print()}} +\item \href{#method-epi_archive-as_of}{\code{epi_archive$as_of()}} +\item \href{#method-epi_archive-merge}{\code{epi_archive$merge()}} +\item \href{#method-epi_archive-slide}{\code{epi_archive$slide()}} +\item \href{#method-epi_archive-clone}{\code{epi_archive$clone()}} } } \if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-new}{}}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-epi_archive-new}{}}} \subsection{Method \code{new()}}{ Creates a new \code{epi_archive} object. \subsection{Usage}{ -\if{html}{\out{
}}\preformatted{epi_archive$new(x, geo_type, time_type, other_keys, additional_metadata)}\if{html}{\out{
}} +\if{html}{\out{
}}\preformatted{epi_archive$new( + x, + geo_type, + time_type, + other_keys, + additional_metadata, + compactify +)}\if{html}{\out{
}} } \subsection{Arguments}{ @@ -122,6 +129,16 @@ apart from "geo_value", "time_value", and "version".} \item{\code{additional_metadata}}{List of additional metadata to attach to the \code{epi_archive} object. The metadata will have \code{geo_type} and \code{time_type} fields; named entries from the passed list or will be included as well.} + +\item{\code{compactify}}{Optional, Boolean: should we remove rows that are +considered redundant for the purposes of \code{epi_archive}'s built-in methods +such as \verb{$as_of}? As these methods use the last (version of an) +observation carried forward (LOCF) to interpolate between the version data +provided, rows that won't change these LOCF results can potentially be +omitted to save space. Generally, this can be set to \code{TRUE}, but if you +directly inspect or edit the fields of the \code{epi_archive} such as the \verb{$DT}, +you will have to determine whether \code{compactify=TRUE} will still produce +equivalent results.} } \if{html}{\out{}} } @@ -130,8 +147,8 @@ An \code{epi_archive} object. } } \if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-print}{}}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-epi_archive-print}{}}} \subsection{Method \code{print()}}{ \subsection{Usage}{ \if{html}{\out{
}}\preformatted{epi_archive$print()}\if{html}{\out{
}} @@ -139,8 +156,8 @@ An \code{epi_archive} object. } \if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-as_of}{}}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-epi_archive-as_of}{}}} \subsection{Method \code{as_of()}}{ Generates a snapshot in \code{epi_df} format as of a given version. See the documentation for the wrapper function \code{epix_as_of()} for details. @@ -150,8 +167,8 @@ See the documentation for the wrapper function \code{epix_as_of()} for details. } \if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-merge}{}}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-epi_archive-merge}{}}} \subsection{Method \code{merge()}}{ Merges another \code{data.table} with the current one, and allows for a post-filling of \code{NA} values by last observation carried forward (LOCF). @@ -162,8 +179,8 @@ See the documentation for the wrapper function \code{epix_merge()} for details. } \if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-slide}{}}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-epi_archive-slide}{}}} \subsection{Method \code{slide()}}{ Slides a given function over variables in an \code{epi_archive} object. See the documentation for the wrapper function \code{epix_as_of()} for @@ -185,8 +202,8 @@ details. } \if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-clone}{}}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-epi_archive-clone}{}}} \subsection{Method \code{clone()}}{ The objects of this class are cloneable with this method. \subsection{Usage}{ diff --git a/man/epi_slide.Rd b/man/epi_slide.Rd index 2e737293..903cb017 100644 --- a/man/epi_slide.Rd +++ b/man/epi_slide.Rd @@ -104,17 +104,41 @@ incomplete windows) is therefore left up to the user, either through the specified function or formula \code{f}, or through post-processing. If \code{f} is missing, then an expression for tidy evaluation can be specified, -for example, as in:\preformatted{epi_slide(x, cases_7dav = mean(cases), n = 7) -} +for example, as in: + +\if{html}{\out{
}}\preformatted{epi_slide(x, cases_7dav = mean(cases), n = 7) +}\if{html}{\out{
}} -which would be equivalent to:\preformatted{epi_slide(x, function(x, ...) mean(x$cases), n = 7, +which would be equivalent to: + +\if{html}{\out{
}}\preformatted{epi_slide(x, function(x, ...) mean(x$cases), n = 7, new_col_name = "cases_7dav") -} +}\if{html}{\out{
}} Thus, to be clear, when the computation is specified via an expression for tidy evaluation (first example, above), then the name for the new column is inferred from the given expression and overrides any name passed explicitly through the \code{new_col_name} argument. + +When \code{f} is a named function with arguments, if a tibble with an unnamed +grouping variable is passed in as the method argument to \code{f}, include a +parameter for the grouping-variable in \verb{function()} just prior to +specifying the method to prevent that from being overridden. For example: + +\if{html}{\out{
}}\preformatted{# Construct an tibble with an unnamed grouping variable +edf = bind_rows(tibble(geo_value = "ak", time_value = as.Date("2020-01-01") + + 1:10, x1=1:10, y=1:10 + rnorm(10L))) \%>\% + as_epi_df() + +# Now, include a row parameter for the grouping variable in the tibble, +# which we denote as g, just prior to method = "qr" +# Note that if g was not included below, then the method = "qr" would be +# overridden, as described above +edf \%>\% +group_by(geo_value) \%>\% +epi_slide(function(x, g, method="qr", ...) tibble(model=list( + lm(y ~ x1, x, method=method))), n=7L) +}\if{html}{\out{
}} } \examples{ # slide a 7-day trailing average formula on cases diff --git a/man/epix_as_of.Rd b/man/epix_as_of.Rd index 658e7169..b5d5969c 100644 --- a/man/epix_as_of.Rd +++ b/man/epix_as_of.Rd @@ -29,11 +29,15 @@ examples. } \details{ This is simply a wrapper around the \code{as_of()} method of the -\code{epi_archive} class, so if \code{x} is an \code{epi_archive} object, then:\preformatted{epix_as_of(x, max_version = v) -} +\code{epi_archive} class, so if \code{x} is an \code{epi_archive} object, then: -is equivalent to:\preformatted{x$as_of(max_version = v) -} +\if{html}{\out{
}}\preformatted{epix_as_of(x, max_version = v) +}\if{html}{\out{
}} + +is equivalent to: + +\if{html}{\out{
}}\preformatted{x$as_of(max_version = v) +}\if{html}{\out{
}} } \examples{ # warning message of data latency shown diff --git a/man/epix_merge.Rd b/man/epix_merge.Rd index 781ef6fe..3d1b2e1c 100644 --- a/man/epix_merge.Rd +++ b/man/epix_merge.Rd @@ -35,11 +35,15 @@ examples. } \details{ This is simply a wrapper around the \code{merge()} method of the -\code{epi_archive} class, so if \code{x} and \code{y} are an \code{epi_archive} objects, then:\preformatted{epix_merge(x, y) -} +\code{epi_archive} class, so if \code{x} and \code{y} are an \code{epi_archive} objects, then: -is equivalent to:\preformatted{x$merge(y) -} +\if{html}{\out{
}}\preformatted{epix_merge(x, y) +}\if{html}{\out{
}} + +is equivalent to: + +\if{html}{\out{
}}\preformatted{x$merge(y) +}\if{html}{\out{
}} } \examples{ # create two example epi_archive datasets diff --git a/man/epix_slide.Rd b/man/epix_slide.Rd index f01a0a71..b6f7a323 100644 --- a/man/epix_slide.Rd +++ b/man/epix_slide.Rd @@ -115,11 +115,15 @@ should never be used in place of \code{epi_slide()}, and only used when version-aware sliding is necessary (as it its purpose). Finally, this is simply a wrapper around the \code{slide()} method of the -\code{epi_archive} class, so if \code{x} is an \code{epi_archive} object, then:\preformatted{epix_slide(x, new_var = comp(old_var), n = 120) -} +\code{epi_archive} class, so if \code{x} is an \code{epi_archive} object, then: -is equivalent to:\preformatted{x$slide(x, new_var = comp(old_var), n = 120) -} +\if{html}{\out{
}}\preformatted{epix_slide(x, new_var = comp(old_var), n = 120) +}\if{html}{\out{
}} + +is equivalent to: + +\if{html}{\out{
}}\preformatted{x$slide(x, new_var = comp(old_var), n = 120) +}\if{html}{\out{
}} } \examples{ # these dates are reference time points for the 3 day average sliding window diff --git a/tests/testthat/test-compactify.R b/tests/testthat/test-compactify.R new file mode 100644 index 00000000..8ccdeee5 --- /dev/null +++ b/tests/testthat/test-compactify.R @@ -0,0 +1,84 @@ +library(epiprocess) +library(data.table) +library(dplyr) + +dt <- archive_cases_dv$DT +test_that("Input for compactify must be NULL or a boolean", { + expect_error(as_epi_archive(dv_duplicated,compactify="no")) +}) + +dt <- filter(dt,geo_value == "ca") +dt$percent_cli <- c(1:80) +dt$case_rate <- c(1:80) + +row_replace <- function(row,x,y) { + dt[row,4] <- x + dt[row,5] <- y + dt +} + +# Rows 1 should not be eliminated even if NA +dt <- row_replace(1,NA,NA) # Not LOCF + +# NOTE! We are assuming that there are no NA's in geo_value, time_value, +# and version. Even though compactify may erroneously remove the first row +# if it has all NA's, we are not testing this behaviour for now as this dataset +# has problems beyond the scope of this test + +# Rows 11 and 12 correspond to different time_values +dt <- row_replace(12,11,11) # Not LOCF + +# Rows 20 and 21 only differ in version +dt <- row_replace(21,20,20) # LOCF + +# Rows 21 and 22 only differ in version +dt <- row_replace(22,20,20) # LOCF + +# Row 39 comprises the first NA's +dt <-row_replace(39,NA,NA) # Not LOCF + +# Row 40 has two NA's, just like its lag, row 39 +dt <- row_replace(40,NA,NA) # LOCF + +# Row 62's values already exist in row 15, but row 15 is not a preceding row +dt <- row_replace(62,15,15) # Not LOCF + +# Row 73 only has one value carried over +dt <- row_replace(74,73,74) # Not LOCF + +dt_true <- as_tibble(as_epi_archive(dt,compactify=TRUE)$DT) +dt_false <- as_tibble(as_epi_archive(dt,compactify=FALSE)$DT) +dt_null <- as_tibble(as_epi_archive(dt,compactify=NULL)$DT) + +test_that("Warning for LOCF with compactify as NULL", { + expect_warning(as_epi_archive(dt,compactify=NULL)) +}) + +test_that("No warning when there is no LOCF", { + expect_warning(as_epi_archive(dt[1:10,],compactify=NULL),NA) +}) + +test_that("LOCF values are ignored with compactify=FALSE", { + expect_identical(nrow(dt),nrow(dt_false)) +}) + +test_that("LOCF values are taken out with compactify=TRUE", { + dt_test <- as_tibble(as_epi_archive(dt[-c(21,22,40),],compactify=FALSE)$DT) + + expect_identical(dt_true,dt_null) + expect_identical(dt_null,dt_test) +}) + +test_that("as_of utilizes LOCF even after removal of LOCF values",{ + ea_true <- as_epi_archive(dt,compactify=TRUE) + ea_false <- as_epi_archive(dt,compactify=FALSE) + + epix_as_of(ea_true,max(ea_true$DT$version)) + + # Row 22, an LOCF row corresponding to the latest version, but for the + # date 2020-06-02, is omitted in ea_true + as_of_true <- ea_true$as_of(max(ea_true$DT$version)) + as_of_false <- ea_false$as_of(max(ea_false$DT$version)) + + expect_identical(as_of_true,as_of_false) +}) diff --git a/tests/testthat/test-epi_slide.R b/tests/testthat/test-epi_slide.R new file mode 100644 index 00000000..71a180c4 --- /dev/null +++ b/tests/testthat/test-epi_slide.R @@ -0,0 +1,34 @@ +## Create an epi. df and a function to test epi_slide with + +edf = dplyr::bind_rows( + dplyr::tibble(geo_value = "ak", time_value = as.Date("2020-01-01") + 1:200, value=1:200), + dplyr::tibble(geo_value = "al", time_value=as.Date("2020-01-01") + 1:5, value=-(1:5)) +) %>% + as_epi_df() + +f = function(x, ...) dplyr::tibble(value=mean(x$value), count=length(x$value)) + +## --- These cases generate the error: --- +test_that("`ref_time_values` + `align` that result in no slide data, generate the error", { + expect_error(edf %>% group_by(geo_value) %>% epi_slide(f, n=3L, ref_time_values=as.Date("2020-01-01")), + "starting and/or stopping times for sliding are out of bounds") # before the first, no data in the slide windows + expect_error(edf %>% group_by(geo_value) %>% epi_slide(f, n=3L, ref_time_values=as.Date("2020-01-01")+207L), + "starting and/or stopping times for sliding are out of bounds") # beyond the last, no data in window +}) + +test_that("`ref_time_values` + `align` that have some slide data, but generate the error due to ref. time being out of time range", { + expect_error(edf %>% group_by(geo_value) %>% epi_slide(f, n=3L, ref_time_values=as.Date("2020-01-01"), align="left"), + "starting and/or stopping times for sliding are out of bounds") # before the first, but we'd expect there to be data in the window + expect_error(edf %>% group_by(geo_value) %>% epi_slide(f, n=3L, ref_time_values=as.Date("2020-01-01")+201L), + "starting and/or stopping times for sliding are out of bounds") # beyond the last, but still with data in window +}) + +## --- These cases doesn't generate the error: --- +test_that("these doesn't produce an error; the error appears only if the ref time values are out of the range for every group", { + expect_identical(edf %>% group_by(geo_value) %>% epi_slide(f, n=3L, ref_time_values=as.Date("2020-01-01")+200L) %>% + dplyr::select("geo_value","slide_value_value"), + dplyr::tibble(geo_value = "ak", slide_value_value = 199) %>% group_by(geo_value)) # out of range for one group + expect_identical(edf %>% group_by(geo_value) %>% epi_slide(f, n=3L, ref_time_values=as.Date("2020-01-04")) %>% + dplyr::select("geo_value","slide_value_value"), + dplyr::tibble(geo_value = c("ak", "al"), slide_value_value = c(2, -2)) %>% group_by(geo_value)) # not out of range for either group +}) \ No newline at end of file diff --git a/vignettes/archive.Rmd b/vignettes/archive.Rmd index 0e399357..420e0bda 100644 --- a/vignettes/archive.Rmd +++ b/vignettes/archive.Rmd @@ -66,7 +66,9 @@ has (at least) the following columns: As we can see from the above, the data frame returned by `delphi.epidata::covidcast()` has the columns required for the `epi_archive` format, with `issue` playing the role of `version`. We can now use -`as_epi_archive()` to bring it into `epi_archive` format. +`as_epi_archive()` to bring it into `epi_archive` format. For removal of +LOCF values using `as_epi_archive` using compactify, please refer to the +compactify vignette. ```{r, eval=FALSE} x <- dv %>% @@ -123,7 +125,7 @@ x$DT$percent_cli[1] <- original_value To make a copy, we can use the `clone()` method for an R6 class, as in `y <- x$clone()`. You can read more about reference semantics in Hadley Wickham's [Advanced R](https://adv-r.hadley.nz/r6.html#r6-semantics) book. - + ## Some details on metadata The following pieces of metadata are included as fields in an `epi_archive` @@ -139,6 +141,15 @@ as in `x$geo_type` or `x$time_type`, etc. Just like `as_epi_df()`, the function object is instantiated, if they are not explicitly specified in the function call (as it did in the case above). +Note that `compactify` is **NOT** metadata and is an argument passed when creating +the dataset, without being stored in the end: + +```{r,message=FALSE} +# `dt` here is taken from the tests +as_epi_archive(archive_cases_dv$DT,compactify=TRUE)$geo_type # "state" +as_epi_archive(archive_cases_dv$DT,compactify=TRUE)$compactify # NULL +``` + ## Producing snapshots in `epi_df` form A key method of an `epi_archive` class is `as_of()`, which generates a snapshot diff --git a/vignettes/compactify.Rmd b/vignettes/compactify.Rmd new file mode 100644 index 00000000..3cedf332 --- /dev/null +++ b/vignettes/compactify.Rmd @@ -0,0 +1,117 @@ +--- +title: Compactify to remove LOCF values +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{Compactify to remove LOCF values} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +## Removing LOCF data to save space + +We need not even store rows that look like the last observation carried forward, +as they use up extra space. Furthermore, we already apply LOCF in the +epi_archive-related functions, such that the results should be the same with or +without the rows, aas long as use code does not rely on directly modifying +epi_archive's fields in a way that expects all the original records and breaks +if they are trimmed down. + +There are three +different values that can be assigned to `compactify`: + +* No argument: Does not put in LOCF values, and prints the first six LOCF values +that have been omitted but could have been placed. +* `TRUE`: Does not put in LOCF values, but doesn't print anything relating +to which values have been omitted. +* `FALSE`: Includes LOCF values. + +For this example, we have one chart using LOCF values, while another doesn't +use them to illustrate LOCF. Notice how the head of the first dataset differs +from the second from the third value included. + +```{r} +library(dplyr) + +dt <- archive_cases_dv_subset$DT + +locf_omitted <- as_epi_archive(dt) +locf_included <- as_epi_archive(dt,compactify = FALSE) + +head(locf_omitted$DT) +head(locf_included$DT) +``` + +LOCF can mar the performance of dataset operations. As the column +`case_rate_7d_av` has many more LOCF values than `percent_cli`, we will omit the +`percent_cli` column for comparing performance. + +```{r} +dt2 <- select(dt,-percent_cli) + +locf_included_2 <- as_epi_archive(dt2,compactify=FALSE) +locf_omitted_2 <- as_epi_archive(dt2,compactify=TRUE) +``` + +We can see how large each dataset is to better understand why LOCF uses up +space that may slow down performance. + +```{r} +nrow(locf_included_2$DT) +nrow(locf_omitted_2$DT) +``` + + +As we can see, performing 200 iterations of `dplyr::filter` is faster when the +LOCF values are omitted. + +```{r} +# Performance of filtering +iterate_filter <- function(my_ea) { + for (i in 1:1000) { + filter(my_ea$DT,version >= as.Date("2020-01-01") + i) + } +} + +elapsed_time <- function(fx) c(system.time(fx))[[3]] + +speed_test <- function(f,name) { + data.frame( + operation = name, + locf=elapsed_time(f(locf_included_2)), + no_locf=elapsed_time(f(locf_omitted_2)) + ) +} + +speeds <- speed_test(iterate_filter,"filter_1000x") + +``` + +We would also like to measure the speed of `epi_archive` methods. + +```{r} +# Performance of as_of iterated 200 times +iterate_as_of <- function(my_ea) { + for (i in 1:1000) { + my_ea$as_of(min(my_ea$DT$time_value) + i - 1000) + } +} + +speeds <- rbind(speeds, speed_test(iterate_as_of,"as_of_1000x")) + +# Performance of slide +slide_median <- function(my_ea) { + my_ea$slide(median = median(case_rate_7d_av)) +} + +speeds <- rbind(speeds, speed_test(slide_median,"slide_median")) +``` +Here is a detailed performance comparison: + +```{r} +speeds_tidy <- gather(speeds,key="is_locf",value="time_in_s",locf,no_locf) + +library(ggplot2) + +ggplot(speeds_tidy) + + geom_bar(aes(x=is_locf,y=time_in_s,fill=operation),stat = "identity") +```