From 636f2126fe39b88dcecbc52a266643f0b2e1cd7c Mon Sep 17 00:00:00 2001 From: kenmawer Date: Fri, 13 May 2022 16:08:41 -0700 Subject: [PATCH 01/96] Added some details for compactify --- R/archive.R | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/R/archive.R b/R/archive.R index 9fb7c27b..0dfeaa8a 100644 --- a/R/archive.R +++ b/R/archive.R @@ -102,10 +102,15 @@ epi_archive = #' @param additional_metadata List of additional metadata to attach to the #' `epi_archive` object. The metadata will have `geo_type` and `time_type` #' fields; named entries from the passed list or will be included as well. +#' @param compactify Determines whether redundant rows are removed for last +#' observation carried first (LOCF) results. Set to TRUE to remove these, +#' FALSE to leave as is. Not specifying the argument does the same as TRUE, +#' except it also notifies the user of change in the data, as well as the +#' methods that can be done to silence the message. #' @return An `epi_archive` object. 
#' @importFrom data.table as.data.table key setkeyv initialize = function(x, geo_type, time_type, other_keys, - additional_metadata) { + additional_metadata, compacify) { # Check that we have a data frame if (!is.data.frame(x)) { Abort("`x` must be a data frame.") @@ -132,7 +137,7 @@ epi_archive = time_type = guess_time_type(x$time_value) } - # Finish off with small checks on keys variables and metadata + # Conduct checks on keys variables and metadata if (missing(other_keys)) other_keys = NULL if (missing(additional_metadata)) additional_metadata = list() if (!all(other_keys %in% names(x))) { @@ -145,6 +150,9 @@ epi_archive = c("geo_type", "time_type"))) { Warn("`additional_metadata` names overlap with existing metadata fields \"geo_type\", \"time_type\".") } + + # Finish off with compactify + if (missing(compacify)) compactify = NULL # Create the data table; if x was an un-keyed data.table itself, # then the call to as.data.table() will fail to set keys, so we From 05a4804084b67125b01ea24c71f7b76dc9493c11 Mon Sep 17 00:00:00 2001 From: kenmawer Date: Mon, 16 May 2022 09:23:39 -0700 Subject: [PATCH 02/96] Added compactify variable with check. --- R/archive.R | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/R/archive.R b/R/archive.R index 0dfeaa8a..0c68e954 100644 --- a/R/archive.R +++ b/R/archive.R @@ -152,7 +152,12 @@ epi_archive = } # Finish off with compactify - if (missing(compacify)) compactify = NULL + if (missing(compacify)) { + compactify = NULL + } else if (compactify != TRUE && compactify != FALSE) { + Warn("Non-boolean value inserted for boolean. 
Resetting to default") + compactify = NULL + } # Create the data table; if x was an un-keyed data.table itself, # then the call to as.data.table() will fail to set keys, so we @@ -166,6 +171,7 @@ epi_archive = self$geo_type = geo_type self$time_type = time_type self$additional_metadata = additional_metadata + self$compactify = compactify }, print = function() { cat("An `epi_archive` object, with metadata:\n") From 7760ce4a3ff8db48a790489be39597f87e5b4d4d Mon Sep 17 00:00:00 2001 From: kenmawer Date: Mon, 16 May 2022 11:31:27 -0700 Subject: [PATCH 03/96] Updated function. --- R/archive.R | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/R/archive.R b/R/archive.R index 0c68e954..2b990569 100644 --- a/R/archive.R +++ b/R/archive.R @@ -155,16 +155,27 @@ epi_archive = if (missing(compacify)) { compactify = NULL } else if (compactify != TRUE && compactify != FALSE) { - Warn("Non-boolean value inserted for boolean. Resetting to default") + Warn("Non-boolean value inserted for compactify. Resetting to default") compactify = NULL } - + + # Code for running compactify + comp <- function(df) { + df #stub + } + + # Runs compactify on data frame + if (compactify == TRUE || is.null(compactify)) { + self$comp(df) + } + # Create the data table; if x was an un-keyed data.table itself, # then the call to as.data.table() will fail to set keys, so we # need to check this, then do it manually if needed key_vars = c("geo_value", "time_value", other_keys, "version") DT = as.data.table(x, key = key_vars) if (!identical(key_vars, key(DT))) setkeyv(DT, cols = key_vars) + # Instantiate all self variables self$DT = DT @@ -198,6 +209,10 @@ epi_archive = cat(sprintf("Public methods: %s", paste(names(epi_archive$public_methods), collapse = ", "))) + if (is.null(compactify)) { + cat("----------\n") + cat(sprintf("To avoid warning, please do the following:")) + } }, ##### #' @description Generates a snapshot in `epi_df` format as of a given version. 
From fe1f2d88ea734faa97a1803701d299274c9294c2 Mon Sep 17 00:00:00 2001 From: kenmawer Date: Mon, 16 May 2022 11:31:48 -0700 Subject: [PATCH 04/96] Put !!! to indicate incomplete part. --- R/archive.R | 1 + 1 file changed, 1 insertion(+) diff --git a/R/archive.R b/R/archive.R index 2b990569..9af7e9d5 100644 --- a/R/archive.R +++ b/R/archive.R @@ -212,6 +212,7 @@ epi_archive = if (is.null(compactify)) { cat("----------\n") cat(sprintf("To avoid warning, please do the following:")) + # !!! } }, ##### From 1268bb7870d4db72433eb5220fac314a1ba98a96 Mon Sep 17 00:00:00 2001 From: kenmawer Date: Mon, 16 May 2022 14:10:39 -0700 Subject: [PATCH 05/96] Shortened code --- R/archive.R | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/R/archive.R b/R/archive.R index 9af7e9d5..3a9bb627 100644 --- a/R/archive.R +++ b/R/archive.R @@ -159,14 +159,9 @@ epi_archive = compactify = NULL } - # Code for running compactify - comp <- function(df) { - df #stub - } - # Runs compactify on data frame if (compactify == TRUE || is.null(compactify)) { - self$comp(df) + df # stub } # Create the data table; if x was an un-keyed data.table itself, @@ -176,7 +171,6 @@ epi_archive = DT = as.data.table(x, key = key_vars) if (!identical(key_vars, key(DT))) setkeyv(DT, cols = key_vars) - # Instantiate all self variables self$DT = DT self$geo_type = geo_type @@ -211,8 +205,10 @@ epi_archive = collapse = ", "))) if (is.null(compactify)) { cat("----------\n") - cat(sprintf("To avoid warning, please do the following:")) - # !!! + cat(sprintf("Note: redundant rows found. To remove warning, + set compactify to TRUE or fix these rows")) + # print redundant row numbers + # stub } }, ##### From 56859cbe6c6027bb79548643b48d61f99fe79356 Mon Sep 17 00:00:00 2001 From: kenmawer Date: Tue, 17 May 2022 10:17:38 -0700 Subject: [PATCH 06/96] More code updates, including updates for vignette. 
--- R/archive.R | 14 +++++++------- vignettes/archive.Rmd | 7 +++++++ 2 files changed, 14 insertions(+), 7 deletions(-) diff --git a/R/archive.R b/R/archive.R index 3a9bb627..558ae8d5 100644 --- a/R/archive.R +++ b/R/archive.R @@ -164,6 +164,12 @@ epi_archive = df # stub } + # Warns about redundant rows + if (is.null(compactify)) { + Warn("Note: redundant rows found. To remove warning, + set compactify to TRUE or fix these rows") + } + # Create the data table; if x was an un-keyed data.table itself, # then the call to as.data.table() will fail to set keys, so we # need to check this, then do it manually if needed @@ -203,13 +209,7 @@ epi_archive = cat(sprintf("Public methods: %s", paste(names(epi_archive$public_methods), collapse = ", "))) - if (is.null(compactify)) { - cat("----------\n") - cat(sprintf("Note: redundant rows found. To remove warning, - set compactify to TRUE or fix these rows")) - # print redundant row numbers - # stub - } + }, ##### #' @description Generates a snapshot in `epi_df` format as of a given version. diff --git a/vignettes/archive.Rmd b/vignettes/archive.Rmd index 90997fbc..77c1f826 100644 --- a/vignettes/archive.Rmd +++ b/vignettes/archive.Rmd @@ -117,6 +117,9 @@ object: * `geo_type`: the type for the geo values. * `time_type`: the type for the time values. * `additional_metadata`: list of additional metadata for the data archive. +* `compactify`: TRUE eliminates redundant entries using LOCF for efficiency; +no entry also brings up a message to indicate which columns should be removed +for efficiency; and FALSE does not eliminate redundant entries Metadata for an `epi_archive` object `x` can be accessed (and altered) directly, as in `x$geo_type` or `x$time_type`, etc. Just like `as_epi_df()`, the function @@ -199,6 +202,10 @@ quite as dramatically. 
Modeling the revision process, which is often called + +Finally, we can use compactify to remove variables that match the LOCF (last +observation carried forward) + ## Merging `epi_archive` objects From bd4daf35ccb73a4740d8e16ff9de589be1631512 Mon Sep 17 00:00:00 2001 From: kenmawer Date: Tue, 17 May 2022 15:30:17 -0700 Subject: [PATCH 07/96] Put in new changes to vignette. --- man/epi_archive.Rd | 15 ++++++++++++++- vignettes/archive.Rmd | 22 ++++++++++++++++++---- 2 files changed, 32 insertions(+), 5 deletions(-) diff --git a/man/epi_archive.Rd b/man/epi_archive.Rd index fff4e714..585cb9eb 100644 --- a/man/epi_archive.Rd +++ b/man/epi_archive.Rd @@ -98,7 +98,14 @@ are documented in the wrapper function \code{epix_slide()}. \subsection{Method \code{new()}}{ Creates a new \code{epi_archive} object. \subsection{Usage}{ -\if{html}{\out{
}}\preformatted{epi_archive$new(x, geo_type, time_type, other_keys, additional_metadata)}\if{html}{\out{
}} +\if{html}{\out{
}}\preformatted{epi_archive$new( + x, + geo_type, + time_type, + other_keys, + additional_metadata, + compacify +)}\if{html}{\out{
}} } \subsection{Arguments}{ @@ -122,6 +129,12 @@ apart from "geo_value", "time_value", and "version".} \item{\code{additional_metadata}}{List of additional metadata to attach to the \code{epi_archive} object. The metadata will have \code{geo_type} and \code{time_type} fields; named entries from the passed list or will be included as well.} + +\item{\code{compactify}}{Determines whether redundant rows are removed for last +observation carried first (LOCF) results. Set to TRUE to remove these, +FALSE to leave as is. Not specifying the argument does the same as TRUE, +except it also notifies the user of change in the data, as well as the +methods that can be done to silence the message.} } \if{html}{\out{}} } diff --git a/vignettes/archive.Rmd b/vignettes/archive.Rmd index 77c1f826..313c9a3f 100644 --- a/vignettes/archive.Rmd +++ b/vignettes/archive.Rmd @@ -108,6 +108,24 @@ x$DT$percent_cli[1] <- original_value To make a copy, we can use the `clone()` method for an R6 class, as in `y <- x$clone()`. You can read more about reference semantics in Hadley Wickham's [Advanced R](https://adv-r.hadley.nz/r6.html#r6-semantics) book. + +## Removing LOCF data to save space + +We need not even include LOCF values, as they use up extra space. The +`compactify` argument is a way of removing LOCF values, as they use up extra +space that can slow down computational time. There are three +different values that can be asssigned to `compactify`: + +* No argument: Does not put in LOCF values, and prints a list of LOCF values +that have been omitted but could have been placed. +* `TRUE`: Does not put in LOCF values, but doesn't print anything relating +to which values have been omitted. +* `FALSE`: Includes LOCF values. + +For this example, we have one chart using LOCF values, while another doesn't +use them to illustrate LOCF. + + ## Some details on metadata @@ -202,10 +220,6 @@ quite as dramatically. 
Modeling the revision process, which is often called - -Finally, we can use compactify to remove variables that match the LOCF (last -observation carried forward) - ## Merging `epi_archive` objects From 084bf8c1c8b835926d96c5a28f2129ef0fe59773 Mon Sep 17 00:00:00 2001 From: kenmawer Date: Fri, 20 May 2022 14:49:49 -0700 Subject: [PATCH 08/96] Added function to remove LOCF. This was copy-pasted while working outside this project. --- R/archive.R | 56 ++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 47 insertions(+), 9 deletions(-) diff --git a/R/archive.R b/R/archive.R index 558ae8d5..458d3133 100644 --- a/R/archive.R +++ b/R/archive.R @@ -159,24 +159,62 @@ epi_archive = compactify = NULL } + # Create the data table; if x was an un-keyed data.table itself, + # then the call to as.data.table() will fail to set keys, so we + # need to check this, then do it manually if needed + key_vars = c("geo_value", "time_value", other_keys, "version") + DT = as.data.table(x, key = key_vars) + if (!identical(key_vars, key(DT))) setkeyv(DT, cols = key_vars) + + # functions for LOCF + ### + order <- function(df) { + arrange(df,version,time_value,geo_value) + } + + # Check if previous entry is in group. 
+ mutate_in_group <- function(df) { + mutate(df, in_group = + replace_na( + (geo_value == lag(geo_value) & + version == lag(version)), + FALSE + ) + ) + } + + # Remove LOCF values + rm_locf <- function (df) { + df %>% + order() %>% + mutate_in_group() %>% + filter(!in_group | percent_cli != lag(percent_cli)) %>% + select(-in_group) + } + + # Keeps LOCF values, such as to be printed + keep_locf <- function(df) { + df %>% + order() %>% + mutate_in_group() %>% + filter(in_group & percent_cli == lag(percent_cli)) %>% + select(-in_group) + } + ### + # Runs compactify on data frame - if (compactify == TRUE || is.null(compactify)) { - df # stub + if (compactify == TRUE | is.null(compactify)) { + elim = keep_locf(DT) + DT = rm_locf(DT) } # Warns about redundant rows if (is.null(compactify)) { Warn("Note: redundant rows found. To remove warning, set compactify to TRUE or fix these rows") + # call elim with for loop } - # Create the data table; if x was an un-keyed data.table itself, - # then the call to as.data.table() will fail to set keys, so we - # need to check this, then do it manually if needed - key_vars = c("geo_value", "time_value", other_keys, "version") - DT = as.data.table(x, key = key_vars) - if (!identical(key_vars, key(DT))) setkeyv(DT, cols = key_vars) - # Instantiate all self variables self$DT = DT self$geo_type = geo_type From d6a5feda654ea390f3ccadeb6e92b8cf03a5adac Mon Sep 17 00:00:00 2001 From: kenmawer Date: Fri, 20 May 2022 15:51:36 -0700 Subject: [PATCH 09/96] Still need to figure out how to get this to not give me a null dataset. --- R/archive.R | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/R/archive.R b/R/archive.R index 458d3133..77736488 100644 --- a/R/archive.R +++ b/R/archive.R @@ -212,7 +212,10 @@ epi_archive = if (is.null(compactify)) { Warn("Note: redundant rows found. 
To remove warning, set compactify to TRUE or fix these rows") - # call elim with for loop + # call elim with for loop, up to 6 + for (i in min(6,nrow(elim))) { + print(elim[i,]) + } } # Instantiate all self variables @@ -530,8 +533,9 @@ epi_archive = #' time_type = "day", #' other_keys = "county") as_epi_archive = function(x, geo_type, time_type, other_keys, - additional_metadata = list()) { - epi_archive$new(x, geo_type, time_type, other_keys, additional_metadata) + additional_metadata = list(),compactify = NULL) { + epi_archive$new(x, geo_type, time_type, other_keys, additional_metadata, + compactify) } #' Test for `epi_archive` format From 87c487d141a8836bf50dbb7220448d3fd96ebab0 Mon Sep 17 00:00:00 2001 From: kenmawer Date: Fri, 20 May 2022 16:31:31 -0700 Subject: [PATCH 10/96] Fixed a typo on the code. I still haven't finished it. --- R/archive.R | 2 +- man/as_epi_archive.Rd | 3 ++- man/epi_archive.Rd | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/R/archive.R b/R/archive.R index 77736488..62f9db7a 100644 --- a/R/archive.R +++ b/R/archive.R @@ -110,7 +110,7 @@ epi_archive = #' @return An `epi_archive` object. #' @importFrom data.table as.data.table key setkeyv initialize = function(x, geo_type, time_type, other_keys, - additional_metadata, compacify) { + additional_metadata, compactify) { # Check that we have a data frame if (!is.data.frame(x)) { Abort("`x` must be a data frame.") diff --git a/man/as_epi_archive.Rd b/man/as_epi_archive.Rd index bb32ebfd..a5f5542a 100644 --- a/man/as_epi_archive.Rd +++ b/man/as_epi_archive.Rd @@ -9,7 +9,8 @@ as_epi_archive( geo_type, time_type, other_keys, - additional_metadata = list() + additional_metadata = list(), + compactify = NULL ) } \arguments{ diff --git a/man/epi_archive.Rd b/man/epi_archive.Rd index 585cb9eb..61efb105 100644 --- a/man/epi_archive.Rd +++ b/man/epi_archive.Rd @@ -104,7 +104,7 @@ Creates a new \code{epi_archive} object. 
time_type, other_keys, additional_metadata, - compacify + compacify = NULL )}\if{html}{\out{}} } From 22937ae626a7b61dbe57f30caea3dc58d7aa3676 Mon Sep 17 00:00:00 2001 From: kenmawer Date: Fri, 20 May 2022 17:00:55 -0700 Subject: [PATCH 11/96] Changed some text and fixed a typo. --- R/archive.R | 17 +++++++---------- man/epi_archive.Rd | 2 +- 2 files changed, 8 insertions(+), 11 deletions(-) diff --git a/R/archive.R b/R/archive.R index 62f9db7a..2b678ce3 100644 --- a/R/archive.R +++ b/R/archive.R @@ -152,12 +152,7 @@ epi_archive = } # Finish off with compactify - if (missing(compacify)) { - compactify = NULL - } else if (compactify != TRUE && compactify != FALSE) { - Warn("Non-boolean value inserted for compactify. Resetting to default") - compactify = NULL - } + if (missing(compactify)) compactify = NULL # Create the data table; if x was an un-keyed data.table itself, # then the call to as.data.table() will fail to set keys, so we @@ -169,7 +164,7 @@ epi_archive = # functions for LOCF ### order <- function(df) { - arrange(df,version,time_value,geo_value) + arrange(df,geo_value,version,time_value) } # Check if previous entry is in group. @@ -203,19 +198,22 @@ epi_archive = ### # Runs compactify on data frame - if (compactify == TRUE | is.null(compactify)) { + if (is.null(compactify) || compactify == TRUE) { elim = keep_locf(DT) DT = rm_locf(DT) } # Warns about redundant rows - if (is.null(compactify)) { + if (is.null(compactify) & nrow(elim) > 0) { Warn("Note: redundant rows found. 
To remove warning, set compactify to TRUE or fix these rows") # call elim with for loop, up to 6 for (i in min(6,nrow(elim))) { print(elim[i,]) } + if (nrow(elim) > 6) { + print("And so on...") + } } # Instantiate all self variables @@ -223,7 +221,6 @@ epi_archive = self$geo_type = geo_type self$time_type = time_type self$additional_metadata = additional_metadata - self$compactify = compactify }, print = function() { cat("An `epi_archive` object, with metadata:\n") diff --git a/man/epi_archive.Rd b/man/epi_archive.Rd index 61efb105..d070aa30 100644 --- a/man/epi_archive.Rd +++ b/man/epi_archive.Rd @@ -104,7 +104,7 @@ Creates a new \code{epi_archive} object. time_type, other_keys, additional_metadata, - compacify = NULL + compactify )}\if{html}{\out{}} } From 7310d22dc1a63816388285b3eca0e7db75e65a93 Mon Sep 17 00:00:00 2001 From: kenmawer Date: Fri, 20 May 2022 17:01:30 -0700 Subject: [PATCH 12/96] Add a comment about code --- R/archive.R | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/R/archive.R b/R/archive.R index 2b678ce3..1491af0d 100644 --- a/R/archive.R +++ b/R/archive.R @@ -162,7 +162,7 @@ epi_archive = if (!identical(key_vars, key(DT))) setkeyv(DT, cols = key_vars) # functions for LOCF - ### + # orders data frame to observe potential LOCF order <- function(df) { arrange(df,geo_value,version,time_value) } @@ -195,7 +195,6 @@ epi_archive = filter(in_group & percent_cli == lag(percent_cli)) %>% select(-in_group) } - ### # Runs compactify on data frame if (is.null(compactify) || compactify == TRUE) { From 5eb4bddbd52c5f2c91b875f378410ed13fc4581e Mon Sep 17 00:00:00 2001 From: kenmawer Date: Tue, 31 May 2022 13:10:04 -0700 Subject: [PATCH 13/96] Put in a code stub for compactify testing. 
--- tests/testthat/test-compactify.R | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 tests/testthat/test-compactify.R diff --git a/tests/testthat/test-compactify.R b/tests/testthat/test-compactify.R new file mode 100644 index 00000000..c9c5e2fd --- /dev/null +++ b/tests/testthat/test-compactify.R @@ -0,0 +1,3 @@ +test_that("compactify gets an error", { + expect_equal(1,1) +}) From 82756d531b0a08dc9a096b6742853a68aead1796 Mon Sep 17 00:00:00 2001 From: kenmawer Date: Tue, 31 May 2022 16:20:29 -0700 Subject: [PATCH 14/96] Made tests and fixed a bug. --- R/archive.R | 3 +++ tests/testthat/test-compactify.R | 40 ++++++++++++++++++++++++++++++-- 2 files changed, 41 insertions(+), 2 deletions(-) diff --git a/R/archive.R b/R/archive.R index 1491af0d..ae098412 100644 --- a/R/archive.R +++ b/R/archive.R @@ -200,6 +200,9 @@ epi_archive = if (is.null(compactify) || compactify == TRUE) { elim = keep_locf(DT) DT = rm_locf(DT) + } else { + # Create empty data frame for nrow(elim) to be 0 + elim = tibble() } # Warns about redundant rows diff --git a/tests/testthat/test-compactify.R b/tests/testthat/test-compactify.R index c9c5e2fd..69179eb9 100644 --- a/tests/testthat/test-compactify.R +++ b/tests/testthat/test-compactify.R @@ -1,3 +1,39 @@ -test_that("compactify gets an error", { - expect_equal(1,1) +library(delphi.epidata) +library(epiprocess) +library(data.table) +library(dplyr) + +dv <- covidcast( + data_source = "doctor-visits", + signals = "smoothed_adj_cli", + time_type = "day", + geo_type = "state", + time_values = epirange(20211101, 20211201), + geo_values = "ca", + issues = epirange(20211129, 20211129) +) %>% + fetch_tbl() %>% + select(geo_value, time_value, version = issue, percent_cli = value) + +dv_duplicated <- dv +for (i in 1:5) { + dv_duplicated[i,4] <- 6 +} + +# These are object pointers +dv_true <- as_tibble(as_epi_archive(dv_duplicated,compactify=TRUE)$DT) +dv_false <- as_tibble(as_epi_archive(dv_duplicated,compactify=FALSE)$DT) +dv_null <- 
as_tibble(as_epi_archive(dv_duplicated,compactify=TRUE)$DT) + +# No compactify applied +test_that("LOCF values are ignored", { + expect_identical(dv_duplicated,dv_false) }) + +# compacity applied +# No compactify applied +test_that("LOCF values are taken out", { + dv_unique <- distinct(dv_false,percent_cli,.keep_all = TRUE) + expect_identical(dv_true,dv_null) + expect_identical(dv_null,dv_unique) +}) \ No newline at end of file From 97a21617e0c05420e6a1d6586bf5a36badf98629 Mon Sep 17 00:00:00 2001 From: kenmawer Date: Tue, 31 May 2022 16:32:39 -0700 Subject: [PATCH 15/96] There's still work to do in ensuring the LOCF rows are printed properly! --- R/archive.R | 1 + 1 file changed, 1 insertion(+) diff --git a/R/archive.R b/R/archive.R index ae098412..6c2797d4 100644 --- a/R/archive.R +++ b/R/archive.R @@ -212,6 +212,7 @@ epi_archive = # call elim with for loop, up to 6 for (i in min(6,nrow(elim))) { print(elim[i,]) + # TODO: This needs to print rows properly! } if (nrow(elim) > 6) { print("And so on...") From 21d3d5540ac9d3ee56e0bce3881dced05ea4d34e Mon Sep 17 00:00:00 2001 From: kenmawer Date: Tue, 31 May 2022 16:53:48 -0700 Subject: [PATCH 16/96] Fixed tests, LOCF rows still need to be printed. --- R/archive.R | 2 +- tests/testthat/test-compactify.R | 8 ++------ 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/R/archive.R b/R/archive.R index 6c2797d4..f5fdfc15 100644 --- a/R/archive.R +++ b/R/archive.R @@ -211,8 +211,8 @@ epi_archive = set compactify to TRUE or fix these rows") # call elim with for loop, up to 6 for (i in min(6,nrow(elim))) { - print(elim[i,]) # TODO: This needs to print rows properly! 
+ print(elim[i,]) } if (nrow(elim) > 6) { print("And so on...") diff --git a/tests/testthat/test-compactify.R b/tests/testthat/test-compactify.R index 69179eb9..8d1cccca 100644 --- a/tests/testthat/test-compactify.R +++ b/tests/testthat/test-compactify.R @@ -20,18 +20,14 @@ for (i in 1:5) { dv_duplicated[i,4] <- 6 } -# These are object pointers dv_true <- as_tibble(as_epi_archive(dv_duplicated,compactify=TRUE)$DT) dv_false <- as_tibble(as_epi_archive(dv_duplicated,compactify=FALSE)$DT) -dv_null <- as_tibble(as_epi_archive(dv_duplicated,compactify=TRUE)$DT) +dv_null <- as_tibble(as_epi_archive(dv_duplicated,compactify=NULL)$DT) -# No compactify applied test_that("LOCF values are ignored", { - expect_identical(dv_duplicated,dv_false) + expect_identical(nrow(dv_duplicated),nrow(dv_false)) }) -# compacity applied -# No compactify applied test_that("LOCF values are taken out", { dv_unique <- distinct(dv_false,percent_cli,.keep_all = TRUE) expect_identical(dv_true,dv_null) From ffb32fe38fe200082ace305c7ba0ad931c3f29be Mon Sep 17 00:00:00 2001 From: kenmawer Date: Wed, 1 Jun 2022 10:07:51 -0700 Subject: [PATCH 17/96] Added warnings for LOCF. --- R/archive.R | 17 +++++++++-------- tests/testthat/test-compactify.R | 2 ++ 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/R/archive.R b/R/archive.R index f5fdfc15..78a5b2f2 100644 --- a/R/archive.R +++ b/R/archive.R @@ -206,17 +206,18 @@ epi_archive = } # Warns about redundant rows - if (is.null(compactify) & nrow(elim) > 0) { - Warn("Note: redundant rows found. To remove warning, - set compactify to TRUE or fix these rows") + if (is.null(compactify) && nrow(elim) > 0) { + warning <- "Redundant rows found. To remove warning, + set compactify to TRUE or fix these rows \n" # call elim with for loop, up to 6 - for (i in min(6,nrow(elim))) { - # TODO: This needs to print rows properly! 
- print(elim[i,]) + for (i in 1:min(6,nrow(elim))) { + warning <- paste(warning, + elim[[i,1]],elim[[i,2]],elim[[i,3]],elim[[i,4]],"\n") } if (nrow(elim) > 6) { - print("And so on...") + warning <- paste(warning,"And so on...") } + Warn(warning) } # Instantiate all self variables @@ -249,7 +250,7 @@ epi_archive = cat("----------\n") cat(sprintf("Public methods: %s", paste(names(epi_archive$public_methods), - collapse = ", "))) + collapse = ", ")),"\n") }, ##### diff --git a/tests/testthat/test-compactify.R b/tests/testthat/test-compactify.R index 8d1cccca..1b858a3a 100644 --- a/tests/testthat/test-compactify.R +++ b/tests/testthat/test-compactify.R @@ -20,6 +20,8 @@ for (i in 1:5) { dv_duplicated[i,4] <- 6 } +as_epi_archive(dv_duplicated,compactify=NULL) + dv_true <- as_tibble(as_epi_archive(dv_duplicated,compactify=TRUE)$DT) dv_false <- as_tibble(as_epi_archive(dv_duplicated,compactify=FALSE)$DT) dv_null <- as_tibble(as_epi_archive(dv_duplicated,compactify=NULL)$DT) From fd1c26a03dc6e1b9ba59c0aafd188410c006ed55 Mon Sep 17 00:00:00 2001 From: kenmawer Date: Wed, 1 Jun 2022 11:35:18 -0700 Subject: [PATCH 18/96] Updated testing and warnings. --- R/archive.R | 13 +++++++------ tests/testthat/test-compactify.R | 2 +- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/R/archive.R b/R/archive.R index 78a5b2f2..6a6c7629 100644 --- a/R/archive.R +++ b/R/archive.R @@ -207,17 +207,18 @@ epi_archive = # Warns about redundant rows if (is.null(compactify) && nrow(elim) > 0) { - warning <- "Redundant rows found. To remove warning, - set compactify to TRUE or fix these rows \n" + warn <- paste("\nRedundant rows found. 
To remove warning, set", + "compactify to TRUE or fix these rows: \n") # call elim with for loop, up to 6 for (i in 1:min(6,nrow(elim))) { - warning <- paste(warning, - elim[[i,1]],elim[[i,2]],elim[[i,3]],elim[[i,4]],"\n") + warn <- paste0(warn, + paste(elim[[i,1]],elim[[i,2]],elim[[i,3]],elim[[i,4]],"\n") + ) } if (nrow(elim) > 6) { - warning <- paste(warning,"And so on...") + warn <- paste0(warn,"And so on...") } - Warn(warning) + warning(warn) } # Instantiate all self variables diff --git a/tests/testthat/test-compactify.R b/tests/testthat/test-compactify.R index 1b858a3a..56c9ad44 100644 --- a/tests/testthat/test-compactify.R +++ b/tests/testthat/test-compactify.R @@ -16,7 +16,7 @@ dv <- covidcast( select(geo_value, time_value, version = issue, percent_cli = value) dv_duplicated <- dv -for (i in 1:5) { +for (i in 1:8) { dv_duplicated[i,4] <- 6 } From cbdf9dd9340ba5e442490404719d12a7d17ce9cc Mon Sep 17 00:00:00 2001 From: kenmawer Date: Wed, 1 Jun 2022 11:35:30 -0700 Subject: [PATCH 19/96] Updated testing. 
--- tests/testthat/test-compactify.R | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tests/testthat/test-compactify.R b/tests/testthat/test-compactify.R index 56c9ad44..261903cf 100644 --- a/tests/testthat/test-compactify.R +++ b/tests/testthat/test-compactify.R @@ -19,13 +19,15 @@ dv_duplicated <- dv for (i in 1:8) { dv_duplicated[i,4] <- 6 } - -as_epi_archive(dv_duplicated,compactify=NULL) - + dv_true <- as_tibble(as_epi_archive(dv_duplicated,compactify=TRUE)$DT) dv_false <- as_tibble(as_epi_archive(dv_duplicated,compactify=FALSE)$DT) dv_null <- as_tibble(as_epi_archive(dv_duplicated,compactify=NULL)$DT) +test_that("Warning for LOCF with compactify as NULL", { + expect_warning(as_epi_archive(dv_duplicated,compactify=NULL)) +}) + test_that("LOCF values are ignored", { expect_identical(nrow(dv_duplicated),nrow(dv_false)) }) From c417f56579ecccdeb2306477963f9789fe003c64 Mon Sep 17 00:00:00 2001 From: kenmawer Date: Wed, 8 Jun 2022 18:09:30 -0700 Subject: [PATCH 20/96] Still needs updating so tests pass. No longer using delphi.epidata. 
--- DESCRIPTION | 3 +-- R/archive.R | 15 ++++++++----- man/as_epi_archive.Rd | 12 +++++++---- man/epi_archive.Rd | 36 ++++++++++++++++---------------- man/epi_slide.Rd | 12 +++++++---- man/epix_as_of.Rd | 12 +++++++---- man/epix_merge.Rd | 12 +++++++---- man/epix_slide.Rd | 12 +++++++---- tests/testthat/test-compactify.R | 31 +++++++++------------------ 9 files changed, 79 insertions(+), 66 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 7244167e..ceb5b29c 100755 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -51,7 +51,6 @@ Config/testthat/edition: 3 Encoding: UTF-8 LazyData: true Roxygen: list(markdown = TRUE) -RoxygenNote: 7.1.2 +RoxygenNote: 7.2.0 Depends: R (>= 2.10) - diff --git a/R/archive.R b/R/archive.R index 6a6c7629..8d0ec4ac 100644 --- a/R/archive.R +++ b/R/archive.R @@ -152,7 +152,12 @@ epi_archive = } # Finish off with compactify - if (missing(compactify)) compactify = NULL + if (missing(compactify)) { + compactify = NULL + } else if (!rlang::is_bool(compactify) && + !rlang::is_null(compactify)) { + Abort("compactify must be boolean or null.") + } # Create the data table; if x was an un-keyed data.table itself, # then the call to as.data.table() will fail to set keys, so we @@ -163,14 +168,14 @@ epi_archive = # functions for LOCF # orders data frame to observe potential LOCF - order <- function(df) { + order_locf <- function(df) { arrange(df,geo_value,version,time_value) } # Check if previous entry is in group. 
mutate_in_group <- function(df) { mutate(df, in_group = - replace_na( + tidyr::replace_na( (geo_value == lag(geo_value) & version == lag(version)), FALSE @@ -181,7 +186,7 @@ epi_archive = # Remove LOCF values rm_locf <- function (df) { df %>% - order() %>% + order_locf() %>% mutate_in_group() %>% filter(!in_group | percent_cli != lag(percent_cli)) %>% select(-in_group) @@ -190,7 +195,7 @@ epi_archive = # Keeps LOCF values, such as to be printed keep_locf <- function(df) { df %>% - order() %>% + order_locf() %>% mutate_in_group() %>% filter(in_group & percent_cli == lag(percent_cli)) %>% select(-in_group) diff --git a/man/as_epi_archive.Rd b/man/as_epi_archive.Rd index a5f5542a..729ca870 100644 --- a/man/as_epi_archive.Rd +++ b/man/as_epi_archive.Rd @@ -43,11 +43,15 @@ examples. } \details{ This simply a wrapper around the \code{new()} method of the \code{epi_archive} -class, so for example:\preformatted{x <- as_epi_archive(df, geo_type = "state", time_type = "day") -} +class, so for example: -would be equivalent to:\preformatted{x <- epi_archive$new(df, geo_type = "state", time_type = "day") -} +\if{html}{\out{
}}\preformatted{x <- as_epi_archive(df, geo_type = "state", time_type = "day") +}\if{html}{\out{
}} + +would be equivalent to: + +\if{html}{\out{
}}\preformatted{x <- epi_archive$new(df, geo_type = "state", time_type = "day") +}\if{html}{\out{
}} } \examples{ df <- data.frame (geo_value = c(replicate(2, "ca"), replicate(2, "fl")), diff --git a/man/epi_archive.Rd b/man/epi_archive.Rd index d070aa30..902fe1e2 100644 --- a/man/epi_archive.Rd +++ b/man/epi_archive.Rd @@ -84,17 +84,17 @@ are documented in the wrapper function \code{epix_slide()}. \section{Methods}{ \subsection{Public methods}{ \itemize{ -\item \href{#method-new}{\code{epi_archive$new()}} -\item \href{#method-print}{\code{epi_archive$print()}} -\item \href{#method-as_of}{\code{epi_archive$as_of()}} -\item \href{#method-merge}{\code{epi_archive$merge()}} -\item \href{#method-slide}{\code{epi_archive$slide()}} -\item \href{#method-clone}{\code{epi_archive$clone()}} +\item \href{#method-epi_archive-new}{\code{epi_archive$new()}} +\item \href{#method-epi_archive-print}{\code{epi_archive$print()}} +\item \href{#method-epi_archive-as_of}{\code{epi_archive$as_of()}} +\item \href{#method-epi_archive-merge}{\code{epi_archive$merge()}} +\item \href{#method-epi_archive-slide}{\code{epi_archive$slide()}} +\item \href{#method-epi_archive-clone}{\code{epi_archive$clone()}} } } \if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-new}{}}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-epi_archive-new}{}}} \subsection{Method \code{new()}}{ Creates a new \code{epi_archive} object. \subsection{Usage}{ @@ -143,8 +143,8 @@ An \code{epi_archive} object. } } \if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-print}{}}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-epi_archive-print}{}}} \subsection{Method \code{print()}}{ \subsection{Usage}{ \if{html}{\out{
}}\preformatted{epi_archive$print()}\if{html}{\out{
}} @@ -152,8 +152,8 @@ An \code{epi_archive} object. } \if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-as_of}{}}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-epi_archive-as_of}{}}} \subsection{Method \code{as_of()}}{ Generates a snapshot in \code{epi_df} format as of a given version. See the documentation for the wrapper function \code{epix_as_of()} for details. @@ -163,8 +163,8 @@ See the documentation for the wrapper function \code{epix_as_of()} for details. } \if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-merge}{}}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-epi_archive-merge}{}}} \subsection{Method \code{merge()}}{ Merges another \code{data.table} with the current one, and allows for a post-filling of \code{NA} values by last observation carried forward (LOCF). @@ -175,8 +175,8 @@ See the documentation for the wrapper function \code{epix_merge()} for details. } \if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-slide}{}}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-epi_archive-slide}{}}} \subsection{Method \code{slide()}}{ Slides a given function over variables in an \code{epi_archive} object. See the documentation for the wrapper function \code{epix_as_of()} for @@ -198,8 +198,8 @@ details. } \if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-clone}{}}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-epi_archive-clone}{}}} \subsection{Method \code{clone()}}{ The objects of this class are cloneable with this method. \subsection{Usage}{ diff --git a/man/epi_slide.Rd b/man/epi_slide.Rd index 1e42dbf3..cc3ae6be 100644 --- a/man/epi_slide.Rd +++ b/man/epi_slide.Rd @@ -104,12 +104,16 @@ incomplete windows) is therefore left up to the user, either through the specified function or formula \code{f}, or through post-processing. If \code{f} is missing, then an expression for tidy evaluation can be specified, -for example, as in:\preformatted{epi_slide(x, cases_7dav = mean(cases), n = 7) -} +for example, as in: + +\if{html}{\out{
}}\preformatted{epi_slide(x, cases_7dav = mean(cases), n = 7) +}\if{html}{\out{
}} -which would be equivalent to:\preformatted{epi_slide(x, function(x, ...) mean(x$cases), n = 7, +which would be equivalent to: + +\if{html}{\out{
}}\preformatted{epi_slide(x, function(x, ...) mean(x$cases), n = 7, new_col_name = "cases_7dav") -} +}\if{html}{\out{
}} Thus, to be clear, when the computation is specified via an expression for tidy evaluation (first example, above), then the name for the new column is diff --git a/man/epix_as_of.Rd b/man/epix_as_of.Rd index 94961074..969ae868 100644 --- a/man/epix_as_of.Rd +++ b/man/epix_as_of.Rd @@ -29,11 +29,15 @@ examples. } \details{ This is simply a wrapper around the \code{as_of()} method of the -\code{epi_archive} class, so if \code{x} is an \code{epi_archive} object, then:\preformatted{epix_as_of(x, max_version = v) -} +\code{epi_archive} class, so if \code{x} is an \code{epi_archive} object, then: -is equivalent to:\preformatted{x$as_of(max_version = v) -} +\if{html}{\out{
}}\preformatted{epix_as_of(x, max_version = v) +}\if{html}{\out{
}} + +is equivalent to: + +\if{html}{\out{
}}\preformatted{x$as_of(max_version = v) +}\if{html}{\out{
}} } \examples{ epix_as_of(x = archive_cases_dv, diff --git a/man/epix_merge.Rd b/man/epix_merge.Rd index 59bade72..b688bd06 100644 --- a/man/epix_merge.Rd +++ b/man/epix_merge.Rd @@ -35,11 +35,15 @@ examples. } \details{ This is simply a wrapper around the \code{merge()} method of the -\code{epi_archive} class, so if \code{x} and \code{y} are an \code{epi_archive} objects, then:\preformatted{epix_merge(x, y) -} +\code{epi_archive} class, so if \code{x} and \code{y} are an \code{epi_archive} objects, then: -is equivalent to:\preformatted{x$merge(y) -} +\if{html}{\out{
}}\preformatted{epix_merge(x, y) +}\if{html}{\out{
}} + +is equivalent to: + +\if{html}{\out{
}}\preformatted{x$merge(y) +}\if{html}{\out{
}} } \examples{ # create two example epi_archive datasets diff --git a/man/epix_slide.Rd b/man/epix_slide.Rd index 231f2646..c4763a38 100644 --- a/man/epix_slide.Rd +++ b/man/epix_slide.Rd @@ -115,11 +115,15 @@ should never be used in place of \code{epi_slide()}, and only used when version-aware sliding is necessary (as it its purpose). Finally, this is simply a wrapper around the \code{slide()} method of the -\code{epi_archive} class, so if \code{x} is an \code{epi_archive} object, then:\preformatted{epix_slide(x, new_var = comp(old_var), n = 120) -} +\code{epi_archive} class, so if \code{x} is an \code{epi_archive} object, then: -is equivalent to:\preformatted{x$slide(x, new_var = comp(old_var), n = 120) -} +\if{html}{\out{
}}\preformatted{epix_slide(x, new_var = comp(old_var), n = 120) +}\if{html}{\out{
}} + +is equivalent to: + +\if{html}{\out{
}}\preformatted{x$slide(x, new_var = comp(old_var), n = 120) +}\if{html}{\out{
}} } \examples{ # every date is a reference time point for the 3 day average sliding window diff --git a/tests/testthat/test-compactify.R b/tests/testthat/test-compactify.R index 261903cf..d14c0d65 100644 --- a/tests/testthat/test-compactify.R +++ b/tests/testthat/test-compactify.R @@ -1,35 +1,24 @@ -library(delphi.epidata) library(epiprocess) library(data.table) library(dplyr) -dv <- covidcast( - data_source = "doctor-visits", - signals = "smoothed_adj_cli", - time_type = "day", - geo_type = "state", - time_values = epirange(20211101, 20211201), - geo_values = "ca", - issues = epirange(20211129, 20211129) -) %>% - fetch_tbl() %>% - select(geo_value, time_value, version = issue, percent_cli = value) +dt <- filter(dv$DT,geo_value=="ca") +dt <- select(dt,-case_rate) -dv_duplicated <- dv -for (i in 1:8) { - dv_duplicated[i,4] <- 6 -} +test_that("Input for compactify must be NULL or a boolean", { + expect_error(as_epi_archive(dv_duplicated,compactify="no")) +}) -dv_true <- as_tibble(as_epi_archive(dv_duplicated,compactify=TRUE)$DT) -dv_false <- as_tibble(as_epi_archive(dv_duplicated,compactify=FALSE)$DT) -dv_null <- as_tibble(as_epi_archive(dv_duplicated,compactify=NULL)$DT) +dv_true <- as_tibble(as_epi_archive(dt,compactify=TRUE)$DT) +dv_false <- as_tibble(as_epi_archive(dt,compactify=FALSE)$DT) +dv_null <- as_tibble(as_epi_archive(dt,compactify=NULL)$DT) test_that("Warning for LOCF with compactify as NULL", { - expect_warning(as_epi_archive(dv_duplicated,compactify=NULL)) + expect_warning(as_epi_archive(dt,compactify=NULL)) }) test_that("LOCF values are ignored", { - expect_identical(nrow(dv_duplicated),nrow(dv_false)) + expect_identical(nrow(dt),nrow(dv_false)) }) test_that("LOCF values are taken out", { From fef7377fb95a488da716a1693ef75bfb0542e4b8 Mon Sep 17 00:00:00 2001 From: kenmawer Date: Thu, 9 Jun 2022 14:11:48 -0700 Subject: [PATCH 21/96] Changed how LOCF checking will work. (Still needs work.) 
--- R/archive.R | 4 ++-- tests/testthat/test-compactify.R | 31 ++++++++++++++++++++++--------- 2 files changed, 24 insertions(+), 11 deletions(-) diff --git a/R/archive.R b/R/archive.R index 8d0ec4ac..e0b425af 100644 --- a/R/archive.R +++ b/R/archive.R @@ -169,7 +169,7 @@ epi_archive = # functions for LOCF # orders data frame to observe potential LOCF order_locf <- function(df) { - arrange(df,geo_value,version,time_value) + arrange(df,geo_value,time_value,version) } # Check if previous entry is in group. @@ -177,7 +177,7 @@ epi_archive = mutate(df, in_group = tidyr::replace_na( (geo_value == lag(geo_value) & - version == lag(version)), + time_value == lag(time_value)), FALSE ) ) diff --git a/tests/testthat/test-compactify.R b/tests/testthat/test-compactify.R index d14c0d65..cce84b7e 100644 --- a/tests/testthat/test-compactify.R +++ b/tests/testthat/test-compactify.R @@ -2,27 +2,40 @@ library(epiprocess) library(data.table) library(dplyr) -dt <- filter(dv$DT,geo_value=="ca") -dt <- select(dt,-case_rate) +dt <- select(archive_cases_dv$DT,-case_rate) +dt_unique <- dt +dt_unique$percent_cli <- 0.1 * 1:160 + 20 + +head(dt) +arrange(dt,geo_value,time_value,version) %>% head() + +dt_full <- archive_cases_dv$DT +mutate(dt_full, in_group = + tidyr::replace_na( + (geo_value == lag(geo_value) & + time_value == lag(time_value)), + FALSE + ) +) test_that("Input for compactify must be NULL or a boolean", { expect_error(as_epi_archive(dv_duplicated,compactify="no")) }) -dv_true <- as_tibble(as_epi_archive(dt,compactify=TRUE)$DT) -dv_false <- as_tibble(as_epi_archive(dt,compactify=FALSE)$DT) -dv_null <- as_tibble(as_epi_archive(dt,compactify=NULL)$DT) +dt_true <- as_tibble(as_epi_archive(dt,compactify=TRUE)$DT) +dt_false <- as_tibble(as_epi_archive(dt,compactify=FALSE)$DT) +dt_null <- as_tibble(as_epi_archive(dt,compactify=NULL)$DT) test_that("Warning for LOCF with compactify as NULL", { expect_warning(as_epi_archive(dt,compactify=NULL)) }) test_that("LOCF values are ignored", { - 
expect_identical(nrow(dt),nrow(dv_false)) + expect_identical(nrow(dt),nrow(dt_false)) }) test_that("LOCF values are taken out", { - dv_unique <- distinct(dv_false,percent_cli,.keep_all = TRUE) - expect_identical(dv_true,dv_null) - expect_identical(dv_null,dv_unique) + dt_unique <- distinct(dt_false,case_rate,.keep_all = TRUE) + expect_identical(dt_true,dt_null) + expect_identical(dt_null,dt_unique) }) \ No newline at end of file From b14244298d64ab47e25e61ebefb4b8fe679be55a Mon Sep 17 00:00:00 2001 From: kenmawer Date: Thu, 9 Jun 2022 14:37:45 -0700 Subject: [PATCH 22/96] Added updates and changed some names. --- R/archive.R | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/R/archive.R b/R/archive.R index e0b425af..e57628e6 100644 --- a/R/archive.R +++ b/R/archive.R @@ -172,8 +172,9 @@ epi_archive = arrange(df,geo_value,time_value,version) } - # Check if previous entry is in group. - mutate_in_group <- function(df) { + # Check if previous entry is an LCOF value, and adds this column of + # Boolean values for the sake of filtering; NA values are FALSE + mutate_is_locf <- function(df) { mutate(df, in_group = tidyr::replace_na( (geo_value == lag(geo_value) & @@ -183,11 +184,16 @@ epi_archive = ) } + # Checks if a value is LOCF + is_locf <- function(row) { + FALSE #stub + } + # Remove LOCF values rm_locf <- function (df) { df %>% order_locf() %>% - mutate_in_group() %>% + mutate_is_locf() %>% filter(!in_group | percent_cli != lag(percent_cli)) %>% select(-in_group) } @@ -196,7 +202,7 @@ epi_archive = keep_locf <- function(df) { df %>% order_locf() %>% - mutate_in_group() %>% + mutate_is_locf() %>% filter(in_group & percent_cli == lag(percent_cli)) %>% select(-in_group) } From f7b353b83c5692bcb70322f3adedcf18815eb243 Mon Sep 17 00:00:00 2001 From: kenmawer Date: Thu, 9 Jun 2022 14:51:03 -0700 Subject: [PATCH 23/96] Changed LOCF check --- R/archive.R | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/R/archive.R 
b/R/archive.R index e57628e6..78e9cc5c 100644 --- a/R/archive.R +++ b/R/archive.R @@ -184,9 +184,9 @@ epi_archive = ) } - # Checks if a value is LOCF - is_locf <- function(row) { - FALSE #stub + # Checks for LOCF's in a data frame + is_locf <- function(df) { + df #stub } # Remove LOCF values From 46628ca863025a8bdfdd8be513ab5cf9da339604 Mon Sep 17 00:00:00 2001 From: kenmawer Date: Fri, 10 Jun 2022 11:31:56 -0700 Subject: [PATCH 24/96] Updated archive code. --- R/archive.R | 31 ++++++++++++------------------- 1 file changed, 12 insertions(+), 19 deletions(-) diff --git a/R/archive.R b/R/archive.R index 78e9cc5c..f1528175 100644 --- a/R/archive.R +++ b/R/archive.R @@ -171,22 +171,15 @@ epi_archive = order_locf <- function(df) { arrange(df,geo_value,time_value,version) } - - # Check if previous entry is an LCOF value, and adds this column of - # Boolean values for the sake of filtering; NA values are FALSE - mutate_is_locf <- function(df) { - mutate(df, in_group = - tidyr::replace_na( - (geo_value == lag(geo_value) & - time_value == lag(time_value)), - FALSE - ) - ) - } - + # Checks for LOCF's in a data frame - is_locf <- function(df) { - df #stub + mutate_is_locf <- function(df) { + df2 <- select(df,-version) + df2_lag <- lag(df2) + df_is_match <- ifelse(!is.na(df2) & !is.na(df2_lag), + df2 == df2_lag, + is.na(df2) & is.na(df2_lag)) + mutate(df,is_locf = apply(df_is_match,1,all)) } # Remove LOCF values @@ -194,8 +187,8 @@ epi_archive = df %>% order_locf() %>% mutate_is_locf() %>% - filter(!in_group | percent_cli != lag(percent_cli)) %>% - select(-in_group) + filter(!is_locf) %>% + select(-is_locf) } # Keeps LOCF values, such as to be printed @@ -203,8 +196,8 @@ epi_archive = df %>% order_locf() %>% mutate_is_locf() %>% - filter(in_group & percent_cli == lag(percent_cli)) %>% - select(-in_group) + filter(is_locf) %>% + select(-is_locf) } # Runs compactify on data frame From 5479dd80d48e630a9abaaed21f5515b8d7d69498 Mon Sep 17 00:00:00 2001 From: kenmawer Date: Fri, 10 
Jun 2022 18:45:46 -0700 Subject: [PATCH 25/96] Improved LOCF filtering by treating NA's as if they are LOCF's due to being redundant. --- R/archive.R | 34 ++++++++++++++++++++------------ tests/testthat/test-compactify.R | 18 ++++++----------- 2 files changed, 27 insertions(+), 25 deletions(-) diff --git a/R/archive.R b/R/archive.R index f1528175..68186c34 100644 --- a/R/archive.R +++ b/R/archive.R @@ -172,32 +172,40 @@ epi_archive = arrange(df,geo_value,time_value,version) } - # Checks for LOCF's in a data frame - mutate_is_locf <- function(df) { + # Checks for LOCF's in a data frame (this includes NA's) + mutate_is_redundant <- function(df) { df2 <- select(df,-version) - df2_lag <- lag(df2) - df_is_match <- ifelse(!is.na(df2) & !is.na(df2_lag), - df2 == df2_lag, - is.na(df2) & is.na(df2_lag)) - mutate(df,is_locf = apply(df_is_match,1,all)) + df_is_locf <- ifelse(!is.na(df2) & !is.na(lag(df2)), + df2 == lag(df2), + is.na(df2)) + is_locf <- apply(df_is_locf,1,all) + + df3 <- select(df,-version,-geo_value,-time_value) + is_na <- apply(is.na(df3),1,all) + + is_redundant <- data.frame(is_redundant = is_locf | is_na) + bind_cols(df, is_redundant) } + # NOTE: compactify removes both LOCF values and all null values, + # as to enable better handling of NA's that can also be redundant + # Remove LOCF values rm_locf <- function (df) { df %>% order_locf() %>% - mutate_is_locf() %>% - filter(!is_locf) %>% - select(-is_locf) + mutate_is_redundant() %>% + filter(!is_redundant) %>% + select(-is_redundant) } # Keeps LOCF values, such as to be printed keep_locf <- function(df) { df %>% order_locf() %>% - mutate_is_locf() %>% - filter(is_locf) %>% - select(-is_locf) + mutate_is_redundant() %>% + filter(is_redundant) %>% + select(-is_redundant) } # Runs compactify on data frame diff --git a/tests/testthat/test-compactify.R b/tests/testthat/test-compactify.R index cce84b7e..177e0daa 100644 --- a/tests/testthat/test-compactify.R +++ b/tests/testthat/test-compactify.R @@ -3,20 +3,11 @@ 
library(data.table) library(dplyr) dt <- select(archive_cases_dv$DT,-case_rate) -dt_unique <- dt -dt_unique$percent_cli <- 0.1 * 1:160 + 20 head(dt) arrange(dt,geo_value,time_value,version) %>% head() dt_full <- archive_cases_dv$DT -mutate(dt_full, in_group = - tidyr::replace_na( - (geo_value == lag(geo_value) & - time_value == lag(time_value)), - FALSE - ) -) test_that("Input for compactify must be NULL or a boolean", { expect_error(as_epi_archive(dv_duplicated,compactify="no")) @@ -30,12 +21,15 @@ test_that("Warning for LOCF with compactify as NULL", { expect_warning(as_epi_archive(dt,compactify=NULL)) }) -test_that("LOCF values are ignored", { +test_that("LOCF values are ignored with compactify=FALSE", { expect_identical(nrow(dt),nrow(dt_false)) }) -test_that("LOCF values are taken out", { - dt_unique <- distinct(dt_false,case_rate,.keep_all = TRUE) +test_that("LOCF values are taken out with compactify=TRUE", { + dt_unique <- dt_false %>% + distinct(percent_cli,.keep_all = TRUE) %>% + filter(!is.na(percent_cli)) + expect_identical(dt_true,dt_null) expect_identical(dt_null,dt_unique) }) \ No newline at end of file From 4f0ee0dc5661b84be684d47d24d65ac645acfb64 Mon Sep 17 00:00:00 2001 From: kenmawer Date: Mon, 13 Jun 2022 08:44:43 -0700 Subject: [PATCH 26/96] Modified arrange function to also account for other_keys. 
--- R/archive.R | 4 +++- man/reexports.Rd | 6 +++--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/R/archive.R b/R/archive.R index 68186c34..99454c13 100644 --- a/R/archive.R +++ b/R/archive.R @@ -169,7 +169,9 @@ epi_archive = # functions for LOCF # orders data frame to observe potential LOCF order_locf <- function(df) { - arrange(df,geo_value,time_value,version) + ifelse(is.null(other_keys), + arrange(df,geo_value,time_value,version), + arrange(df,geo_value,time_value,other_keys,version)) } # Checks for LOCF's in a data frame (this includes NA's) diff --git a/man/reexports.Rd b/man/reexports.Rd index b633e86c..c63d54cc 100644 --- a/man/reexports.Rd +++ b/man/reexports.Rd @@ -21,10 +21,10 @@ These objects are imported from other packages. Follow the links below to see their documentation. \describe{ - \item{dplyr}{\code{\link[dplyr]{arrange}}, \code{\link[dplyr]{filter}}, \code{\link[dplyr]{group_by}}, \code{\link[dplyr:group_map]{group_modify}}, \code{\link[dplyr]{mutate}}, \code{\link[dplyr]{relocate}}, \code{\link[dplyr]{rename}}, \code{\link[dplyr]{slice}}, \code{\link[dplyr:group_by]{ungroup}}} + \item{dplyr}{\code{\link[dplyr]{arrange}}, \code{\link[dplyr]{filter}}, \code{\link[dplyr]{group_by}}, \code{\link[dplyr]{group_modify}}, \code{\link[dplyr]{mutate}}, \code{\link[dplyr]{relocate}}, \code{\link[dplyr]{rename}}, \code{\link[dplyr]{slice}}, \code{\link[dplyr]{ungroup}}} - \item{tidyr}{\code{\link[tidyr:nest]{unnest}}} + \item{tidyr}{\code{\link[tidyr]{unnest}}} - \item{tsibble}{\code{\link[tsibble:as-tsibble]{as_tsibble}}} + \item{tsibble}{\code{\link[tsibble]{as_tsibble}}} }} From 5420261e280514b73b42211757c3f37a4d331e67 Mon Sep 17 00:00:00 2001 From: kenmawer Date: Mon, 13 Jun 2022 09:44:07 -0700 Subject: [PATCH 27/96] Update comment on RStudio. 
--- man/reexports.Rd | 6 +++--- vignettes/archive.Rmd | 9 +++++++++ 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/man/reexports.Rd b/man/reexports.Rd index c63d54cc..b633e86c 100644 --- a/man/reexports.Rd +++ b/man/reexports.Rd @@ -21,10 +21,10 @@ These objects are imported from other packages. Follow the links below to see their documentation. \describe{ - \item{dplyr}{\code{\link[dplyr]{arrange}}, \code{\link[dplyr]{filter}}, \code{\link[dplyr]{group_by}}, \code{\link[dplyr]{group_modify}}, \code{\link[dplyr]{mutate}}, \code{\link[dplyr]{relocate}}, \code{\link[dplyr]{rename}}, \code{\link[dplyr]{slice}}, \code{\link[dplyr]{ungroup}}} + \item{dplyr}{\code{\link[dplyr]{arrange}}, \code{\link[dplyr]{filter}}, \code{\link[dplyr]{group_by}}, \code{\link[dplyr:group_map]{group_modify}}, \code{\link[dplyr]{mutate}}, \code{\link[dplyr]{relocate}}, \code{\link[dplyr]{rename}}, \code{\link[dplyr]{slice}}, \code{\link[dplyr:group_by]{ungroup}}} - \item{tidyr}{\code{\link[tidyr]{unnest}}} + \item{tidyr}{\code{\link[tidyr:nest]{unnest}}} - \item{tsibble}{\code{\link[tsibble]{as_tsibble}}} + \item{tsibble}{\code{\link[tsibble:as-tsibble]{as_tsibble}}} }} diff --git a/vignettes/archive.Rmd b/vignettes/archive.Rmd index f9f430d4..15122880 100644 --- a/vignettes/archive.Rmd +++ b/vignettes/archive.Rmd @@ -126,6 +126,15 @@ For this example, we have one chart using LOCF values, while another doesn't use them to illustrate LOCF. + +We need not even store rows that just look like the last observation carried +forward (LOCF), as they use up extra space, and we apply LOCF in +epi_archive-related functions, such that the results should be the same with or +without these rows, as long as use code does not rely on directly modifying the +epi_archive's fields in a way that expects all the original records and breaks +if they are trimmed down. 
+ + - -We need not even store rows that just look like the last observation carried -forward (LOCF), as they use up extra space, and we apply LOCF in -epi_archive-related functions, such that the results should be the same with or -without these rows, as long as use code does not rely on directly modifying the -epi_archive's fields in a way that expects all the original records and breaks -if they are trimmed down. - - +```{r} +locf_omitted <- as_epi_archive(archive_cases_dv$DT) +locf_included <- as_epi_archive(archive_cases_dv$DT,compactify = FALSE) + +head(locf_omitted$DT) +head(locf_included$DT) +``` ## Some details on metadata From e4d15c6cb38e46c72e1e2140acd20e7f2b39ad09 Mon Sep 17 00:00:00 2001 From: kenmawer Date: Wed, 15 Jun 2022 18:52:15 -0700 Subject: [PATCH 32/96] Updated a few changes based on pull request comments. --- R/archive.R | 17 ++++++++++++----- man/as_epi_archive.Rd | 4 ++++ man/epi_archive.Rd | 10 +++++++--- 3 files changed, 23 insertions(+), 8 deletions(-) diff --git a/R/archive.R b/R/archive.R index a3fa9257..28de4f93 100644 --- a/R/archive.R +++ b/R/archive.R @@ -104,9 +104,13 @@ epi_archive = #' fields; named entries from the passed list or will be included as well. #' @param compactify Determines whether redundant rows are removed for last #' observation carried first (LOCF) results, as to potentially save space. -#' Set to TRUE to remove these, FALSE to leave as is. Not specifying the -#' argument does the same as TRUE, except it also notifies the user of change -#' in the data by mentioning which rows are LOCF. +#' Optional, Boolean: As these methods use the last (version of an) +#' observation carried forward (LOCF) to interpolate between the version data +#' provided, rows that won't change these LOCF results can potentially be +#' omitted to save space. 
Generally, this can be set to TRUE, but if you +#' directly inspect or edit the fields of the epi_archive such as the $DT, +#' you will have to determine whether compactify=TRUE will still produce +#' equivalent results. #' @return An `epi_archive` object. #' @importFrom data.table as.data.table key setkeyv initialize = function(x, geo_type, time_type, other_keys, @@ -168,9 +172,9 @@ epi_archive = # Checks to see if a value in a vector is LOCF is_locf <- function(vec) { - ifelse(!is.na(vec) & !is.na(lag(vec)), + ifelse(!is.na(vec) & !is.na(dplyr::lag(vec)), vec == lag(vec), - is.na(vec) & is.na(lag(vec))) + is.na(vec) & is.na(dplyr::lag(vec))) } # Checks for LOCF's in a data frame @@ -491,6 +495,9 @@ epi_archive = #' @param additional_metadata List of additional metadata to attach to the #' `epi_archive` object. The metadata will have `geo_type` and `time_type` #' fields; named entries from the passed list or will be included as well. +#' @param compactify By default, removes LOCF rows and warns the user about +#' them. Optionally, one can input a Boolean: TRUE eliminates LOCF rows, +#' while FALSE keeps them. #' @return An `epi_archive` object. #' #' @details This simply a wrapper around the `new()` method of the `epi_archive` diff --git a/man/as_epi_archive.Rd b/man/as_epi_archive.Rd index 729ca870..77ae1868 100644 --- a/man/as_epi_archive.Rd +++ b/man/as_epi_archive.Rd @@ -32,6 +32,10 @@ apart from "geo_value", "time_value", and "version".} \item{additional_metadata}{List of additional metadata to attach to the \code{epi_archive} object. The metadata will have \code{geo_type} and \code{time_type} fields; named entries from the passed list or will be included as well.} + +\item{compactify}{By default, removes LOCF rows and warns the user about +them. Optionally, one can input a Boolean: TRUE eliminates LOCF rows, +while FALSE keeps them.} } \value{ An \code{epi_archive} object. 
diff --git a/man/epi_archive.Rd b/man/epi_archive.Rd index 7db708cd..ce6fa35b 100644 --- a/man/epi_archive.Rd +++ b/man/epi_archive.Rd @@ -132,9 +132,13 @@ fields; named entries from the passed list or will be included as well.} \item{\code{compactify}}{Determines whether redundant rows are removed for last observation carried first (LOCF) results, as to potentially save space. -Set to TRUE to remove these, FALSE to leave as is. Not specifying the -argument does the same as TRUE, except it also notifies the user of change -in the data by mentioning which rows are LOCF.} +Optional, Boolean: As these methods use the last (version of an) +observation carried forward (LOCF) to interpolate between the version data +provided, rows that won't change these LOCF results can potentially be +omitted to save space. Generally, this can be set to TRUE, but if you +directly inspect or edit the fields of the epi_archive such as the $DT, +you will have to determine whether compactify=TRUE will still produce +equivalent results.} } \if{html}{\out{}} } From ff6446365e3ad671299871a99061a9958f85990f Mon Sep 17 00:00:00 2001 From: kenmawer Date: Thu, 16 Jun 2022 10:22:48 -0700 Subject: [PATCH 33/96] Cleared up ifelse, improved printing and updated descriptions. --- R/archive.R | 37 ++++++++++++++++---------------- man/epi_archive.Rd | 12 +++++------ tests/testthat/test-compactify.R | 9 ++++++++ 3 files changed, 34 insertions(+), 24 deletions(-) diff --git a/R/archive.R b/R/archive.R index 28de4f93..bd6744d1 100644 --- a/R/archive.R +++ b/R/archive.R @@ -102,14 +102,14 @@ epi_archive = #' @param additional_metadata List of additional metadata to attach to the #' `epi_archive` object. The metadata will have `geo_type` and `time_type` #' fields; named entries from the passed list or will be included as well. -#' @param compactify Determines whether redundant rows are removed for last -#' observation carried first (LOCF) results, as to potentially save space. 
-#' Optional, Boolean: As these methods use the last (version of an) +#' @param compactify Optional, Boolean: should we remove rows that are +#' considered redundant for the purposes of `epi_archive`'s built-in methods +#' such as `$as_of`? As these methods use the last (version of an) #' observation carried forward (LOCF) to interpolate between the version data #' provided, rows that won't change these LOCF results can potentially be -#' omitted to save space. Generally, this can be set to TRUE, but if you -#' directly inspect or edit the fields of the epi_archive such as the $DT, -#' you will have to determine whether compactify=TRUE will still produce +#' omitted to save space. Generally, this can be set to `TRUE`, but if you +#' directly inspect or edit the fields of the `epi_archive` such as the `$DT`, +#' you will have to determine whether `compactify=TRUE` will still produce #' equivalent results. #' @return An `epi_archive` object. #' @importFrom data.table as.data.table key setkeyv @@ -172,7 +172,7 @@ epi_archive = # Checks to see if a value in a vector is LOCF is_locf <- function(vec) { - ifelse(!is.na(vec) & !is.na(dplyr::lag(vec)), + if_else(!is.na(vec) & !is.na(dplyr::lag(vec)), vec == lag(vec), is.na(vec) & is.na(dplyr::lag(vec))) } @@ -198,18 +198,19 @@ epi_archive = # Warns about redundant rows if (is.null(compactify) && nrow(elim) > 0) { - warn <- paste("\nLOCF rows found. To remove warning, set", - "compactify to TRUE or fix these rows: \n") - # call elim with for loop, up to 6 - for (i in 1:min(6,nrow(elim))) { - warn <- paste0(warn, - paste(elim[[i,1]],elim[[i,2]],elim[[i,3]],elim[[i,4]],"\n") - ) + warning_intro <- paste("\nLOCF rows found. 
To remove warning,", + "set compactify to TRUE or fix these rows: \n") + + # elim size capped at 6 + len <- nrow(elim) + elim <- elim[1:min(6,len),] + + warning_data <- paste(collapse="\n",capture.output(print(elim))) + warning_msg <- paste(warning_intro,warning_data) + if (len > 6) { + warning_msg <- paste0(warning_msg,"\nAnd so on...") } - if (nrow(elim) > 6) { - warn <- paste0(warn,"And so on...") - } - warning(warn) + rlang::warn(warning_msg) } # Instantiate all self variables diff --git a/man/epi_archive.Rd b/man/epi_archive.Rd index ce6fa35b..161f4a85 100644 --- a/man/epi_archive.Rd +++ b/man/epi_archive.Rd @@ -130,14 +130,14 @@ apart from "geo_value", "time_value", and "version".} \code{epi_archive} object. The metadata will have \code{geo_type} and \code{time_type} fields; named entries from the passed list or will be included as well.} -\item{\code{compactify}}{Determines whether redundant rows are removed for last -observation carried first (LOCF) results, as to potentially save space. -Optional, Boolean: As these methods use the last (version of an) +\item{\code{compactify}}{Optional, Boolean: should we remove rows that are +considered redundant for the purposes of \code{epi_archive}'s built-in methods +such as \verb{$as_of}? As these methods use the last (version of an) observation carried forward (LOCF) to interpolate between the version data provided, rows that won't change these LOCF results can potentially be -omitted to save space. Generally, this can be set to TRUE, but if you -directly inspect or edit the fields of the epi_archive such as the $DT, -you will have to determine whether compactify=TRUE will still produce +omitted to save space. 
Generally, this can be set to \code{TRUE}, but if you +directly inspect or edit the fields of the \code{epi_archive} such as the \verb{$DT}, +you will have to determine whether \code{compactify=TRUE} will still produce equivalent results.} } \if{html}{\out{}} diff --git a/tests/testthat/test-compactify.R b/tests/testthat/test-compactify.R index 0814ec8a..87ee4471 100644 --- a/tests/testthat/test-compactify.R +++ b/tests/testthat/test-compactify.R @@ -17,6 +17,9 @@ row_replace <- function(row,x,y) { dt } +# Rows 1 should not be eliminated even if NA +dt <- row_replace(1,NA,NA) # Not LOCF + # Rows 11 and 12 correspond to different time_values dt <- row_replace(12,11,11) # Not LOCF @@ -60,3 +63,9 @@ test_that("LOCF values are taken out with compactify=TRUE", { expect_identical(dt_true,dt_null) expect_identical(dt_null,dt_test) }) + +dt2 <- dt +dt2$percent_cli <- 1 +dt2$case_rate <- 1 + +as_tibble(as_epi_archive(dt2,compactify=NULL)$DT) From e9bd8ab328cd1898b5449436f2303eaa0208b1cd Mon Sep 17 00:00:00 2001 From: kenmawer Date: Thu, 16 Jun 2022 11:08:07 -0700 Subject: [PATCH 34/96] Added a few more changes on the vignette and with the warning message. --- R/archive.R | 12 +++++++----- vignettes/archive.Rmd | 12 +++++++++--- 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/R/archive.R b/R/archive.R index bd6744d1..50e03d2c 100644 --- a/R/archive.R +++ b/R/archive.R @@ -198,19 +198,21 @@ epi_archive = # Warns about redundant rows if (is.null(compactify) && nrow(elim) > 0) { - warning_intro <- paste("\nLOCF rows found. 
To remove warning,", - "set compactify to TRUE or fix these rows: \n") + warning_intro <- paste("LOCF rows found;", + "these have been removed: \n") # elim size capped at 6 len <- nrow(elim) elim <- elim[1:min(6,len),] warning_data <- paste(collapse="\n",capture.output(print(elim))) - warning_msg <- paste(warning_intro,warning_data) + + warning_message <- paste(warning_intro,warning_data) if (len > 6) { - warning_msg <- paste0(warning_msg,"\nAnd so on...") + warning_message <- paste0(warning_msg,"\nAnd so on...") } - rlang::warn(warning_msg) + + rlang::warn(warning_message) } # Instantiate all self variables diff --git a/vignettes/archive.Rmd b/vignettes/archive.Rmd index d3e74dc2..344ae1fb 100644 --- a/vignettes/archive.Rmd +++ b/vignettes/archive.Rmd @@ -147,9 +147,6 @@ object: * `geo_type`: the type for the geo values. * `time_type`: the type for the time values. * `additional_metadata`: list of additional metadata for the data archive. -* `compactify`: TRUE eliminates redundant entries using LOCF for efficiency; -no entry also brings up a message to indicate which columns should be removed -for efficiency; and FALSE does not eliminate redundant entries. Metadata for an `epi_archive` object `x` can be accessed (and altered) directly, as in `x$geo_type` or `x$time_type`, etc. Just like `as_epi_df()`, the function @@ -157,6 +154,15 @@ as in `x$geo_type` or `x$time_type`, etc. Just like `as_epi_df()`, the function object is instantiated, if they are not explicitly specified in the function call (as it did in the case above). 
+Note that `compactify` is NOT metadata and is an argument passed when creating +the dataset, without being stored in the end: + +```{r,message=FALSE} +# `dt` here is taken from the tests +as_epi_archive(archive_cases_dv$DT,compactify=TRUE)$geo_type # "state" +as_epi_archive(archive_cases_dv$DT,compactify=TRUE)$compactify # NULL +``` + ## Producing snapshots in `epi_df` form A key method of an `epi_archive` class is `as_of()`, which generates a snapshot From ba4f933d39d125a3b076805cf5bea3d2f2b7da0b Mon Sep 17 00:00:00 2001 From: kenmawer Date: Thu, 16 Jun 2022 12:03:14 -0700 Subject: [PATCH 35/96] Fixed as_epi_archive on covidcast to be more clear about what is does. Also, improved warning message --- R/archive.R | 5 ++++- data-raw/archive_cases_dv.R | 4 ++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/R/archive.R b/R/archive.R index 50e03d2c..7648ba8b 100644 --- a/R/archive.R +++ b/R/archive.R @@ -209,7 +209,10 @@ epi_archive = warning_message <- paste(warning_intro,warning_data) if (len > 6) { - warning_message <- paste0(warning_msg,"\nAnd so on...") + warning_message <- paste0(warning_message,"\n", + "Only the first 6 LOCF rows are + printed. 
There are more than 6 LOCF + rows.") } rlang::warn(warning_message) diff --git a/data-raw/archive_cases_dv.R b/data-raw/archive_cases_dv.R index ff005f82..330945a0 100644 --- a/data-raw/archive_cases_dv.R +++ b/data-raw/archive_cases_dv.R @@ -14,7 +14,7 @@ archive_cases_dv <- covidcast( ) %>% fetch_tbl() %>% select(geo_value, time_value, version = issue, percent_cli = value) %>% - as_epi_archive() + as_epi_archive(compactify=TRUE) case_rate <- covidcast( data_source = "jhu-csse", @@ -27,7 +27,7 @@ case_rate <- covidcast( ) %>% fetch_tbl() %>% select(geo_value, time_value, version = issue, case_rate = value) %>% - as_epi_archive() + as_epi_archive(compactify=TRUE) epix_merge(archive_cases_dv, case_rate, all = TRUE) From 146b99dcea765a7a24609b29c183ed19f6e7fa5a Mon Sep 17 00:00:00 2001 From: kenmawer Date: Thu, 16 Jun 2022 12:12:45 -0700 Subject: [PATCH 36/96] Updated. --- tests/testthat/test-compactify.R | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/testthat/test-compactify.R b/tests/testthat/test-compactify.R index 87ee4471..63197679 100644 --- a/tests/testthat/test-compactify.R +++ b/tests/testthat/test-compactify.R @@ -69,3 +69,7 @@ dt2$percent_cli <- 1 dt2$case_rate <- 1 as_tibble(as_epi_archive(dt2,compactify=NULL)$DT) + +test_that("as_of works correctly",{ + +}) From 716f8f3dc3d778d498d0ac5b99e52bd32c93e260 Mon Sep 17 00:00:00 2001 From: kenmawer Date: Thu, 16 Jun 2022 12:13:23 -0700 Subject: [PATCH 37/96] Added a test (not finished yet) to test as_of. 
--- tests/testthat/test-compactify.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/testthat/test-compactify.R b/tests/testthat/test-compactify.R index 63197679..b1f4e1ed 100644 --- a/tests/testthat/test-compactify.R +++ b/tests/testthat/test-compactify.R @@ -71,5 +71,5 @@ dt2$case_rate <- 1 as_tibble(as_epi_archive(dt2,compactify=NULL)$DT) test_that("as_of works correctly",{ - + # pls test }) From 3712481596e754d7cde22e787d691fa705c436c6 Mon Sep 17 00:00:00 2001 From: kenmawer Date: Thu, 16 Jun 2022 13:36:01 -0700 Subject: [PATCH 38/96] Added test for testing as_of in conjunction with compactify. --- tests/testthat/test-compactify.R | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/tests/testthat/test-compactify.R b/tests/testthat/test-compactify.R index b1f4e1ed..c80ad627 100644 --- a/tests/testthat/test-compactify.R +++ b/tests/testthat/test-compactify.R @@ -64,12 +64,16 @@ test_that("LOCF values are taken out with compactify=TRUE", { expect_identical(dt_null,dt_test) }) -dt2 <- dt -dt2$percent_cli <- 1 -dt2$case_rate <- 1 - -as_tibble(as_epi_archive(dt2,compactify=NULL)$DT) - test_that("as_of works correctly",{ - # pls test + ea_true <- as_epi_archive(dt,compactify=TRUE) + ea_false <- as_epi_archive(dt,compactify=FALSE) + + epix_as_of(ea_true,max(ea_true$DT$version)) + + # Row 22, an LOCF row corresponding to the latest version, but for the + # date 2020-06-02, is omitted in ea_true + as_of_true <- ea_true$as_of(max(ea_true$DT$version)) + as_of_false <- ea_false$as_of(max(ea_false$DT$version)) + + expect_identical(as_of_true,as_of_false) }) From 244341947524d280a9c72cddd04d1d826e6439d8 Mon Sep 17 00:00:00 2001 From: kenmawer Date: Thu, 16 Jun 2022 13:43:30 -0700 Subject: [PATCH 39/96] Made test more descriptive. 
--- tests/testthat/test-compactify.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/testthat/test-compactify.R b/tests/testthat/test-compactify.R index c80ad627..bb1dfa2c 100644 --- a/tests/testthat/test-compactify.R +++ b/tests/testthat/test-compactify.R @@ -64,7 +64,7 @@ test_that("LOCF values are taken out with compactify=TRUE", { expect_identical(dt_null,dt_test) }) -test_that("as_of works correctly",{ +test_that("as_of utilizes LOCF even after removal of LOCF values",{ ea_true <- as_epi_archive(dt,compactify=TRUE) ea_false <- as_epi_archive(dt,compactify=FALSE) From 6185b7836c375521954c9ffe9a98ba3d11102daf Mon Sep 17 00:00:00 2001 From: kenmawer Date: Thu, 16 Jun 2022 13:44:37 -0700 Subject: [PATCH 40/96] Fixed error with if_else --- R/archive.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/archive.R b/R/archive.R index 7648ba8b..3dcd9733 100644 --- a/R/archive.R +++ b/R/archive.R @@ -172,7 +172,7 @@ epi_archive = # Checks to see if a value in a vector is LOCF is_locf <- function(vec) { - if_else(!is.na(vec) & !is.na(dplyr::lag(vec)), + dplyr::if_else(!is.na(vec) & !is.na(dplyr::lag(vec)), vec == lag(vec), is.na(vec) & is.na(dplyr::lag(vec))) } From ec2960d05fcddb04f8eb8c613e656e144d78002f Mon Sep 17 00:00:00 2001 From: kenmawer Date: Thu, 16 Jun 2022 15:32:12 -0700 Subject: [PATCH 41/96] Updated vignette. --- vignettes/archive.Rmd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vignettes/archive.Rmd b/vignettes/archive.Rmd index 344ae1fb..6db9f8a0 100644 --- a/vignettes/archive.Rmd +++ b/vignettes/archive.Rmd @@ -154,7 +154,7 @@ as in `x$geo_type` or `x$time_type`, etc. Just like `as_epi_df()`, the function object is instantiated, if they are not explicitly specified in the function call (as it did in the case above). 
-Note that `compactify` is NOT metadata and is an argument passed when creating +Note that `compactify` is **NOT** metadata and is an argument passed when creating the dataset, without being stored in the end: ```{r,message=FALSE} From 07808fec3da124fe69f71d3064d0f410af37b4cc Mon Sep 17 00:00:00 2001 From: kenmawer Date: Thu, 16 Jun 2022 15:38:47 -0700 Subject: [PATCH 42/96] Updated warning message to be more clear. --- R/archive.R | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/R/archive.R b/R/archive.R index 3dcd9733..166db32f 100644 --- a/R/archive.R +++ b/R/archive.R @@ -210,11 +210,15 @@ epi_archive = warning_message <- paste(warning_intro,warning_data) if (len > 6) { warning_message <- paste0(warning_message,"\n", - "Only the first 6 LOCF rows are - printed. There are more than 6 LOCF - rows.") + "Only the first 6 LOCF rows are ", + "printed. There are more than 6 LOCF", + " rows.") } + warning_message <- paste0(warning_message,"\n", + "To disable warning but still remove ", + "LOCF rows, set compactify=FALSE.") + rlang::warn(warning_message) } From f6fde05d35afba7544185e0522edcb0724a6d8db Mon Sep 17 00:00:00 2001 From: kenmawer Date: Thu, 16 Jun 2022 16:42:26 -0700 Subject: [PATCH 43/96] Updated comments. 
--- R/archive.R | 3 +++ tests/testthat/test-compactify.R | 10 ++++++++++ 2 files changed, 13 insertions(+) diff --git a/R/archive.R b/R/archive.R index 166db32f..f7d00b09 100644 --- a/R/archive.R +++ b/R/archive.R @@ -177,6 +177,9 @@ epi_archive = is.na(vec) & is.na(dplyr::lag(vec))) } + # LOCF is defined by a row where all values except for the version + # differ from their respective lag values + # Checks for LOCF's in a data frame rm_locf <- function(df) { filter(df,if_any(c(everything(),-version),~ !is_locf(.))) diff --git a/tests/testthat/test-compactify.R b/tests/testthat/test-compactify.R index bb1dfa2c..a3393e48 100644 --- a/tests/testthat/test-compactify.R +++ b/tests/testthat/test-compactify.R @@ -2,6 +2,8 @@ library(epiprocess) library(data.table) library(dplyr) + + dt <- archive_cases_dv$DT test_that("Input for compactify must be NULL or a boolean", { expect_error(as_epi_archive(dv_duplicated,compactify="no")) @@ -20,6 +22,11 @@ row_replace <- function(row,x,y) { # Rows 1 should not be eliminated even if NA dt <- row_replace(1,NA,NA) # Not LOCF +# NOTE! We are assuming that there are no NA's in geo_value, time_value, +# and version. Even though compactify may erroneously remove the first row +# if it has all NA's, we are not testing this behaviour for now as this dataset +# has problems beyond the scope of this test + # Rows 11 and 12 correspond to different time_values dt <- row_replace(12,11,11) # Not LOCF @@ -77,3 +84,6 @@ test_that("as_of utilizes LOCF even after removal of LOCF values",{ expect_identical(as_of_true,as_of_false) }) + + + From 7c4d9b4ebd635560679d6436aee6e0c5278aa7a7 Mon Sep 17 00:00:00 2001 From: kenmawer Date: Fri, 17 Jun 2022 08:59:46 -0700 Subject: [PATCH 44/96] Fixed spacing with tests. 
--- tests/testthat/test-compactify.R | 5 ----- 1 file changed, 5 deletions(-) diff --git a/tests/testthat/test-compactify.R b/tests/testthat/test-compactify.R index a3393e48..8ccdeee5 100644 --- a/tests/testthat/test-compactify.R +++ b/tests/testthat/test-compactify.R @@ -2,8 +2,6 @@ library(epiprocess) library(data.table) library(dplyr) - - dt <- archive_cases_dv$DT test_that("Input for compactify must be NULL or a boolean", { expect_error(as_epi_archive(dv_duplicated,compactify="no")) @@ -84,6 +82,3 @@ test_that("as_of utilizes LOCF even after removal of LOCF values",{ expect_identical(as_of_true,as_of_false) }) - - - From c042e1db42fff7c7b687f75d21e3f78c9f0d6e6d Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Fri, 17 Jun 2022 17:27:07 -0700 Subject: [PATCH 45/96] Fix a missing `dplyr::` Avoid mixups with `stats::lag`, both for humans and for R (we don't have `@importFrom dplyr lag` right now). --- R/archive.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/archive.R b/R/archive.R index f7d00b09..842c07d1 100644 --- a/R/archive.R +++ b/R/archive.R @@ -173,7 +173,7 @@ epi_archive = # Checks to see if a value in a vector is LOCF is_locf <- function(vec) { dplyr::if_else(!is.na(vec) & !is.na(dplyr::lag(vec)), - vec == lag(vec), + vec == dplyr::lag(vec), is.na(vec) & is.na(dplyr::lag(vec))) } From 8f36d44c2379a847da27a642e335eee7a4b17c48 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Fri, 17 Jun 2022 21:19:46 -0700 Subject: [PATCH 46/96] Fix warnings&errors in examples&tests, keep archive_cases_dv in sync - Fix unmuffled warnings and errors in the tests and examples - Introduce the S3 class `"epiprocess__snapshot_as_of_last_update_version"` to selectively muffle this type of warning. - Fix some things that only arise when running checks or examples from a fresh R session / one that does not have epiprocess, dplyr, and tibble loaded. - When re-building `archive_cases_dv`, use `compactify=FALSE` to not mess up the current tests. 
- Don't store `archive_cases_dv` directly, as its R6 implementation will be stored with it; instead, just store its DT and reconstruct the archive object when requested (use a promise and mark it to @export). --- NAMESPACE | 1 + R/archive.R | 6 +++--- R/data.R | 5 +++++ R/methods-epi_archive.R | 26 +++++++++++++++++++++----- R/sysdata.rda | Bin 0 -> 2196 bytes data-raw/archive_cases_dv.R | 15 +++++++++++---- data/archive_cases_dv.rda | Bin 40398 -> 0 bytes man/as_epi_archive.Rd | 2 +- man/epix_as_of.Rd | 20 ++++++++++++++++++-- man/epix_merge.Rd | 4 ++-- tests/testthat/test-compactify.R | 12 +++++++----- 11 files changed, 69 insertions(+), 22 deletions(-) create mode 100644 R/sysdata.rda delete mode 100644 data/archive_cases_dv.rda diff --git a/NAMESPACE b/NAMESPACE index 11f488e4..bbed96cc 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -20,6 +20,7 @@ S3method(tail,epi_df) S3method(ungroup,epi_df) S3method(unnest,epi_df) export("%>%") +export(archive_cases_dv) export(arrange) export(as_epi_archive) export(as_epi_df) diff --git a/R/archive.R b/R/archive.R index 842c07d1..4f137f39 100644 --- a/R/archive.R +++ b/R/archive.R @@ -196,7 +196,7 @@ epi_archive = DT = rm_locf(DT) } else { # Create empty data frame for nrow(elim) to be 0 - elim = tibble() + elim = tibble::tibble() } # Warns about redundant rows @@ -280,7 +280,7 @@ epi_archive = Abort("`max_version` must be at most `max(DT$max_version)`.") } if (max_version == self_max) { - Warn("Getting data as of the latest version possible. For a variety of reasons, it is possible that we only have a preliminary picture of this version (e.g., the upstream source has updated it but we have not seen it due to latency in synchronization). Thus, the snapshot that we produce here might not be reproducible at a later time (e.g., when the archive has caught up in terms of synchronization).") + Warn("Getting data as of the latest version possible. 
For a variety of reasons, it is possible that we only have a preliminary picture of this version (e.g., the upstream source has updated it but we have not seen it due to latency in synchronization). Thus, the snapshot that we produce here might not be reproducible at a later time (e.g., when the archive has caught up in terms of synchronization).", class="epiprocess__snapshot_as_of_last_update_version") } # Filter by version and return @@ -525,7 +525,7 @@ epi_archive = #' #' @export #' @examples -#' df <- data.frame (geo_value = c(replicate(2, "ca"), replicate(2, "fl")), +#' df <- data.frame (geo_value = c(rep("ca", 2), rep("fl", 2)), #' county = c(1, 3, 2, 5), #' time_value = c("2020-06-01", #' "2020-06-02", diff --git a/R/data.R b/R/data.R index 06ae93b8..0222d98b 100644 --- a/R/data.R +++ b/R/data.R @@ -41,5 +41,10 @@ #' #' COVID-19 Data Repository by the Center for Systems Science and Engineering (CSSE) at Johns Hopkins University. #' \url{https://github.com/CSSEGISandData/COVID-19} +#' +#' @export "archive_cases_dv" +# Like normal data objects, set archive_cases_dv up as a promise, so it doesn't +# take unnecessary space. This also avoids a need for @include tags. 
+delayedAssign("archive_cases_dv", as_epi_archive(archive_cases_dv_dt, compactify=FALSE)) diff --git a/R/methods-epi_archive.R b/R/methods-epi_archive.R index cedb0374..845acefa 100644 --- a/R/methods-epi_archive.R +++ b/R/methods-epi_archive.R @@ -27,9 +27,25 @@ #' ``` #' #' @export -#' @examples -#' epix_as_of(x = archive_cases_dv, -#' max_version = max(archive_cases_dv$DT$version)) +#' @examples +#' +#' range(archive_cases_dv$DT$version) # 2020-06-02 -- 2020-06-15 +#' +#' epix_as_of(x = archive_cases_dv, +#' max_version = as.Date("2020-06-12")) +#' +#' # When fetching a snapshot as of the latest version with update data in the +#' # archive, a warning is issued as this update data might not yet be finalized +#' # (for example, if data versions are labeled with dates, these versions might be +#' # overwritten throughout the day if the data can be updated multiple times per +#' # day; when we build an archive based on special update-data queries all made at +#' # the same time, the latest available update might still be subject to change, +#' # but previous versions should be finalized). 
We can muffle such warnings with +#' # the following pattern: +#' withCallingHandlers({ +#' epix_as_of(x = archive_cases_dv, +#' max_version = max(archive_cases_dv$DT$version)) +#' }, epiprocess__snapshot_as_of_last_update_version = function(wrn) invokeRestart("muffleWarning")) epix_as_of = function(x, max_version, min_time_value = -Inf) { if (!inherits(x, "epi_archive")) Abort("`x` must be of class `epi_archive`.") return(x$as_of(max_version, min_time_value)) @@ -74,10 +90,10 @@ epix_as_of = function(x, max_version, min_time_value = -Inf) { #' # create two example epi_archive datasets #' x <- archive_cases_dv$DT %>% #' dplyr::select(geo_value,time_value,version,case_rate) %>% -#' as_epi_archive() +#' as_epi_archive(compactify=TRUE) #' y <- archive_cases_dv$DT %>% #' dplyr::select(geo_value,time_value,version,percent_cli) %>% -#' as_epi_archive() +#' as_epi_archive(compactify=TRUE) #' #' # a full join stored in x #' epix_merge(x, y, all = TRUE) diff --git a/R/sysdata.rda b/R/sysdata.rda new file mode 100644 index 0000000000000000000000000000000000000000..f9a80ea5d8ed0db64dc72f037224fd98761064c6 GIT binary patch literal 2196 zcmYk8i#yYcAIHD5nPF<~HkTH&7^0aYx7utnqUJ6kmtk(@RynF;m_@pbI9Tb-WpfFq z94EQt7IHVYB~plrDD_LJTz-1a?>x`%^*o=?`}2N3??2#U?Zq%VKsteSIT3tPbsdn5 z-~N9Rcg)sy%g<5e05$?CWZ>YA|*l#S&w z)<8TB!L>%73l+4d-hkx}S2M_@C|3VP&XYVKf>OT%l1g_Ya z5XP`Vc&0#WY*>&oCPczhh(Z)YC?f+Vm6iw@qgrS>(2W8JLKGa!gkEKH=^9i93RH{` z2K?6A+)Q*9HxiQrI5lXP7zJSjNkKh(^J%p>mQPkyXRdiF&MitLZvcznz^scn#pliHank$vXbcBY z4JC(|P;PD!XB#V+G=(r-wam5{t^7rl6TUK9 z$a|S99=5kEj6c7rPI%J*sOHM<0(jBffoMm(ac6xPKS(@z{W`S^dD6939-Y6 zzkY2SngHa1z*J<8Wif36o9Vb_0LhaXm`dvDDS@waNi$+d2ZXBb|=N@khd05~a z=xGrcw)kd?=;onInfO*$@K*W7{rYnh<1mS1}M0BN?q=Uu6T3uyaj5U3l zj+59sbYgAfY1~Rww&RC5-}77FoqPqcJnH4bge5Dz{RK)75Sh&`>tH}0(Cg3Lq1CjxBG7L>s zfr&w3Xloe2Oh$Vml2y_WY1U8>lbCfV7J!>##BD6WL3+a;bA{)gUtW&<92ogszJ@|_ zpS`5-Us7aejGLS@QGh@)Sla%G1{&apA8+qf=r|kz&Ps*)M<~8InCAOgKw3N8N?U1; 
zhm}`p&!-s|riBpJ$4U>%7G0G4dS)(IdA!4F*&iWDmsflat73Ge!_sZISYK9HWv%l$xd1c6;`Yb%nLigC1YJJ(%o)LuA z`kZbw6E|-+3@M+Opq65H!t21wY*IIpa>;|548T)7Y+lG2wic zo^ZbIMFU^2KsZ6AVl*Zq`j!vIc#7 z1PgJkxhygYO7z~EHaXRr{B6eaS#5`eZN%e1w*%@=Dz2MNzUylD8ygY4zCE+be>L!I z_4?3UX{!9LN39gaB;XTBVwIY`2FUT89!t(_RVF>J9qGHdPRoqPV7sJ0u_U!T!O!(R zj928w{Jr?3b}+E}L8fFHxC;)$h6grPE^;!VII0xl%bsNghqm9{rd*C_J|vfJRzKF6 z*K#$mNWv7%)i+pvUe%^sOM(zl_iOI9R4-Q*kUrl)?@9t^Q^+y^n8ELKrA%j11@GA1 zBAuJq49^MsR{yhpB>kY`2!KsLR}rtAYG`3@_R!W%;-TseT4mJYmvNgIHSc7MfOJ-(~`FL5Vez&3E=s@IL z+Q`LIayL2}z4*)*p-lq;RpmK_tjyb~b8{adMd*ZShDa(#2&Qf#fj)_oy2(T$3;K(7 z7Fv#wj&@M!5hLAUaxm(y7qsLdEto%-F>U*_IVHs1AT&oNbe|Z+qDlLzU-$U9lgqYp MjLktB=jRvlUyHHFfB*mh literal 0 HcmV?d00001 diff --git a/data-raw/archive_cases_dv.R b/data-raw/archive_cases_dv.R index 330945a0..74448901 100644 --- a/data-raw/archive_cases_dv.R +++ b/data-raw/archive_cases_dv.R @@ -8,20 +8,20 @@ archive_cases_dv <- covidcast( signals = "smoothed_adj_cli", time_type = "day", geo_type = "state", - time_value = epirange(20200601, 20200615), + time_values = epirange(20200601, 20200615), geo_values = "ca,fl", issues = epirange(20200601, 20200615) ) %>% fetch_tbl() %>% select(geo_value, time_value, version = issue, percent_cli = value) %>% - as_epi_archive(compactify=TRUE) + as_epi_archive(compactify=FALSE) case_rate <- covidcast( data_source = "jhu-csse", signals = "confirmed_7dav_incidence_prop", time_type = "day", geo_type = "state", - time_value = epirange(20200601, 20200615), + time_values = epirange(20200601, 20200615), geo_values = "ca,fl", issues = epirange(20200601, 20200615) ) %>% @@ -31,4 +31,11 @@ case_rate <- covidcast( epix_merge(archive_cases_dv, case_rate, all = TRUE) -usethis::use_data(archive_cases_dv, overwrite = TRUE) +# If we directly store an epi_archive R6 object as data, it will store its class +# implementation there as well. 
To prevent mismatches between these stored +# implementations and the latest class definition, don't store them as R6 +# objects; store the DT and construct the R6 object on request. + +archive_cases_dv_dt = archive_cases_dv$DT + +usethis::use_data(archive_cases_dv_dt, overwrite = TRUE, internal=TRUE) diff --git a/data/archive_cases_dv.rda b/data/archive_cases_dv.rda deleted file mode 100644 index ce3e0ce34679a412812cfca04092a084948c49dd..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 40398 zcmagFV{oQV)IRvcwr$(CZBLSkZBA_4wry)-+nLz5wfVj8f46G)%U*TbRb748eNLaN zPge_Tn{zPEDj$g*PAI0UO8)FuVQx!I?7~!WC%t zgjJD}8Gzmg4GlsF08kB_1OcGAF3>IJvM!(jEv)sOPMQ7#WhY>DCW1dxgGwv4SP@s+ zD4L_5wC3U;uZ;e5Li~)t&#tW4xrH_a`I!USS@7h;3cRzY7QjIMZLZywx&l1%9{>== z*r_~dyO!!1RuUql5f#o$M*yHa0B{i%6=+gnoM)t)R$&Ee{wR7MEBJy?9UR{|;Mz1K z#}B~WG-8L{)0IC3K6Mk@WR zIi-d39zp9Uv)D)h7U_%%stdF8#p6LYOiAksqN9db34F3pVa$-ql0;b8 zgR8VRbklM2}5~pg$k1$>< zLqin?0%C=nw}E_s_P{3331GHlFMlh|h?`Q7IAI05$-N^)sCu>5JotR5f;*$gf|PvJkpl98N_S^w6bq>v=Y#%4njqu*6JBj*UH5 z0FTVs@%}9{8y;UinzYgSydr({e(PYq1u>zb>(aB6N6q!50I__8ldE??)B!Z>gp|^u zAnX8&WHN5dCumoY%vLoBu64-NMdlPV{(uSm`al#vlvWZX-e+lJKQZ}$1QQhR+aS)S z6f1(ZuqxJVBnz&qVUdh=PNevnkLvHH7oScil8b$9q7_>fJGry=xnddXzV+xore7gA z8nU(;3oPK)lQu3-NMd+g$IqV;JgDc92x8#fqGC$e-a)?@!qM|&aom01?_ z>-i$&U)^1Mk8#OlV!H`GbOGhDP>reUnwmnOdaO|%Y%s)G(kM+Q$pSt+9?-QWHvmH7$c4c z9qaKL=nQkxl=*`fts)N<91bixFdaSC|2*o7+O!=f^mU~BJJd>v zvA;QvNQBgtMtq%ZizHlmwzP^y-<=Q(4yTy#Dh+5{PjzRq7^)4@Z!_BDgMrFLVe?k6 zDLhR@E<@-;Lb>ssIp-ptDg$agC-RWQ(1iKVr$x1;RHt|A3)+;hWAkat0yw z5fb(6chtI#6iFsTx24n<%UdSOCFTcKw$fH=ZZC~h{Ds8Std^`>L3YZ9;B%o&sH+0& z?_XO`2Ne~Stw}M1({5}ODv?(|8SaFNMT?Da`j~;=z-z~1Z(K;XyclV&Zm6y*3pXSd zLotQE&DyEWFRNraF*xIMOn=a#t3gX^#lu?G%eoJr z3y5-7;)4Mfk3h9tFj-7x|A)H94L)N-dYonuZl9A-P8kqd<_!V8hl-PYn$Ch2Jr1MM zZ07SCqP~~yjj^cwN2^8sL~)C{#f$5UAbvLw(XKx`!0W-&rQXTOjVGH<&SPb=wPeEv z(;%94E(9|~I`K`ySV)XkU8S{FU-3BSO2Y8S+bQwJ?kTTtg7#|Ep~WoO1TI$FLusHf 
zL7U~A`iXCZYQY9mYx#m)TW%q|qES8P;DJ`2!Evb6q&pTZ2#Z!7&O-q)^Bt^>>!p)|eS@X{)tk?N%#Mp)r{>3z|*V*vKa3E`RCWQdJ<6c=~ z>kVr05|!gl!W4|Og$}bLktM=`OEfk5DYa4k^s*Mzto`o#0H1OtmltwX6#|u7XAFvQ*N799uHe7+p)na(zopH&{jnv_Tqy=5b!W*7M`C_@aER zV@XQWlB>1P&fZq^;w`!+9$2g3`)#i0uULiqPXV5(OI|$BT$bY(x41Wk!6P9t zeuwg?@+9fNG&~3agzzCrsGFAxFUtWIz=)o<@LyFY%6I#;-+$3@4y_1OtCp#sBgL+3 z8<$m!>GqPg47YWDCt3UjI8fUyEq z&PH}IS9j*3c9%g2rU?oxvshM27UGC8a+ ztNKZ@ArWHB<&xmFAw8AL!#so1f2ExD3f@~e5UR{vi!``Xx*J^9@I_Y2<<OX^MZ*aa&wUhCTjl z8?UC=W0#wyrHSH<%4i&`GW7>aV?l`uLK>FIGY-TM2?F0=l8<*sNKeer{{srgt3*wu31>BFqu;qni|u-=e4+PJZeD#TjAR}i zdD7>~#54nuVy~|iyhYRFJmbQ=n@TEgmh!1kos;=Ro<-L$g)~E{`NmB8twi{WijYNx zpiP!6H#u7O<|H#;Hx*yIkY;%uXBZneYE$kz)UD-R?xds=wnGHon?J{ss=CeGu(Qe= z>v2{2A@wQjaWM}|@&TKgaDPN*c=7+di;eO4k|}83;!|T!I=;5jAu%;o>$zreAC6L* zu1)yX%`X18>xpCjW`QvcG;9?vp=q7wfi6W=f~sl0o9!rOJy}){)5R!N`NK;=zNwk^ ztA%=2yAeuTDhXFh9lq-IxRw^FM7Na^d;OR`!h{|~Q3-T`izThZBWj`CSHNenimOWv zT5^YRK2A|B^!3f~K85W8&y_@ub>$XcoJfn91gY9_^}L0_^U1sO`t#ZD1s5M<`^85f z_4v*HB}ufd!$64DG-djj&WLOw!RLBLlqDA02l7#gx7IFlHZM)*Vdwub=A zkEuYw7K}Nm7KGSZ+Cm-Uc6avrI26Hv68D%9zVl zv=mcszVcqC@RaT0xx(2P0SbQl3#%;!EEh&u#bI5k*j=;vvF$B#Svfc~G_=Es1W{Dk zdQl}+R5DOeR5elin3eOVg|QF$uOqmu zxx2jaBxWA#_{cdidy6*$N2Z&E@v~kg;B5M1m?vCiF$TQi5}Is7b-l z?)I(>wYLr&w$n(rPt)e`txkYX(y{}dk}$%G04ieb0Q0m8lwNQM72F)ly3GDT5hB|x zE|)NswdFvmIWAT=JVs!xa3L(_uKr6G!LZ-I`J)ou=|?`ov9 z;Kcw3tm)K13FYy`c-i!-S(>q&AG=IHQb~Be!7$f0w?7xa5AXtfm4xppkn+ZT3i9`HfSOGk-1!;v^?J+3`V$xBO=%syR>Z*=7P zK42Kf5rPI6eER@yBNtycy1?!Q=FK*Eul!eoiBo$~gNXwOT`D6Za|eR$Ke*3d^9k+A zDLTlv-$)_-a&o_K+-bp%Hf_9Gv@e#m{^-C3i1Tx^>&fWkE{bCgvnPd4VOPPAea&L4 z14S9Z(<_Bu#%~=^Ywt0^h7cfx4(#5%c=6-i-rlUI&WS4Fha?7W+L&y-#?m&7S5!Tw zXqlvZwT?dVD~!DgkpDFPR~8-kGFap59Dhwb-tQhRnHLteC#n9Lc1_He`VQTmM{Vk` z6q2-}|C$ERgOz`Q%=0Dm2D}M%;QR3t4Gf7~fb2t@Lku9)zYD%ue+Tjx3Z&=s@%8!k z{Q>O1_D<~9f`5|5@%(lGT=|ocf+i)UddSN{QPGgyyLI+@e|tO?+`W3`Im@vURE?nB zaMVcm=`5o46$ASHzkovE$mALOZ2boOLqR|y5(qGvqApW7Tsk&wtI@$>nHm#+5PSb$ z9*2ne;dLZJ6gO(jj0GDa?C8ku_yZ>@uB5E2mr_!a{d2X7CCg^*nmxvrOxEsL-MqxC 
zp4+wov4GKhFegZv^HnCrsjyR1T4uA`r^jwPom7F@p2^PeKmQAN3m@;436qA69NGUb z)Pe>F2Zvk^e^;WQpwmSa&h7wmdQ0DX?LIo-IYV(BxRcu7I)0#KznqcfFfcJOF@8V8 zd%x*Zvox;u-pu!diEzhlOyXhu4gbD>g+cZ)Nd5oJLRzu^SpeCxwap5%{{Y!oHp6%+ z|HGe|T#nfy@u%@WB5VvLgbzAaBBxakl_(o=*t9HU!`e1Gx+cH?Z4H*MZ8WspIIV@z zfWZr+*|dlWCCDSFeQ5R*?`hctoviK~Zb5`G>-ooS`5F1roa!0;_pLGP?;aD{1rG!~ zE~mrohl_bv2tQ0d7#J9IRO9DOcKr@Ap?jDs6crhTe(lQnBUGRC{uLlRBv5RFe2&h9 ztkuAyS`D!P1_obKNrN$eXm*eK6&n|v0tCg|MnHh)&gP!}LY+zzi;IgBf)1P2;^@}H zxm>weI+&%8CVtiaA(DTk|Xcmg8F1`&pTJkiDi?44_q$I1UNH_5Q`KrUG42xq0YUg994Osl>W`iO5x@>9 zv0nYzK1&kLYq}92{$37f9KYzZWByQW*Rpe#DvD#QA*C#pX$%m~X>mzSvPszi#z~s9 zBXsgiOw7Jkx2V(#Z(LHIPoH;+MxA@##^T~!*!kUoP#NhE^o*ZLc6N3$C!GAB7j%7U zo2+qmd4r0jU4hxaX7*PTCe1attym6l@tR}=!9m@}KqRHHxw)$W<_`G3VXe&b+UbOZ z1O)z5yTbSHOD~1+t-Ho!ryh^#`>VBcjmlF`T`xx|(uDs<+?qeC?WkkjI_o$4#TL)3 zo@d({BKxYAO{UJ6v=7%qnma1!OFA;72Wg4>Nv)nxux<`Bavd@^y)~ka1PK1jex!t( zzgqmiV6IVTZ-{Xx1x%=Ome!TULVfH~@(-eFb&5@{Op?%i2+l+BuQ@{GR&zv=$*P4B2^MBZ#B04(8 zRu8W1TpmTIHa1r%?U{S`p1GkArL*_ixA%9q7uMDew|8?EF3B%Bt<^U@&;2)DR8}(b z+1$>j3#Ee}cM@G3P!xXBLOo-|!>Z^y;S}hiCd9tli z>9m{w)6Qz`W~!x*sSJr`r^?)#gHD`nR)=X;s7k;!0P7*Mzva{dL_PX zOI$9P$B%!WN`oD2wiq|6k^TG4bDwFC{&>{|I!_Kq)Z+g4w_&$FT74iotslu9FWkcC z^W^r0i$`O7*Jm^zAGZhnuhmvu)mjEn1m63aj%#6qyI!X&=P!ZEtqhBne+=G}>>;x# zed{3RFsh!H{)(FQlk}S(FsZC?pnuC4%@E&Ko5Ewse?z_KGe9(6yz|?Y=km8yxZCfF z;qmKMr(hby6T9B z)~ii*8>-i`Tq>Q(HBmEr#r>u8{{CP-_u~gYWyU$i4^KDIp!4lej%;D}pF{5=1K&S0 zCW#KWy2C2NEwm(+pcT~wIhf+s)(%TdO-&s+I=AHi#=hcwq%AL1@ff@k&f4U!q}$}LXh&NSEhkM7!{(aaU!m2aPUnx_@TGY@ zxR{K;JAXEq)?W_CS$1&w&{QI@^b6-cbeGZI3jVAMaql=s8yi#cJaf-%?2NSA{gFVBE6Iiqru5?C zV%BLXGU=q#FM9pf_W#i(my7-n#>Gw21ftR(ILSsQ45w`)tWC?JUzw2O2J&-jgy7nP zG7ZGyE29L`rIRMe=jgK#KJxgK7WVNG_@i{<;9xYF&*i@oIdpWwk^JyF>C|+1+ZEIV z$)E&hR{~#aO?55e?ewf>i?OieO7#XB=;twWtv0BwgQ%HdU!&-527;B&MYZC4lZjON zUs_E2BQdaO8Szz}8y%-Zv<(wvQt$ZQKTEw=FiG)M&B&4nlo=HTDh@Sf`j4cdvzp^- zXMyg_NQ~ zyHTG_p!}&f;PT(;5>Nv{tILY5=Bo zvG>d)lNaCUdL~~$zev8GDiQkf6q#SJmxvje(o(|D-T*DAb;xYsu#M&)I$Z$K$=fzT 
z*dLJXM{b`EZ(3a(EW7RT1V+Qr1=$}pU|nE+>n0$*bd?!-Os7f7ss`L?X<<7*1}yqiPV7KPB)LB|u z`j4M4X9YkK27vl9WM=l3Og96`r13s8QHYjBV439I7>|4Wc0{fv{g<3T!C9LGxA$`g zfjq~phbzqBz3^VCOD|+(6>*v3P{|xGKN5@2>wa#e5)ygMdp{$ka&dRxY^l1xwN|ro zvy|MxNWECDTuQ4lcsCX^uynRxZ3GqxeAE0Zsf6+vu~{0#gpiPsY!A6usQGa78o_Tv zFWK_NeygkMWoL*-+~lO`C$tyIztMR7)feFB(7HLuKD@e)WP_w6SZ@QfcybAa1C=7D zg_RZ{qL0wo6TAKF*||^|D?hEHD-V-da79b6>$-PdRrQGS*B9O`!}Nbx$B&JMQXD^g z=$}8dCubBf^diou-?Ar>R`+p20Cr$xO4>AAUi%M%+NbSFf0_O|h6(uE^dufF-)~|9 zaG4-SvC-@f+><p9%-Y}mYIKnZ5E1rh=V`*gh$;RA@Uh8f{QN4{cN?u{J0{OP zOuTWtehgZcL+F|YE-TrQ=rk|&*Mc96e?bw29^QMc4ju3AhW4K=pEBRktu;AubrJJR>jHlP2T2Y)fXjqW+ki9y4XRTBBYsfS1m2 zBd7r0F1^pDkZTvR<~lke_4$S+$j_P4DeirY@%Qt?WAmRgF9n&Yhi*EmI{J=tg{7sr z%JQmhRD{1E*FEpoV@{XeI$e+LclIK6+s1jthot}MxP)M$Y>jmDT_ifUoqgzw1pI2m z3!y~A{U*fcba$njs_Y>xo)8u)d(&; zEpIbTVp76HqplvZ#tHH?^;9Bcb3a$`MuR5#$A&9ZufCTd2;>K3!KLR#OhEvg@mL}` zZxwaJ>WjOtU1fSnM((yeBc{qVD0E(U5XyyV8qMBJ$6zXnzMNJIXUlS+>&U$H#!9Is zd~?=j(YtdMV&`lm-Wx-ItfZnsKXKd)z={HMTBI>@x*=FU%P294zcS?M7L(rCcJA<4T~Pz8(9?yPwMkVrp=0x62wH%M%LtY| zpq7)CjXeLz3*Fqy_VyfMNF$~HVhQZqp}p;Rg=vQF?iwV!tpU-Yk_w?Fy;IlIZM~(Z zMJyBvoSU+5whN`E;y)xB+af1a;Gy~%Uy7S<@~8QJ0PJRF%V`>>F0L#PfzQTNvRV+1EAAd(?PbF}=3?ZLG zLd)7Gm2>~ZFmgsNAC`yGgDJS?i@;sf{?P+3H&eH=_3VqydZC3-wV75+LQOAGmw)Oh ze14Tq+_oN3NfR3j@ifP1p^n#Qw>sVHlZ;`GqWd5)uxC>bi}Ypcy~J<87YGE8u|=w3 z4!`%NLnt$~(XRuWk=mdVkfbz@&rLhgb-YR|fA&_M@L%UQqNclo)6!n_JS&8N`y0O` z1}@WeEd#v)eK7eNkGU*snMI|un0bcAUp#j7&|f>{?`9Oc->#5yi%bAx8Ay&A5`wya z)&q(q@25Er5yC+=RyaOz1vnnRMl4KHGp|Dv0=MoB zwK#16>?Sw*@8~BtI_0mCgJCw~kWdKnJiy9~(p0sTfTu4*?!{M`mtHjZ;YK0mu=CYW zc)L5(%eFp*_wwUA2}6y}Qy3CAU^pswgNILHKeENe%ZIDida37;sPhB27b^Gg1rzH* zw3_H?44*T0r?T8H-55~|CF37aS^}yVq;FniAF~|NAFj28J;IA{V|>Ll*wg+@0$g>b z_bh70E9glK9zCpaME}vlTOOPtWdD=__3Dk5Rp`z1r&rTe%cXxCu9r(+AB~!pO9QTB z#7fhma=GxfU-Y#Tjt(GMz!hN;8vaBy zZ42B+0IZhn{GM))@o@8a^<2p~s7~}>-_6pfEtNW04OMEfOAm`>?s&D0C6ek=$AM)GA6Ex-o&*huFKZ5Y@g7>I}A*( 
zV!r6uilqf5K#1C_yQ)hWQYiJV?S$M=t()m}4FMJ#BI-vVZQ_g|!v$I|oSn~$8n8qJPk`t=G>|aDml#gAcHG}xe3(~k0=ANl+fM+goL{5r#@Kp&= z?w@8DoMl||!tSo!$?;r0++Zd9o8~Y}#Usszp6yP8BtRozh3{tx`}H0RR`nz+1F=Te zig{Z>HGN2~4c4;@KZubb>WM{jn4N8t{5=fL+3Gk z{ADA}Xcpt|;r~saieb`XQrgF*Aa0gS+tcoBstuffrz3EAqb3vwZ@b2Wqu4XNJh4Z^ zP7g1Wle_faO1?nmKuoIS-&0m8wjRKq*naudWf=OeLAcoH5Q%w~WN%U+{4>1pKT0sY z=m5)PjFy!_2m5r|t$ZkDPT9*y7mHrr^c6yR@hUEtLfQFysZ1nnrRxQY;sQ@CT|UHF z0fPYqOvwnxZhNrK?i&jR@>VDjAVkh;bZ>!}&&xcJ3xYlmJ4NDYi`--}+5$mAi8!$p z{Al>%ErB!uGbuO@MZgWdTn7eZ&%+ob!Qp*yNd}wBKOX|1C@soclRKI&hTR(GC`vE^%mYzTn-0X=m zNxe2#kNq(6m7d6LrQA|bBxPAsXl4s}sqoe0%E8~8E;IPh^`(Mt5ai5bto*`hf(WSv6l;SQf(=+crD=B6j2Mtb|om*HGSPXSrLCD7tm3f0_t7jXqq) zr*3>liL-Fo!XlR+hnXxezFda+Uiq)hJ_2juLj?j50e|kVqh9g8pb-({yCI2>h(XRL#)WS$EF9W8ZPN zW?{x6zCBcmx2b4k(P+~dNTI)HNC$A|VcDHFM5^7oNvp~_3}2~f(!7T9(RCb$O>BD| ze1+?h%*iSgk0@^Ak7#B3V?)a;l>}j2r+{CWIIn=E0pd>821HD0=Q0p^fo>hC4k_Vj zh2^qMQ6<{RJY<%;blHv-eE8 ztclv#He}eZLsfdUB`VAnH0XQSuc~aAo4?JLz9?aUiQ&Hf9;%AG9EAko6BQ7y)JwdT zCKD+Pd^SeDFok>95BCxS)=(3ilF@?$dOxpcOSH`*eA5?s8;2Br&-RtSl1Bs@eXyUp z%#srmKVC`c<%_DQsmbW>V=QlyoqY%Yf_Glct5$ zT7uksQqyrPAqNrk#v}|vyf>h%2g42`K>oWhQ&0LvdZ9CzWNz_Cm^jS_ zqqouHU_@(oDj7c4E)FB~xwaU5*^SIUP)xDFKC!Vm^>7OhR8)c+4|lGru&aNqp0jnB z09&LsJ4Q;efR=V8CX`=P*ms01jw!3UBSMsC^F&!sBG92vnjc5{FWS>38T{mPSgvP- zYnws&XOcGJUXBInq~=#_KAAl_-5KIb|7=$yZ$(Hj7=!_506NzqjAXN?BAr3*c2IDj zy&KND63Xikor8pNSXH3esALS3(`ar$Ls>giV+vSh+OAv17Hzgj1C4~mA^?3O6K(yJ za1K0l52KuUA(Fwvg3JiU0}&5&%(f^^B3^Hhq~w$qcQ(*9NtaX|vV&!r*#Jm%8}qJ% zaH}W(c?^WQKu4BM%{g=@dsmhqXQ@1WJ5&5try4qRp4b;1XI{*Ye6D#=XN$6zbSM0S z%^_GMQs06hqU)TZ{hq`!Ai@(u}_Zun&)EfbTGX!R!c)cDV!jNUV?-l^%@WoDuJyUT`oQLG`^}nbIDfBz<$$fy%(D!x-0vX* zr-xJF6wBpdZ`ct>n6`0U{UgM8_|VLk3;Wnj52J_pPsws}aMxtwHtJq(V6dZgKfity z6v4S;%!9fiM*HL>s1mX#Fpp`0AV={}SPft{x|2U*GpZ`{TqbwB>>lTPgA^O-GV&iCDVd8*4DJw|Nz(Pzcv4Orsf` zgX~ty&d4_Tz6XMVo+#$Zq+U&=Q{L2}Q`im;8bFGBAR0M0f>D<>@&Xy=QHSf^!bElToh93sNwxm#xxzV_g31D~oy zFJ(Y|d;9vOVfbz(jW`rtAD0QS{cAm5@eXcG=%5>*11A@Xkhp5 
z(~KJD^6#K13j(*BSz1-bcT_YHdhbqlsvaSR3}KV66AGXR1BLhWCGu7 zXu&xUViL3rj}>_3Y{{Awp$rE-F3Tteq1TrV)qWA+*AneX7w~(YNeWREM^lZ|AU)0m zsFM+9Z4XnG;`xo`qt0e+GG}P2}S#!$&@G>)lQyWb<2N?9#UUk}LP$`ju9q zg3b$$1)G_Fngrx{d6+a6v~YxF_G`denuw|zgfITofOP_!f6y%IkdV}Y7A>9*+*&fY znd4IbXd|)PITRh1QA#P}uIcLT&ec?-c{A1+c%c!J@DP*3-l0u>sg-E&s790XLIBs}Qb>8YMF(g`_7(%T^TKS>wf} ztIkL5ny3m;KlPXKIWz};BxsVtKny1EFbS!Hb39_9G(ewyNm66abM6Zt z{}EViy}j=EFi2lJ@vdZa6a!Y3ge(h4BZP`b49v*E{R3G z-eCWUb2)0NeDpTR{P}}T@2EugR)u6_${LkNkd-~Og^oKLk2a5vdNr*CiJT*Q(_f__ zaOo*L=*2wRz?N(iZ9CLYDyiZywrW-?Xph`GwA@lZ!D2^AZ2uw_@j(3&QTK()PYY)v zCf}&8s1_i#C7eb_@5HG`ht6S$r})UzPpL%kKqwZp{BSQzEW|pWUxTiVCfXEdzx+|) z4|j@F6z%hS2@ISOQBtg1L%El^1p%{v`!bDpXJI&Emmw=~`WlOOMvrq0FH~AkXbTzt zHz36ScbRDze{+Bte@pw!->SGdsFEya{Lhu!Wy+yJ)nx;*D1?8ikad6M6)voQ2LR0h z0KxYA&yK@Q5%sXs_EiNlw}gC9dd;bhBjb}L!Fo=R3H1m8HXw?aRGSYCv!;-Q_;CP8 zzRG!zK9iRWF|;TV2ZnG;1G>x8q8=ayW&b-^j@w`QGqJmyJ(A=2ORM9&2wAqnBfIed$+b+hMGHT zM9Zk3CXkF}GtkB=g?LR&1UsS6Fc0i9^#RdHKZ$J_`~W-Mj4!{3?s$w-C)cb~-mx$h zvO)w#C&6ApSLmyT6vm7YH{yHdo3x-H?|-{w!?X{@^HoHx%!nL>aOxI%3rl#lcXmzeIIc27IT zJw2X#0_`r7!2HuPg?qh#q5<%nfCn2!5wZc;fOXsj`J2A~W30AdH;bkg3|#OntqHPI zR%=2^;Ld1n@QR>6B?{F37-L&N&@~CWC$b)=#Js3F6T&B{8n89a3B|@Bi#*{s*}&B zvEcQ{GHUIiH3y7XjcDLqh94p#YW*#umwQ_!{B;Uqp_h})Wi?&KrEDl(9!8oIYR zpDE#r2=y`q$$oM$*0p-%rib9vQ-2&Zj_mdcvJ|^|A~Jc+cc7?4vN*#s#9lq^9}Jg| z-z&Nz5>TBLd73im3ldH#EEF5@PqXcXB*5nG{UuJ=;ErgEP^r^#w0|hTR#;9{`mRXHEcRLLC`cjkUmZN;bU+Qcsf8QPir8cX`mY zRXT1&+D=i%8Yc=4=|w^?%rW-Z#cw&(?9Z{+gaB($(wJ{JF@A*;+`aI{xAEVWypCEl zyCn<3LhNl6M_s81IpyH>P-9PM9;}5*$7%~!j%!F`p|Hs{Iu{Y73PTo zp62!sFSgmw(7l0ItzOasSW@BC!p?$5Xtg(Iac{|!rh99zSZd7kv>aobw?OnXOc7NOIi@8X85wM9JF# zVrw3bi%|?bDCzntOVB)uD|A+5%Y>3QLsdzd-0v821oLz}j$ z(G!>j{lzA;V~YaId&v75L$;XNYj}?ZQD3zDSF+5iv!y^#(^%W2@aDTg#Uf;Qf+21@ z+_}>x!S|@aX46)h0WL;&TW$;&Sn}ENEQs+8%4<&(R;9n)3wY)JN}6esav$sHY(Ou& zvJXuDHIyOAVB3bT5TB#*ahGe74qh@Ni0scMwhw4Gw-qvFku=$JCr^@L87e%w>*lq< z??yQfi1>Etu~P}^UXbZ>I5H^-p)X4rE7azb4fHJqmgY~+H$4&D|DS6^BZ|K6~EcK6wdf)o@;$(eo4 zMy8FFIde(Ym4dCXL|L4g8VHKW 
zqQACXNx@de-`t%W;|&)vXbwY`G5l#lQ5jM&ij0&^eY;&XA0-fAre?pr^J0)*0&!}Q z_>#<{|K6Vp3TaVQJQa0;8%{ML{cY}2gKGFc14!ook}+KhoaliJd(rkDNAe%CO8goI=??MO2-@JCCUn0N~}hO|Oy zJ2sxrtLcC~tuQp%C=&hGmoJ!q57eXS0>lv#E7q`CrtI13UCT;X^?V^psWh2hH6x^U z*9dw(%&}lo#L_A710nYb!ru|>@9p?PMy<0*0temaEw#d3u!)jwK5b? zjtTbgMn)Xa((d{$GSec6!qCBM+Wd(dTQsys zo6jo4HE2(Szr`1|=xwZ603Z;8qb2|_b~@kLvZ$b2@B72&J6=6ptk1aQ=NpGnk9cr> zzzS8tzw8;`j8=xL6EW5!mtJrry(c$Q|7VV+^~s~_bhIsTikCw|wGVjDO@GGmP0T|GUm5(^CeEfpQVnai&}V(`E-A^5im5Y3OtR}Q(QVr-kBF#qOeJH%V}N8R+JTtEFVxL z3`+zFf=MqC4TRnnK<&dPK(MJ|h4hJLYx%)lG4gOWod2?|^6FdqB_O>tfPC|Es+z6Z zQ~^%C1Ef@po^5AQ=|Exvl}8p9a<9zjL= z0TIfmZi4B26Z9r(Zf8N()10!Zv-ug;H>&SLV;k+>jd0kRpWA|h)rB^MXa#?&g3RKX z;%OR3>HI=hd-*r01ic8xV%KXzT^9Gu{b+XX+$@=C*7dv_ThVTv!Po=Q>~!`*%i2Ru zxvEy~S;Ie{(9{VDp|b+RLCodXS$F`6XP91Tsq(KTJZJ)DhF;(aiXNxDwAo*?3?V)S zFyq)t3IXv*zn9U5Wi(3&ys!B+#`Rdj+dls00TBXxdbduG*BhvOj>f8w2CcM|D(?%3 zR-yHQZuS`2J2pRK=}a_kav*<6%%mF6z<61oe|Y^p>15>Xu*wZ!36I@drE%h~*wT8B z{uVc?NGBLul*!7hU3#X1ItqFA>Cwm;`KIY%z=+nmSv>5C5g>@ou3bF(c_@)emOgQC zm*pc6BD1MKymI?A$xlBN+`q@k-`mOZtiykK9kLxi=Jma`-QITN+UXfROk?t>$FM1v z!+jdvv2EMB)x9xy^ULK$&Sd;Bbor$|ixerqTvos3Gf7%JxA2CK45i*MASPTun@L^XgUSfcoBXqwo ze&|TCYC7Fnzw#rgN$nN43L?B&Rl^^* zKfll{q3u8Z%hiaHzJ6jh9_<@OYV#<0G)IDWielRN?Hwk}QRWwrEX=&o5!7Q^Ki^*4 zi1VbiT0gx(x3;9YV|&0M<@((w%z*_Pp>x%WQ$3&Dxk=2{w~^l%Q-Ji~?u$C2jm2a= z2mI1L{Oi$_%bC&_!git{c0YUWhAb1D;OE_oXBGMR`_~67mYlDE8I@Y^b>mZUEJS<% zuJ?ow&ew71wqj#cEU#Z@D#~(j7^W3BmOOVQaGA#<7&Ym`ebc$$K9xw5syp5uv1_;= z1cGGB%ZzIHmM3HlDDKuk!^Qf)q-(S_CcSDn{+!3N(r?ZBB@Z?aHZA}JiANA90y5EW zv$!A^#JX%=oV7(_TQ~0Hk)C6ZsI27|1i4{`9_n!}%N3Z8iZ|2L5>HS`NUcqPqw9k<9 z*q4A?|3rua(ael~8cNfa1%`gm_0ErkhWUTVrSTv>9f-sbF6p zP}TI=V@8KKEP*!@KAA0L)-%h?DD{k6d)xN)vo z&ryVGjy$tFrXVDexMnb96Ra>+<|bcri4N~u8k)x&chs}Qhi|3`^_0l37%MGZcLs=X zPIsV4(FuSn2v+8Fow{KHOhWp$LTRRkstAix{{IDJK%2kguM!^&V;BQ^XeegY4Q8N2 zEX?H^VVb!GLX()lLoxHGheJa^n2H}3jV%D3)BZ>Id^`7oWw(6YUZ0Q^emr_lNz z7P`&&iFpI`tD*+}EjoW1E+^!eJ}2~ae-}R=ht#Mi$O(-ykolxWmrJHZ6Cpx~7Cw}J 
zv|BGqVSq89C%@`NVyU${ojoegqc(OBS1IJ9MWdllS&-x*e~y6&MG6rK;72NQhLe%W z@c5W;T5zaRhZHEGp`b_o|7a20qtx^?yVbkUkb zejMi3qifddZ1$Z$r%$Z)b+#KFTkGrS9ev-mU%s=w@AayC!Q$}gf9b46im6l9_iWoq zY+SzWHmklq`&nDeEH)f9mR-RSTyXlPN>Y{DRBpMi!p`bO(ywQ0=|{@WkrDZ5`3%2V zYx{AcLw5aSH*eBMlgHTqdlc(1)6)18UhIF!U)w;4r}3mi>i*PeI~|%d*JkOnDXaPH zeT_zeYUAm&NUBvjNYVfvT*Ltbb(PW4>AEXXtii34eJG-+>m;8;v@F7T2%yo=CE|-- zhbbc>u#D5$7CR1_>2b?G0mt{TuVN5|{LrVWV6p>{LqME+CaOd6NTr5QQt^sfp4 z23i)ONdUty%%aTmsJfa)Il4^*KT9jE#cI0m?B7ga0If;IK)~r&T*kVQl^Szq6mVd` z5H%VWib=VtN(?jn>IEgS0nqbwd8;VK0+Me}3fcFf!Sr6vaXxjO^U#kkilr|4ep*U3 zMKWKaogb6k)#TJI{!~?9+QHDT1Ud&(suW=#bqMi!I+Z>@BHm2bk@mu8HB9paXS7r4 z`yb#Je)lb+pyi_+z;n>WmW>+ibTYXLQkQS#RTh7jM@7t|(a}+G`HdesT>TZ;gik<*+^%}ibBjD2SeO$DI06)q~N@Rxr4OiRtecxASb=PFcm4l1N<75`FdM%w8 z-xn){!anzY_xJCg)ED)2x?XPgi^iSB+!o`7!cfFuyocpvGe76`9kiIkGcz`ClkG8# zfdKyXj%!|y5A4}xNF|qij2Taypgw~`IE;={B?DiP>yXYcaaXBFH#s-WO4<%`7{KU|)Kr~NS(HI|PSZ70@+ZZd z&(x+ec7kpia)OScp_E+lfsF--1_mN9F^H1{|L;dm3*};!uWTYbYG^t~IOv^guZ;+) z=~Shw=S!c_=`5ogG)8VF()_Rdg@6fX&r1#$wODpbBdgEq$W(FHi{s$6;3491 z{j~ZxUgurqJ?{3qp5-es^P@eO@A}i<&3l|fh;jol2y+n0v4V4yLdMK}8H~c0k!aA_ zB{W+qkIOBUWSJd4RSbJQ-r`s_uH@3>WX`J2O$cf8qvOY8aX9>H*sq@zRD#I+zIGAx zF>ei!Q}g%Sj_0c->rK6b4w`*_?g}Ftj4Cl~atOnG*$T{tW@8$*Qi6N= z<#DF^%s!5O@fB*dS{UDwqhY=1`Zyn--q&YB``?3t`#V8V4dFZa}l*5gEn<};|9&**c>d+*J|CK2n0bxr!u zgT!@1nmo7ZIi&Zffk~K*x>&#{W>Rxz|853xOU=#wsqtKV9e?kBwKj`K5f?i_Jj3tz zd#Jdy+)7pyem$f|IyaeCVEe=A(OXcK z4po|`oA~-aHW0!PYEbunojj3*;Zo@S){y+PM?PLF=8nvd&WvDI<* zU#~fXU9C5R_Eme=q0GcXT=KbZeh&fU+U~WkE41{q-|m}Drucu0rCNV4M&DD?)neu$ z;pGTi7`2c7k#Y6lJPBLE-53VVR<<4h#xa1@&+bq_$eDBe25PrQRJOZDg|4#g?dn>% zJHcPJ`F+2Bzjxbkb}0y#qO^>U6dm_=WzMV;C6Cpgf2u3FwrM4M!>+>hp`wqYZY;WGpf|MS1HktJ~c<+a@_AHnA9>Qc~RIOr=k75>C zO6n;}QmPcCDP1KgTdrDNVtF=gzT4f}?P@ljuGiJE+V-}6uWr+0lpP= zjU(%1)gSe`)a!P;O_s?_9Gg~#u6<~B0jef^ePwK&$)T3E!HY~ImcSIUUa`9(TG77i zCLdif>rTJRS8&--mOiQxwbr5zzI@o$X*JsA4OW{}#jARC zvZ7KFK*5*9o}QMEON;6)p3}o=I8^$IT6Q{`{;DDVBhO*|;Iu$(xXX<=(bd?l6Mi^Gn5B{E)sdTOG?e&z)-R9AygH}{jR0}6eXhsw& 
z>Uz&n=08`=eElsqSugbceJWf(zVY{BQk11n zDT#ntWtKvTO3KZt)_R_ro|Cf2PpAE}_fEfNle5~Vr>BdjiGg`fPay_d4R+xxvt`%p zYF0|s8lJMXeQ&iI>V6NAEcg5W-|y_Lul%`Ozed$=C%^Z~zWA@VTF-a3x4U%JH=EZ6 zodN|9vY~V;;TnDjhXzB3By&oNJD=A6hMS6*k1x z@{Bdfx9hGKJ$F@aR-b7Y`vFe7!K*s9dekFZ(etACQ?t<|a*>DFudYS`Fg%Ddc9vpou1B@6$y0SeVu)- zp_Ze4^7}igi5+vo!s=1g%WXA@ecEw(9F7;=Xp4#Hbh;Wj?{0r9MM+_>%m40M@1MiD z>b;dPiXrd3?d9Xv?%DSnkNw&Gw;tWb=W)3IJ+aZ>Omq#OXWROAK6}#q2xzIfiEA(I z9l+nuN5{vMnijkxX8%H&9}{T+rQgS_3F2Kz0S{Lvh4Od zJ1UtO8}AOg{8u=e=@Aub^yBiGPuAg)$YXO?MvGRm^0Q7%08s)WAPg~Vbnu+H?r6-+ z%=yF|X5{cX-@LW{XQX#j*jRiFwC+F|*^=SS|Jrhja zcB+9i8-J^9`u@92#ZKSqob7Ejt?WNvLwP7;7{Jno0;wr#V<}o((p6*0q5=zAeAWTRwib6sv843bK zD43F<9_KLxT`-E^8nHkPETS_2EDWfc)+&aX6*55#LR2WS1xkTZ#ED@^3=(GHAjwG# zM9qdED3&5-1p?3`0>~*uETF=oRglC`SWzV@5mf@w6C^_gkQoM0VU!$Y3`j|wl>Hu; z1er<`L?kj&7_tJzpay{`Tu1?!Lm(&t z$WS3tAq8a+(1NgvxC9{*z{sqyhO-uc3P3fOAwWnbmkC6fjcE)8CPXwDf`JBz){#>a zL_}h-B19nz3Xn(&Y6c~RfhZPX0fZo22|&mf8AK>Sr3f@wDWxS`VHuE!mT01p2p|$Q zU;q*SZ|LTfQdj^ZYZq|>UdK&Vgvh^hY=RaH*t z9q2yF?nB+4yl(ww<5WZj_$db^$Oqq92x$DQ<#~(YjZbhQHrQsMntpuNisrtBA#-^-=}ib zwW=RPW3w(M(%Lv{&$i2nNYV_6S_VlW(Lt%fFdaL)XD?@Mhq=RpXmgI7$vG?n<;}Gj zA&?w8xuSN(-fKS`n?;x!Of=;3=Gv{eMkglZIBe_~>rm|<8%FO*gU5#3@wP6Ljwz}$ zHnc$zMH{fnn1eB~)}2>Q2FF)$YgHQT+1Z@9*^`XZSvF>7VS}CAIs^g)N^Xd6!PO0j z|0HEp36QGl+}i{L@BERAsWP zT6JlsTxYWy*XnN<95=0R>tT~iFD|Ui@L;>JM+FB{aacyOIW4b2B^V2Q9>HP>jeE?v z<0b}(;WbS`_XUK*bsT0cVwx6aV9{q}aG+`17rZP5qtz9>!}FAo9g{_`A7eF{VX+aIKRcUhvvPGJl8R4Tul${3xE}<=0w6Yf{BMU2 zK0ZWScB8aH%Seqfl1eDDcBOaN&$k(H{H1_`1}f?kyIsmG4SD&aFb^pR=4XN zq8lZ8g3>D?+K9qGTNCPGG7!mzmPM7MjG>s*@d+ZMS%3!UrD1Oa0~#!XnadT})|9P7 zQW$0sP7^_umSdu52W-%7L?s(JjZD!g36fA^(Tk}~l%Oc0O@ms~q9!#-+RSXM!YHY0 zA;uoOQWVS+H4}1piE8y+HLoh}mt=tH^NHE78SDJxlnEbJTy z53IEau_n+Z8x5qxF@rF{g201TnhG~-L8uL^q>W7BjwP!AgKTu^#wI4$Q0F6~ryA|( z!uH|j^RnF0JeR`hB+-v=jux(7JeYeJPCT)~t_g0G6IdqJgQclLkR&iks*`1;tY|HE zMFCkRaLP{t(K}gXS(Zg+WWG6Pk6FjTGjxwP?|1 zvkRPZI+D6my7%85BqjG4TaNs1;l~)qEJGC*L8B}~TE#L9kF}Q}R+VkBO`@6tf}`(T 
za1OS`VDUXfcWvdJi%dLu?Oe&*c365of%fq3H8t_{owpTcStdxb845;>p|L8s%2i!* z!{pRgXiOXwt>5jg*by=hEae&aDYt@}TN;9pR>(Ap%8D&b?w9?=^%X((U;dQXCJ11L zy);=GP}d5!d9E^LLojqZ5>S*KaExi3;l8;X@MFb}jP^P@C7!*ui%npLfEJg?nyv1z zc?8tGD0DKnV&o#M2}4Mg%$+_HB~)ZIUDYo$7*exMGm_H_vjFUfrgIZvYrBpkqieS} z&wEc;8LMSGR${G_rvlTeoq5mSa-8PFqhA|3Pn(7#dk2$y7`>c##*R*&uZ?s|5+0n+ z_OMzYk_NpTaoaK|LVFlty2%XIc=GH`B{+>FA&WM0nQ4!VxVRGHFFFWu_IGd_IQMHQ zu}QVU1k9n=TmV>aA^53>5!n*L1vB{vWEyY*hTJ)0df`Cu`y{u-?Ki3`3!^eTDb0hS z0#gxfV6-C`vXr3Ku;v>Fu-YVn!AfITvAtr}u+D9L?Tll&ZJl{OU)l zt*Rxci3?!WQb`dE5<*I(2m*lwT16F^T7UpW4~ZZa!6HZN>PSdp!WNSGSOVmQD4{|F zNPq}ZpwI#VN`RDtB0nNk9ztq}DXA0@^zgrR3#nBAKLE%f&1-kK;kU-Sq5U^u1YgaH zs*(soK!O?Z&Q5HeK*AOMWOGC)OsT!0GdKnent7JvemP%DxYh-P34 zW?(9%3rHYRzf~xXN~$!CBBcSS08Er_ryw%?eB zEZuOTs!|X*VyHcTtK+yfftdg)TBcG8REVHyB0wn+iBg(CDOysJAczSNLPebv4T$%>keY=7oe&!5bpJS{b5oLp~~J znc=nS2OCiaLJ<)SBGS^dRV5=ZGgT=O6iqb65U@oNF+~!TsYxs`F%t<(Kt%;Z5HUbh zK{U|CNK!NuG=Vb}0Zd6%6af;9qeV3oB}Ee?Qw;+Lw5Kyo(BB?b611StN(KJ#;6-2QKK@c%hN&<}(g$z_c zkxfM;(ImtP5>ZJ3Qo>bHF*Fn^2|yJ{2*i^G0;>d)RUs=%w5%0UNd+X7N>Nlq5fH*4 zPz5VX5=ksdtW_xrNikBC6%s*21yImaLjOG9XaYO(`-{Nf1LIP)igv zswF4|AxO+KQk4>gB?T!_QiM$f08j-D0T9rWBSi%S%0$5p4FxpNQ9{&01vH9LNQEjT z1ye}%Whe)+83ii$BoJ~y5!xR_lz2eWfPBaW$pKONXjdf>^stKA2twXy=*v}Q`1Ra3 zM7kx>&2l1nFclx6754EI>%kVtsIwEx4YVo@JT7{(&_vI5cXV`Oy%TpvOqHT2n#h>p zA|8!{#SwO310l%}Q$q-0El@O{8ps+DMwBFlK>w78`0(|r3JugL9WY%{8sH}!0M`}Q z5!VG%Ob2=d>`JFFgV%s)e8nYEPv;>CsS=c0R6W!Q_sIwTk}JYW2Sf!M;#Cux5ewiZ z38zhlnVxRTCM_4$e0L|dCj|x-fhho>QtQCsM8J&@EQu140!0KAqyq>(gqoEo1Xbs^ zN{XPAD26(w9aQS{n2mMUUq;k-reW?_Hrt}be-=>4LR!~D^JjV-7gs)DWI&|Q- zPTgFrRN4-37FHhA84W4^0;y?;q8Un|3YlVwX-jCLhruNzCuvs0 zjSC6V5FHx9gJ^pDvG4E24vV`lT3jOLQYwX!1H*s;k|MRK1rx!aZxuzf5^{qzGe*3e zps2FV%XDbXhesv)af+H5SDG<|N(q)H!-2z_5X6K!vcF)!Aa|iq&_FOTGbBhL!Vg{w ziYTCoffVS3d_ssaXJbhSmBCQcWmZ{c{+$+Qmts^-TmklKN`b{lP){I*M|2zFtE%=h zV&f9~_|m&!Wx4Lpn`-;-$D>|E7pWBXkwg8ktcVLVFewtMJ!uA8go8?^C`r!380+ir z*HR<>sz7}&E+RyZ0T`(O*kT(DUIIc6@DMS_8>Ct#aU`UwE;1Z$9B|`2W49bg=ZBu{ 
z#?;_}meCU-1VfETi>@I3!~}lk2%iEFUfOa71hD|rA~c-xtJ$kwtu+=yOmt1@g%upI zL+oMoLI~IOD1fvGu1O$#5ou8MoAl#B#p_(4K*)b<#+U&XDhUK?v)r%G^ShVTW$Cb{JP&Q&Alw!G9<$@cb z1>HzEVPR#Ns?D1=)%ontlO{wqpaY#?ieA7AqFXiR&zjuXvtq@j@#M-tCwe0z9X2(O>)$c1( zhz+7F7l>z$mes++lC%AUbeb@1WWi#99o zzOE^cKE}`PXk82L7KqRkkWjiwG|@A_7L?Cl4s5wOFl4+u?lA0>+OD7#uq0H5%3u%* zks5#!GLe8Hw$l}12vztZsQ^^hseh8e5HKkMC@@U0azUyHO0w6NCQaXqh4`_@BY7yR zM2v&J1P6R#Eh;tFlCAuB_3OkQSH#JNv{!f_-qqODX;6$Hkm6|MnJZ}+U3J;$6#5n}X}`BF{fdI??_$;hcv`oi}Xt^F28|RgjPT|3(=YqOqb?La5?84Lu)A z^1L-v^;EC`CioKKl=)XW+vfjEwUWvOsTH#Q;#Rma-$UyBJG!)S)|V;P_HRc8$!*^( z?+-PXh71Yn=wfhA&cO#MI(m0?zOO&(w70P(K6XLTAlv*O$x7>oneJ5s8MZJ)bg8s4Jm*sN4wFX4&l@Z^jXwquYGTF&Z?loLnnnMdR&+0AGF|w`Zqr&9JqFZ*DiYI3~x7%i4A9+~VzWV8fBSn{v7x6mBc6T20b&Kc0u( zFz{&J^_!g)-rP-9vo7|hYe#9bE{WFQ@SNC_4eo9XvflDL99x-iVaVCV!ev}t6|t+i z;iJ;h9gZwaBV$p9S{PZqWZqd@IT;$TJDJU^T9u=7$Hj(K#m!o{v~Zm#6w9sJ)mdfe zc1_CCI}KgTyCrZ}{Yq$>8{B7!(c0{0eG_cBOm#6P&M@Nk&ZiT|-0pDkyIE@1cuO#; z-ln9piR-(auN!k`bE65t$%iRS6<=)ZVjYghM#hS&;+5;2s$gzYM#{mw>oj+}9Zb94 zX7p^!bGLMGc#X-(`5Gs_-OYA84vt*fFz{j2;(IQs*52&xbmd-GoOE<$;P)k(y4%jT z7e-th+c!71S8Ksq8l7(UQw|O8B*@rdywXVGaA#JH4GNmNBDW~)ob6b{spD$uGSM3B zY*~|4IJ;Fiu+gJOR@pQ+w=*3UYU?m?nx%D_jLRN}Q!i6Vz1rU5@{G4NGS!z^%4}s* zwM?~~=XQ%)PDx6g?$N!$pVoFbGTFFwo;M>1$;HMwwP^7)F!CB4#dS$;z#9VZqqfqO#`Dt;URo z69K;AGnWu93Mag>5!>vU;0HgHc0o13N~vRR3*X5!`LEXl0b)MbX% zZXDIk+^+{_=IxrCT;^SFrY05KS5Yd#?9+0*j%+1xd0Q}eE0d$yad{lgWZ64hU2SF! 
z8m97^yQU)w(PLz(-tA^Lv@qz=xz^P;t7zQZ#Oi8wOsdXK#Fb7hF-IJvcH!>P*OcIeroi^%VDY>m;{_RRd|~^o88lMda9Ex3_7d1xq~&u(S}?Xzp7Rmu)~mPHdiMDi{+Zm624n| zwZe6~&et|tZ*{uaZ1Fg{-bS-V<|7Xi#OC65Ig>7p%rf3M8`Icm)v3Z|gz#wKaW-%& zHxzZv3^*#aIHnC|=Tk$Ht20M3p@~zPq~PjkFI5s?vw00}V@H|bO6xGeimW><=JDW-1;-(GFTaJ1i_T0HU-LA&1 z&GEF%-gq$Wb+u(wpD9V~sP)x1m9q>fOgZ^2!f#eir*kh8#PFRY*s~MK;_z~raCn_h z6w`IOHelS2EuSga;NYigR|Xxa=4mFZm`-dk*((l4lS5VxW0~4K?!vb_({gY;CMPq* z=__I!9yNOD#mw#9VlA~TC&B~Wl_B{keceL)sPjkGrUU2O?>^z@TR*&OCJMDiOWL}g#lC(6+uFD=2TVrV1pKk zVxXo9KPE+1AVEr0_Mq)=-=I7Rex4VMBiQ^VAU&T8$|8Dm=I8>@)!oc*EA@H}O(c=i z7{d%>=Np3=j9giF*Y1aq{8?soaKnwyha62%zThCvu4sVqo=0||@<7}i4?F1ibYG^f z1vFIU*enTS3L&L&69pG20Du2gwe z`Q_<2FUsr1*G=bLt=^kn?DG9E6bG5*-m$o&~ zhtGcw+3q+V2RFFl`pmx5nr=UVZ)og!-G}gxLl1$gi^TU_-5i_Eoy}X{G=7!uy-wHZ z8$LHjq2yuky4y5Uz4hLt+4S9yn(??CM-vCI_l;jY`6~KvEA+1So%CG;Y}qzW(C^Pe zbB;5|Kg4GBX9qV9G&?mHzLG46_HhsgvxA)}Mop#fvxOB0(1fPmw+<8RB;P3#eH426 z&_^sdB(U=OoYnLS8L`xrPP`Ck9+*M#R6J;vLH^8sz4h;<4LWq6i-(`l;s~eF{O4Cx zlnL3D?~JLfORgP{Z`s^&UO=jv0+Rc$3r{e7Ey<6RiMbe4!~#|Ihnu(rIeorjgh_Xp zLnxYJpegG}>rWfZj~+WRo#}Y?adpF7;+gU$PsyjxfFXcEf@5U}$p}IOieidTJlxGQ zLdd6s7T7sslgE8A+>szad#nr}j9&NDC~h@{3Hv%=GBLMEI8=}jJETe+mB|Nou!zeF zk|3nircJmFs-+T%^9vfNiln9>`jtb5Z+C-{xHLR4;ZW{Uh9HKbgn|hYsE8t%lV(#a z4h%b`(YtoJ9i3c_lv$XBa06|{LT-~|eZBeP44Psh8Y&_pl(*r(e!}7A#vInxoDkgR zjc^EfQUlW@fkl9e`Wp#64&k!qAhF6nSiU zB}Srxs4K$EYKo$$qNu8(s<~{cs;H`~moloVDvGMAs`0Zbs;a80s;a80xvZ+FqN=N! 
z%8Dwgs;kD#psJ#)shCw&6xC5wRaJP|g;iBmQB_41MNvgn6;)i>m0e3Bs`7Bks;Y{j ztG#7aR8>`XvkEGztI6x;sw=OEsQLcy?*9vauRS>TCH2e1E*KePJK*>JJQQBV6(4wQ z-3NwXaSE#ZwP9K6``xwtu76f`eJ;a?IGrSiWeym66~WhK-wV+y+P?v*e@K^prm{Zm3v8e8FV;J?xq;# z-;9`Hc$-|f>5R(r`CvU&A1zZg?pU6gQ`153Ssx{mZ?JpRJ?bYVT(AMlC>nx_y)GG5 zQXJM)(N$d8jU_$lJexv*^m@nXeYLWNc%p{&0KGK0b*Gbp9lBc4Mx&b)MWoA<4N;zf zEQ=PfYQ<04V|k+-Pz*NfH%Mz#Q*9(N6&@TOJoxe@wi_V=l$QdCID-uv@S9=*T`*@_ zm<(hK0{p8p9ZXfKW?wM9pE*9KjhVx?q-Qi~ANgBslD z3Yc0GTB1XK1kVG^AQcitsMr>`Z-NB2nI&6=g%fr5*8Y}33=ko)*>z~Ba-c|uHHx#V z)#~?smv`CsU0g|6Zh_sfd~-NG@y{p~59#=-VMsgeElDX!Djl|V*$mOwD9cuxEYWXm zWxs2L59jxOlfpgM;Mjs0pc|x-46y@#F8mV&Lrev-C|MESo6rqidLV1pL?Ei~&npAc zl%0<-?4e=QmBPC1v%v057dB2=+F>g^KC|BG^m2YXejB`BW1rtS>z@8s#UFXI=rM3L zu-Q+$@7Kevi{H=k&F71_a>=R9S{ghR#iIpvWUc>RetN(8=_?n1*N4&z>^CeofC)_G#qvO-#`1Eb@{M^3k zxF^zv6SJGKN{u7pmI!vQ0QdZ}?2$2l=~1+ApLTGSko0HX9W#4Mh-?IkL64deP8Y6tJI-Fd!z#ZXf^{u(D9-Clg!!l zN@}XzhnUL=gF3wIo^48$;@V*!i2N5f6WDC}jT#)4&573HuEFdMskVw5L>lF}X0^4B zSYj~4f!I(0cz|XI$cW+-oIMCO4P(Sr=)D)d_5Z_`^8Y$XIkX_W<}sl6nnsRQa_Z6-7)d zB8oV8)0R1?bd>}hP#_KA29H1-I1dCgZVdw{{&D)>2rlSE>nc38pb9CwEa%KGd!MYL#vFy((1&6Hyz*9^~4N=n( zGQc(0yVmJjuce1pAbTGAo=5)l0W%82G*XdTQzkv^nJ`-aMDpSPGKB$vrkJ>)rGR0A za|b1sS(7E)Idx*a3}^Dcus;o;_h>un*n5SM(!Hwp61~Dp;by&8TcYh3-*`obAW``q z2k>A)ER30<$P;yl@&pR&uTGshbm7r`>&?ubt$ht>xFm~wOW>ChRaN`@I4!jdGwnTJ zk-J(Al1Gsvd`>`sY-Y`6mT&T0y^*S*(ieAJq(;hnY>&tgg>lC~Wxz;ER zVddV=^gR1pXnDYK7(FMTEDygr2%a7j#Ek4hMoLDvo*DCy#}Vxy{lDV-JO}}@G8Kqh zz63`oLy|G-MvKOmy2q&qiGY=emIISbIcHA0G}tPtup{D_0s#;Q6YGB0ph1Y>KN9(b z1R)hEA92f$HriBfK?Lv#K2Sn}#b2aCdtW?(5I{nCm+Q~oXwMYjGlJmI*h`rFMZg9*<~rn*9*U4<#!=_#ojR z^O%)qW_G8Xm220c>#oyoEHp6UF_3*~FXwz(kPbMjYJmlWM@2L3<;$Yf^yK$`C&1=` zc&63?jTuAmEhiU{N0iO&8>?BN2l~BSs^<#nw+s9hm>*srAh#kzjcZN+BX;%vE@j@$ z+wJ_I?8cfO6ThwQxJ|RgVjg%}{U`<)unayB40w5v?;x;NJ*YtN zVSHNR;Aw*gt^TK<@V7BtyOZ-jzKv_-{k2dA;9{}0)bj;RBqulJ{reQQ?H|`MMWT1! 
zav5qp`WEWN*LAV9e_@O&#!sfF$3x+P=ojJ-%1i3Lu0;6y+_@Z9?*&`dytH9*rEr#-H%cs1+Q(7W!j0r~#>-R~+1 z_aW+3KRgpEs5PReIYCT3g8RY%e)UM2#DpDMnyP`=ejQyKEZN!U*|Th46w(uKw>)X2SpJThz4H8JbFV+O$ zYOIEZDA1lia-nIM52M(sgRq(a9EfrtnLt7FXMp&71dNaY00krkfQ3~P&tU~%gCsZ; z(;y%e0W`zw{#H{^ekV`??2Q1?P)uE=h<<>d*#6p`=jUfqKv9?n%y8&gg%%-(6z$KR zOyCeW0Rw=55n;^W9ULJf>ctK+Y(deZk-rAdKWl31ws7mmynilH`aJDt^!``(EEF(; zz|#b#y>zc$uBN?QI{1I9ksgYMR711vR3C((q&a*=aM}U-R6SfO6KBYw(#W9gs1qa& zm^B*dUDfipH!j@6w{9dKVq^pBYB{l200}Zc1Pmkdkg=<>%PfwqvX-wMPyuEjJ1qbZ z>hTX(LLlM;NNT3GYuB$^&LXZP)APct; zL@SDp%mq!kWE9R1gdpKn-OuKON`jn~wxXv%6H2H##^K045Kl{Lpq~!_`<}-HJ>7; zK(nx(5yP3{JKrrqf8_#9mjs=EpU~>a4W=s}3 zE_?2Vd*i7-aDd{dh;_AAx~#EuJ#(ql-2(&P5a!2XyrP50a)AR=^|mBJMh@$)+5*;$ zgx0qP2ff)cN0$Zs#t!g9*L%e}{%?u)yGuf;RZG9@Ke7G{uQqOzW}CaU?(WdCZNqL{ zlfd{nUaCTCib1j2{U4=1cp(Z>5R8$M7`ZN)MEGu^f|p{pPVf@@KUzpfxCiLsekoF}CK64Bg^Gf8!6GX} z6XTq4^y$6(J*t9wk@vt4QR$t!jxS0S*`WqNkt74qR$*5HTX`Kt<3a-<8RY#;9} zON;@p8$=TVd!@>LScZ7b(Vz3yTiMOWmbM$<-6tTQWG9P)$(*!mF9M64?&dt1jX?{R z)wzx|G$3NZ)+(whDyoR8s!1rQf=a21A}Nv>Vj?2C&|pbL1|dYjA&r3}iBlmdMgbI& zfq@fML@>mmq+Tky*Q3)9EgAtjox;7t(W|nsI~y$R?e09mm|@W%Q992Q!gw;w#LE>` z@3wu0SW0;c(fDc>5Gx!5b6az=)P+wev|B<1BCz#kBZ9Z9D&X1E5+yEw$c z<@VQ1-7%cA^~X~O1wU`nAD2gXqxm1ZVjv$1hpdGmP;E__Oy5?`i7O)$05I!T30FsXjo8?R3OGfN4wy1%wC)Wr<=@gfNUHD(hlH zyZy;9CRB}BVI*NA-&ENTN@#w>`>~_w$@P1;Cg3$k{An*{_pqz|aXpvlObmUBdWw0DZ7XalsDlAT|G{Z(`Nfp-WlgyEU+CIBoS~oYbI?Niqw;4E_w!!#yJ;TSZGu% z3`dS$0_b_g3Iz|VvE75^iQ#f^^Lq}5TdTZl4-^DRX5(I5?&I_F zB>WeutXf$dBA8M!D8#EQDT#!^V4VjR&(8*lWSC%BX}E2`xb;M0XW6c@sX$01bb1Y) ztZ_Hsja@+3R<>k^sL6x1vvTW8;?MHH3iN!f30&ux^Od=REO;esi1r>IRVR%d)wFFr zO7o5l6X2eC(ImmR!K|L@lS#5{t;nxb;Ax&gML z4_rzb!qxqVj^P1TO7R6t2>F?2Nb{vlqX_2SKHyxI2dd|VwH(mu(tw3Yc}|VKF4G@I%-sFT-yn8l!26~un(jYu)=aGO%lCO%hog7hc}i=Qkv4Uub&alGNl3s zwNv=L>hqNAYuSfIlsE#FXsr~b0J8xyc$FYO7N;ip#L*G0w)MT?Qix*npo=KbrpRwJ 
zteF_oPV;X*oxQt-r6ZXH24G^mC^3vKkR7=ghi1wu%3al^!z@PpM_-2xceXg2^O(naP%iQI`XBo6NFeZ<%b#qno31cFmky-)!ADI-DjOrx62q-9b~CH6&*_8Z(*}mv(}qKt%bxxQt*;>^M~#W~Gn+GyRuglyjA!cA1rb9T5lH#wP|Pk~%aCngRi z9UD2BN~(^Il6N^YIybR9oLm~DofMk870tlHIvo0UZHty!!eYZ0EN13mNUjCRy{fRp z>UChj+^M5~JSRRm(X#I1(;1kll~pa5JT1z#bZ}|7Y&T3rnHw}TYSEd&p{7k<=*!(| z)yc(y-?v*l+{)a#!jjyJG9g|P!9dFuWMlzt8-fs=60mV)gu=v@Wl@s~G-((zd#S2)R)fRAY7(U{6zFDx`q;Mwv%72z@(dKY>ti)bCqTSF*FxDL=teR+Ua!IN@ z6K2e$IH*icDcTcPwl^N1roi_mGu&}F_??M_Xg>dcdD0bL3XDoBwONH_RbnZP-m`fh z6nx9>aF0rV7;l5;yXV+JcoV_x{b8Myls0GcSGdbz;v;)>cMd zRuS-x4}5DhY{Z#t))Bi`zBekGJ=K%YaB)JlI7I!)meM#(3_38(fWW4|1iY&Hy{1ut zg4I&AVa>e5)>f~HvEJm=aG85!b23q)u!_S9!BmvG=1wT+ZYSTjw-!{Xmq+g^>X$AZ zSaM-huw^24DCU#K(ndj)qQc{#;RV`V4-vu9JUnJ1;_{%OdTtO?^qL=Bv6{kspnBdF^X$-S2q&1=xbJ2I5)U~8TKN~%F)~WnjSyn`<)rQQpVa<`DGO39SDKM2V zEMdKdifGWOhb9}EkmU_V{(GY@b9Yu7+nC&A^RW;?v6$htQF$KCr3^bxmq;gcVU8LOim(CHv zHrmR9)yiQtky5bRCfP6EAZgXyX7mP%ObRs|I*K98>X z?*^~kON4c8xcaJ}UG~^RX$PpnNi`}}MOcwag33ho+*69Hq38*$jw%&lWq_^D5$KVk zeS=vTNYtfSHE)DW2AkwGXqr1o+IuETpCL{c`qQg8gbBuM@$qhPadB~WNA2e$L%%-a zqj*qy!gIxUK-x$YC@V%Ay-AtSYLi2=?ga7Azd)!5pHFqZybzIS#;Vcx(K9pPW}f5 zoZccLbaIvUQP_0&)q|CisIam}yi{grj{2~qLm0;~gs={&QTSvH18IAZv`qMHt4E#5 ztFY0l3^46w#ghrEMvkUhI-hYjrdV%rXi4H_g|bW@cJCjr5?wzcererCH zDb}{a*rc#2q9KG}kcfx~AZ7rSBw!(u2}qP-1|z}_h2BTbiJcvZh~n|NoRUzmQj038 ztJ3ZDOYM5S-+7-FpV{y{Z@cA&Te}_Z_pW!3ViAaYWe?nEiHvb##Pq2qJt&`c`YuzO zqDb|ajJ*svse_A)?;1(b3B7G1aYwLEM$q%|_Ag z$Uk-|qu)1Asq*`idCp5z&Dry|I~29pHLlSLO~R6t28LpS)Ich#Tb!6pC5;t=rzGZC z^PN3j=hS&-`A2edLS+07a!!KElIbMgIWR^dWPs5=c_NrgUA&M!+XHDNSn5D(YJ zKEs1~Tx1~$j1Uwu1LTD4I05J(j+#)Vi8@%8SQINO%vE{I3W~Ev2*N0=EY>O#l?)v6GKAw*_fTSJ0=)CG|`;u(VduA zei)~VMh(SjDO9Y+3@a?B@5M@~WYK}4F(ln9&cg=NLnUgAEQ~`97-dP9nT8~aRZcO6 z_E?&+3c$gPA~%lW9VEa)>|u@(fw7JTk@EszCZ-q(G_My22+gca*)_2oeHe0Zm`au7 zuc#AzF$vh-dkd?RRA9ykY@WwYHYdMro6`J`nIkCdma2Nio`TKL_gtrXtM4npI*k-C zs`k?ZBdx<|Nu+HeYN*PF5_SbrS*bNf9`tv*Oq|lQlB9$;u!1p`q!?0-+WT&Rh zpxLME{twrC@1xP~W|bw0vZrWi)@zd^`8%00O%Y?`!sV|c29R8x!cACkJiI`q2!usx46 
z_-|*?+xDBM+bo3dfyGo=hmA865R>hyNb~$52f!d0go#!N7?6krYO)dJWhJMQPpz57 zG)0Y0GMr>iC4Xtr%5P~K)zeB{7+R{M^iu@GEG82y92KPg0fq`jsah~)I)N&D*7MN) z{ND{`rX)GYdD5X`c2g;<(;?h6P7+fw^rAd!rHelx;>)b}q@PO<=6<#e+f(cc;QTk4 zgS|@dWnt?oU`gROI+a!2vS}OE4#kJKm&A8us464bd_TJQ)gxa+q#dXSf)w5=&(gR8q3xpfJ#R(u1tVX%rfy);AfDC7%SLDUSw>U05ckjvfPdEE{qIi#-!O0^I?Q5^ZdFv zIP9~Rn_%&OiBH|ujbV?XMAMF@hm_fbQBthRW69mH;BEl?+mu9S)E9J2L zi|aLx3V*M-6KF>em`rh!l_QLjQTE?bC&=_RN53iX&C)_xLVn$FaZOk-)%l8L@|qY- z%4;f&E*Xcm`R6a28J}3L2~DR2!?Wv1gu&;B>ONE6(ySk9ypwLxmlQni;+CGNq%}7r z*gIH8g7C!bf5%vm+9BA3`=eTFvc463lN(q^iuIwOvc! zi4i-&*-FY5oIo#Rn9v|Z`ku^Tq@3K{(NVmyW0IuNVpWy_hfyX|EJTGPA$k}Ib zVFCq#@_F6)AmTU7CkWNL@DXR&H_WfZug z&aS-~8zCJ54&JK69Jf9vZgpMPTid@Cck7(6oYlSAHOrG{WZd34HOpsf zRu!p-2365oG)Cs>*(_Nq#+4Afuq-k?5P|j=dTw%>Z`Lksb+y2^EQ}%~ndfB4LQR(! zPe>857EevZ}--1hUrA1>AJ-YK;mA%l;So{>@6XdQxR57=$>gA9yIH4FcT#8v1sx)FoP{5 zo@9}gf;3MajxNVye2+G=GW_zYpEvQECpChVI8E4+P3zo`JS%>vgxi&t#VIUGb)^Z_ zBxO)W4p?AhT6)Sm5|NO~O(v3(%Q#H6VWiR(t25C0 zthI7A6^2$Ct7CJEbZp?R22GimOw2bNRW&!VWh_?ZnntFECk7I?v|+H~ty(<;4l{&t zYMbRo7-5=kl3}IciKH{RUc(M$YGqiI)niiCLpbI0FwvqoBKvZhN7j|S5Ssd9l2WXR zqY0sw77fP~klzuu`{z2GRp3=a#CnGu^Pza(eN~xio<1dn3FPl zSdjSS?L_9nHh~N>vPkV^_e&(5VOpxq$FK6s8Zj&3r^EEzdX4jfyh>LF_4e2zdP~3Y zGp$Qx(mHeeFT&9HV|I95CXc#It8|-DYSmKB8(58FIoULA6G%99ysGMXr=De&frHsh zDyowZs)bQvDpSz$B#bCd;5$O+lwcPtSuc_dsNmHfg5(cz>3WsIM4Vp})}I^KjgMabEt=MicR}1D$kJ4+ zAhWfD!e(M+&Q(~H?NUrTN8vn?=8flSrFj@e(vO)U5RJlh%z#9~gKAW7yRUIHR2;F4 zk`Mw6oV~0CWR=%m=O-o?-YK4!myST`kEkQF<&+ORf?){?Q2<0B#Sp71N>rC~Mqp+n z)=x=Bv55%Q4)%?$1 zrG0Arbb9Bpdd}zt5b3(9_E$6d_cXc;0ze9f2 zTCBs_o9+y`FwV^#eq45SYZ7`V0}S;iH`0%lB9*GYco2aB9Qlb5CBV8!iE$DhZ#N80 z3WmgZswu_A!)FHm@t=?r=JYQ>#fuI-y)qe+bIx9_Iym4QW1>6p%OTdDX!NtpsB(x% zKuHlhU{mKO=6qnVRQHn6kkg@JS|X>OP|_FZq|;NUGehhb<=!A0ONTw%Kmb4kD}+F4 zcbgLo4k%p)*j+bat~hnkTv}+D&#A3$28^06-qEiLVhmtonS%o$%4Vc237YG&d}7mE|G7HMy#@N3chAY; zF{-cfS5JJo|1wPudjS5cXi6*i!-n%;$3X?R>Ie?s~lrh1Ya2{!ke| zZ>gl|A#Q7o*}SH1?%OWg>@N$(5-=el2u4XG2(dyKu=PT)u61%*v2D3N>*oC~PXFL> 
zW=zDCvdStgSr$tVL5{y1^@go=*IvH6b8ULx%wMaiPgtg%{k+Mc1CTial@;!2vdtOu z`;W5va~c2;b`S&J$u+HOb@}ok+yTt=8khwL0B5RIQn00#Qk>r(ljd$@LCB;eAqYlE zAu^@zHn(E1uR!eto$FB;F@mUx|gA7=%ODnFtS+{Xp zchvk}{*E+#RIvAFTR+i-WCmJ%t6nF@j=C!nP$#!KG#&5{GdaTAoN zwPM>#ZJ_1nH_Yj$K6`n4_xE_gho|kV#=CeDfd(Uwd`*hl_c%0Lqej-8o4T_4@0t3) zzpYPq-=V13*r5p|lT)WYdhC`HUApeN?HivU|Bf!Vv^t*8wsW8wG-qi|6=>ex(e{5w z|8E@n%?X#6u87DX8HbUotoifj-QDQo#l8LS`QtJK2#|ybBpz~1u2&`lZZWNGj@iw1 z*{`=hir-i+G8X_qs#Ph?ZD>`tz4p@CO3XjiLI8$Jg0fLk6&E&6cqyl#91naay|jRK zs2B@U`z^NHVF);Vo{RaYK!{ilQk83#$+^z2S?7KzE{KS4Rzy1frqa@PpiLZras_6( z>0Nqs==?uJvv{~h5t2lS7%JOps=}p`#Ib#&+D;mD*A9I5Z{OwN)#~`4ko)&kPN>98 zjZ{@rQ$)l-MawO=y6dI*v0HDA{;!|0@~=-?1yl@U83`CLpo*fXCJLx1p{gQffN}8! zS!<#xiYgcKwJb0vbR$0+J*N#)5xKya{=<$O-Ugz-b6zfCvCFW6uj9 z+AXle3?@Ff0IqUiQaM5>TUN_nTp4m>+QWwqKYisC?e8}-v<)y(f(K%%g1L(UO>TA1 zeb-$nX`JW*dUGN!B|#F*fCgkhjN(xw%DXJBk9|FV8=Q#=m;^x1QAw?N>1AJ9;Irua zant3rSU`ZZ#c7ZL#xm=i>urT~a_n-z1yoRo0A^FfQ7k1E=XI~P5QHi4`CnarJ2=)T zzyOZLVE{)R>t*-c_u%9FK1Mo~q7VwIPe3F_6_z&lWUrF;8_z@O{->D&qU%&aR+Y0@ zM7LXWc31H}k6*9YP>8Bv0AvnirB*r$C%=uo#Ka*cU)lNYx2vciEHxR#)0`aP$Abo(7ulnaPMtUYZ}qA@Jq@iHJrdO(&bMl6gk`P~pH6wH7C;~3;EAXcw)oaa+5Ysiii zs$c+gQ}qD?dn{{lRr7==#mnh?nbw2|9aWH~m~2|>mbc^HdWq1%5t$GISqdb&*xY&F zW9*OM@fyAig+Y)2>r}0AXvb}s+2&wHGax|3=QYoOI0D;kw{F~5WZ27oU-kNM_don- z^+-f?#z7pK+>Lr!Wt356CRqCajbMZMylKA;hCvDnLIAlzjF`#Hvn)+C)H2ear=HJB zvbhvQpsTI4rc7f)ZgZ}-*w&vzw8imiEe1D0Pe~OL>#b`ow;VmRhzWy{N(D4Y<;vPp zn%KUmcGae`E-oWNw#`E)P7_FsMYp?>2$JaJf706cXY$k*0|1B_8M4H) zQrm5|+hfHA$;hZ@NI_!q#!Rhh_tvn5WOGnFpbVf=k*h9ibGomse%}{T+eljhkwvzq znK9|dX6CED&kiJk0t`@t6I!Uk0>TVqa61(t2vU^U=g5T!vKb1I0HTyb_w5X_ygB&w zc|NC@d0L2wAR-`!LaC}jkSJyVAbDj0OI~Ts&Yb%1St*b(0LlP3#mu#pb!!M#lD7%} zq`;w&%2IwnR;{fT+V{S4`dS$K!DA{r zE86$I`{NrsG86TAe`1AM1d|r<6dRiJ=dUfh7j0$5?C<=(=h*om0F{|?rwLe6CR)~p zw%c)C8+cxe&36_43;H+pim~x54wa zS^zs##7k|q%GK~9fdeo^Z+nBCPzG=xC%#ukF^p})cfk1_U*q4r`zkCX2*89!BoZ3; zwl+34HS5!({~rUs=5n*9bc3HJOjT#kodhFch8|LaKCjb*%kkL&frc1aXvn!5MHE_E zofQ;ON}u_+>JWpM|ocy<_>pP18D+aySj#LU8|8fGJL!FE$dDh| 
z%y}O!3q|Bc3I?Hcmn|ASCI>u$Ow(g08DQW`PLc!8 zArK)`5U{iX7FgUCz14kVqtR|;Q=T$_M5QT4x|8bkZU?#GSVRCC0IG#)K{B3uod{=) zV;J=Q2Um=M`_v!<#Z|9bQya440XrYCqM&ul@xotz4;l*30mh3LpU07Y>_DMu(7saL)4 zd+A!&vG>2DU&}8TL^vrz7Ryu#6ED8~5Y@ghzFV+g@!@qJ%kl3S%F?8g_1$Y)w5iJ= zt@eASdESa3fIvX0fk3&3E_&;}`fN)(wMlR~Cjtb0@>a+pN=UAD6Dx%O*KpfNx0RNYoM){hatOuCnpmHo1r(~57P$EkN1SIA# zNdZF-+st5Z3LTP01gVgHD(XWE$@OGPJ1!~mtcH7=u-_y%5jm)|*K6$pJBP&so)WqC z*r;!RSKMdb^&a=weE3g!_r3I!+c40h!{zvWW*vVLyF@P8HIfZt(iVNzZnA&pzmLfu zweXeBx2-t12ks3iqp(<*RuKU9I+Fs(YFQk9y0gX-WKuuTM^%y2|B zteqrcc<{dxn{J%OYPpKW{iJ@CD_ zA>H}ho^#w$s+4I;G>B+`_;JI^cX3)1<^TQjzu%B5=VbZ8&+~Mj2=IUNIz4`wNf{5_ zt5wiL_DrEwKwi_MwXXpC59kF&Yg%0sM~i!#MMb?g`&>TTL$iG}%*IOputAIfTvslj z;EY5#9QDs9#JMklfX?6TuGRykgsPcl3S$k37q7k>?n|k93x(r@zv6RZCFrHk3~TYr zUM=H2kTruBc?dK~s^blND~43Y?z$4$KcFbab{?}0FFipZhxw^(oP!^;+96noRYuBM z{J(wU{pYjHdk?PvI3J1Rd;ewozSuU^L4p3^5(u2%{=T=+f8XCo;oVl)N!%_M$P~6t zn)52jmJUUXG|-FBGX!$ARk69a6j}TiRmzY)Orw; zJ91{%lXUJ-UavQYSB4Q8R}F-=boPU+e7l1`hYXqZGUA;r%bdY9Y8f0&K03hCtvWs`h1{l^}pnYAdRr7e5`$SmGO(7@f?uc-WDN*PH^>F!#8pb99Ej?m*@ke zbK@zldB&I8{EG))_b^=3)IJ%TrAGDp6uOPF+d!uM69?;VY-*>n7BXN9hxl3+5m{u+ zl;oDK-#4@M7qVZTjQD>^d^+YU;@4icU)lU`vOo+jAUbsUYM7X;E5UHDoB&`9gzN7# zk@SZ-ABeG_ZH#-t!$SaJrQcDy86$dEl=d=vT(HcHov&(d0toa&eUbYhfDnY$&fxp@ z-v}ERDSZ##fC%^9X{l8E9r~ls1^Mj=D ac{{v3jSTG^=7&T7#oUoj6eJa+k8yxIZJHkd diff --git a/man/as_epi_archive.Rd b/man/as_epi_archive.Rd index 77ae1868..04b8cff7 100644 --- a/man/as_epi_archive.Rd +++ b/man/as_epi_archive.Rd @@ -58,7 +58,7 @@ would be equivalent to: }\if{html}{\out{}} } \examples{ -df <- data.frame (geo_value = c(replicate(2, "ca"), replicate(2, "fl")), +df <- data.frame (geo_value = c(rep("ca", 2), rep("fl", 2)), county = c(1, 3, 2, 5), time_value = c("2020-06-01", "2020-06-02", diff --git a/man/epix_as_of.Rd b/man/epix_as_of.Rd index 969ae868..432ffc33 100644 --- a/man/epix_as_of.Rd +++ b/man/epix_as_of.Rd @@ -40,6 +40,22 @@ is equivalent to: }\if{html}{\out{}} } \examples{ -epix_as_of(x = archive_cases_dv, - max_version 
= max(archive_cases_dv$DT$version)) + +range(archive_cases_dv$DT$version) # 2020-06-02 -- 2020-06-15 + +epix_as_of(x = archive_cases_dv, + max_version = as.Date("2020-06-12")) + +# When fetching a snapshot as of the latest version with update data in the +# archive, a warning is issued as this update data might not yet be finalized +# (for example, if data versions are labeled with dates, these versions might be +# overwritten throughout the day if the data can be updated multiple times per +# day; when we build an archive based on special update-data queries all made at +# the same time, the latest available update might still be subject to change, +# but previous versions should be finalized). We can muffle such warnings with +# the following pattern: +withCallingHandlers({ + epix_as_of(x = archive_cases_dv, + max_version = max(archive_cases_dv$DT$version)) +}, epiprocess__snapshot_as_of_last_update_version = function(wrn) invokeRestart("muffleWarning")) } diff --git a/man/epix_merge.Rd b/man/epix_merge.Rd index b688bd06..6fb59d92 100644 --- a/man/epix_merge.Rd +++ b/man/epix_merge.Rd @@ -49,10 +49,10 @@ is equivalent to: # create two example epi_archive datasets x <- archive_cases_dv$DT \%>\% dplyr::select(geo_value,time_value,version,case_rate) \%>\% - as_epi_archive() + as_epi_archive(compactify=TRUE) y <- archive_cases_dv$DT \%>\% dplyr::select(geo_value,time_value,version,percent_cli) \%>\% - as_epi_archive() + as_epi_archive(compactify=TRUE) # a full join stored in x epix_merge(x, y, all = TRUE) diff --git a/tests/testthat/test-compactify.R b/tests/testthat/test-compactify.R index 8ccdeee5..7bcb0fe6 100644 --- a/tests/testthat/test-compactify.R +++ b/tests/testthat/test-compactify.R @@ -48,7 +48,7 @@ dt <- row_replace(74,73,74) # Not LOCF dt_true <- as_tibble(as_epi_archive(dt,compactify=TRUE)$DT) dt_false <- as_tibble(as_epi_archive(dt,compactify=FALSE)$DT) -dt_null <- as_tibble(as_epi_archive(dt,compactify=NULL)$DT) +dt_null <- 
suppressWarnings(as_tibble(as_epi_archive(dt,compactify=NULL)$DT)) test_that("Warning for LOCF with compactify as NULL", { expect_warning(as_epi_archive(dt,compactify=NULL)) @@ -73,12 +73,14 @@ test_that("as_of utilizes LOCF even after removal of LOCF values",{ ea_true <- as_epi_archive(dt,compactify=TRUE) ea_false <- as_epi_archive(dt,compactify=FALSE) - epix_as_of(ea_true,max(ea_true$DT$version)) - # Row 22, an LOCF row corresponding to the latest version, but for the # date 2020-06-02, is omitted in ea_true - as_of_true <- ea_true$as_of(max(ea_true$DT$version)) - as_of_false <- ea_false$as_of(max(ea_false$DT$version)) + expect_warning({ + as_of_true <- ea_true$as_of(max(ea_true$DT$version)) + }, class = "epiprocess__snapshot_as_of_last_update_version") + expect_warning({ + as_of_false <- ea_false$as_of(max(ea_false$DT$version)) + }, class = "epiprocess__snapshot_as_of_last_update_version") expect_identical(as_of_true,as_of_false) }) From 1be2b373b37a2d82aaeaac262af0da992b7bb940 Mon Sep 17 00:00:00 2001 From: kenmawer Date: Tue, 21 Jun 2022 10:25:35 -0700 Subject: [PATCH 47/96] Added a feature for time of compactify. --- vignettes/compactify.Rmd | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 vignettes/compactify.Rmd diff --git a/vignettes/compactify.Rmd b/vignettes/compactify.Rmd new file mode 100644 index 00000000..e69de29b From 9a5d0a74ccd05d6ca29b2fac12d86b9ff519b8f2 Mon Sep 17 00:00:00 2001 From: kenmawer Date: Tue, 21 Jun 2022 10:39:07 -0700 Subject: [PATCH 48/96] Basic template (not documented yet). 
--- vignettes/compactify.Rmd | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/vignettes/compactify.Rmd b/vignettes/compactify.Rmd index e69de29b..576a3650 100644 --- a/vignettes/compactify.Rmd +++ b/vignettes/compactify.Rmd @@ -0,0 +1,12 @@ +--- +title: Compactify to remove LOCF values +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{Compactify to remove LOCF values} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +```{r} +print("Hello world!") +``` \ No newline at end of file From 1d455c03c98b5437814146992821c0c1bc364d22 Mon Sep 17 00:00:00 2001 From: kenmawer Date: Tue, 21 Jun 2022 11:11:10 -0700 Subject: [PATCH 49/96] Updated broken <<<<< code. --- DESCRIPTION | 3 --- R/archive.R | 21 +-------------------- man/epi_slide.Rd | 6 ++++-- 3 files changed, 5 insertions(+), 25 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 7a8d12e0..9de225ba 100755 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -54,7 +54,4 @@ Roxygen: list(markdown = TRUE) RoxygenNote: 7.2.0 Depends: R (>= 2.10) -<<<<<<< HEAD -======= URL: https://cmu-delphi.github.io/epiprocess/ ->>>>>>> d16d8890a0445c133fd34b20fce68addf947c4be diff --git a/R/archive.R b/R/archive.R index 7f4caab6..c9167f2a 100644 --- a/R/archive.R +++ b/R/archive.R @@ -114,11 +114,7 @@ epi_archive = #' @return An `epi_archive` object. 
#' @importFrom data.table as.data.table key setkeyv initialize = function(x, geo_type, time_type, other_keys, -<<<<<<< HEAD additional_metadata, compactify) { -======= - additional_metadata) { ->>>>>>> d16d8890a0445c133fd34b20fce68addf947c4be # Check that we have a data frame if (!is.data.frame(x)) { Abort("`x` must be a data frame.") @@ -144,13 +140,7 @@ epi_archive = if (missing(time_type)) { time_type = guess_time_type(x$time_value) } -<<<<<<< HEAD - - # Conduct checks on keys variables and metadata -======= - # Finish off with small checks on keys variables and metadata ->>>>>>> d16d8890a0445c133fd34b20fce68addf947c4be if (missing(other_keys)) other_keys = NULL if (missing(additional_metadata)) additional_metadata = list() if (!all(other_keys %in% names(x))) { @@ -164,17 +154,13 @@ epi_archive = Warn("`additional_metadata` names overlap with existing metadata fields \"geo_type\", \"time_type\".") } -<<<<<<< HEAD # Finish off with compactify if (missing(compactify)) { compactify = NULL } else if (!rlang::is_bool(compactify) && !rlang::is_null(compactify)) { Abort("compactify must be boolean or null.") - } - -======= ->>>>>>> d16d8890a0445c133fd34b20fce68addf947c4be + } # Create the data table; if x was an un-keyed data.table itself, # then the call to as.data.table() will fail to set keys, so we # need to check this, then do it manually if needed @@ -559,14 +545,9 @@ epi_archive = #' time_type = "day", #' other_keys = "county") as_epi_archive = function(x, geo_type, time_type, other_keys, -<<<<<<< HEAD additional_metadata = list(),compactify = NULL) { epi_archive$new(x, geo_type, time_type, other_keys, additional_metadata, compactify) -======= - additional_metadata = list()) { - epi_archive$new(x, geo_type, time_type, other_keys, additional_metadata) ->>>>>>> d16d8890a0445c133fd34b20fce68addf947c4be } #' Test for `epi_archive` format diff --git a/man/epi_slide.Rd b/man/epi_slide.Rd index 90d94224..903cb017 100644 --- a/man/epi_slide.Rd +++ b/man/epi_slide.Rd @@ 
-123,7 +123,9 @@ through the \code{new_col_name} argument. When \code{f} is a named function with arguments, if a tibble with an unnamed grouping variable is passed in as the method argument to \code{f}, include a parameter for the grouping-variable in \verb{function()} just prior to -specifying the method to prevent that from being overridden. For example:\preformatted{# Construct an tibble with an unnamed grouping variable +specifying the method to prevent that from being overridden. For example: + +\if{html}{\out{
}}\preformatted{# Construct an tibble with an unnamed grouping variable edf = bind_rows(tibble(geo_value = "ak", time_value = as.Date("2020-01-01") + 1:10, x1=1:10, y=1:10 + rnorm(10L))) \%>\% as_epi_df() @@ -136,7 +138,7 @@ edf \%>\% group_by(geo_value) \%>\% epi_slide(function(x, g, method="qr", ...) tibble(model=list( lm(y ~ x1, x, method=method))), n=7L) -} +}\if{html}{\out{
}} } \examples{ # slide a 7-day trailing average formula on cases From f967522f9d344a4a9056d2a5c8354e07bfa28bd9 Mon Sep 17 00:00:00 2001 From: kenmawer Date: Tue, 21 Jun 2022 11:33:24 -0700 Subject: [PATCH 50/96] Also need to commit this. --- man/epi_slide.Rd | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/man/epi_slide.Rd b/man/epi_slide.Rd index 90d94224..903cb017 100644 --- a/man/epi_slide.Rd +++ b/man/epi_slide.Rd @@ -123,7 +123,9 @@ through the \code{new_col_name} argument. When \code{f} is a named function with arguments, if a tibble with an unnamed grouping variable is passed in as the method argument to \code{f}, include a parameter for the grouping-variable in \verb{function()} just prior to -specifying the method to prevent that from being overridden. For example:\preformatted{# Construct an tibble with an unnamed grouping variable +specifying the method to prevent that from being overridden. For example: + +\if{html}{\out{
}}\preformatted{# Construct an tibble with an unnamed grouping variable edf = bind_rows(tibble(geo_value = "ak", time_value = as.Date("2020-01-01") + 1:10, x1=1:10, y=1:10 + rnorm(10L))) \%>\% as_epi_df() @@ -136,7 +138,7 @@ edf \%>\% group_by(geo_value) \%>\% epi_slide(function(x, g, method="qr", ...) tibble(model=list( lm(y ~ x1, x, method=method))), n=7L) -} +}\if{html}{\out{
}} } \examples{ # slide a 7-day trailing average formula on cases From 8ec0b0ba1f26abbce07afc5c880b9abeb58da56a Mon Sep 17 00:00:00 2001 From: kenmawer Date: Tue, 21 Jun 2022 11:34:20 -0700 Subject: [PATCH 51/96] Fixed accidentally deleted comment. --- R/archive.R | 2 ++ 1 file changed, 2 insertions(+) diff --git a/R/archive.R b/R/archive.R index 90c77fd2..863accea 100644 --- a/R/archive.R +++ b/R/archive.R @@ -140,6 +140,8 @@ epi_archive = if (missing(time_type)) { time_type = guess_time_type(x$time_value) } + + # Finish off with small checks on keys variables and metadata if (missing(other_keys)) other_keys = NULL if (missing(additional_metadata)) additional_metadata = list() if (!all(other_keys %in% names(x))) { From 4ec8bbfe04bafbb93b58f59d4731c619c60dd6a5 Mon Sep 17 00:00:00 2001 From: kenmawer Date: Tue, 21 Jun 2022 11:34:41 -0700 Subject: [PATCH 52/96] Fixed formatting issue. --- R/archive.R | 1 + 1 file changed, 1 insertion(+) diff --git a/R/archive.R b/R/archive.R index c9167f2a..6f808a30 100644 --- a/R/archive.R +++ b/R/archive.R @@ -140,6 +140,7 @@ epi_archive = if (missing(time_type)) { time_type = guess_time_type(x$time_value) } + # Finish off with small checks on keys variables and metadata if (missing(other_keys)) other_keys = NULL if (missing(additional_metadata)) additional_metadata = list() From f93bd71a74e490924e39f11ccf5969d4d1814975 Mon Sep 17 00:00:00 2001 From: kenmawer Date: Tue, 21 Jun 2022 11:36:41 -0700 Subject: [PATCH 53/96] This new branch is for test rectification due to new datasets. I also noticed some call undefined datasets. --- tests/testthat/test-compactify.R | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/testthat/test-compactify.R b/tests/testthat/test-compactify.R index 8ccdeee5..6b677d4d 100644 --- a/tests/testthat/test-compactify.R +++ b/tests/testthat/test-compactify.R @@ -2,6 +2,8 @@ library(epiprocess) library(data.table) library(dplyr) +# RECTIFY TESTS FOR NEW DATASETS AND TO ENSURE THEY PASS!!! 
+ dt <- archive_cases_dv$DT test_that("Input for compactify must be NULL or a boolean", { expect_error(as_epi_archive(dv_duplicated,compactify="no")) From 2424251c0d7cbd01ab90401c9fee929bf874b785 Mon Sep 17 00:00:00 2001 From: kenmawer Date: Tue, 21 Jun 2022 14:05:52 -0700 Subject: [PATCH 54/96] Updated test to use new dataset similar to old one. --- tests/testthat/test-compactify.R | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/tests/testthat/test-compactify.R b/tests/testthat/test-compactify.R index 6b677d4d..15bace1d 100644 --- a/tests/testthat/test-compactify.R +++ b/tests/testthat/test-compactify.R @@ -4,12 +4,15 @@ library(dplyr) # RECTIFY TESTS FOR NEW DATASETS AND TO ENSURE THEY PASS!!! -dt <- archive_cases_dv$DT +dt <- archive_cases_dv_subset$DT +dt <- filter(dt,geo_value == "ca") %>% + filter(version <= "2020-06-15") %>% + select(-case_rate_7d_av) + test_that("Input for compactify must be NULL or a boolean", { expect_error(as_epi_archive(dv_duplicated,compactify="no")) }) -dt <- filter(dt,geo_value == "ca") dt$percent_cli <- c(1:80) dt$case_rate <- c(1:80) @@ -57,7 +60,7 @@ test_that("Warning for LOCF with compactify as NULL", { }) test_that("No warning when there is no LOCF", { - expect_warning(as_epi_archive(dt[1:10,],compactify=NULL),NA) + expect_warning(as_epi_archive(dt[1:5],compactify=NULL),NA) }) test_that("LOCF values are ignored with compactify=FALSE", { From 07bce5078316b49a43dc2e525ac1b099b449625f Mon Sep 17 00:00:00 2001 From: kenmawer Date: Tue, 21 Jun 2022 15:59:26 -0700 Subject: [PATCH 55/96] Broke up the archive vignette to create a vignette for compactify, which includes performances with and without LOCF applied to data. 
--- vignettes/archive.Rmd | 34 +++-------------------------- vignettes/compactify.Rmd | 47 ++++++++++++++++++++++++++++++++++++++-- 2 files changed, 48 insertions(+), 33 deletions(-) diff --git a/vignettes/archive.Rmd b/vignettes/archive.Rmd index ffc6fc55..420e0bda 100644 --- a/vignettes/archive.Rmd +++ b/vignettes/archive.Rmd @@ -66,7 +66,9 @@ has (at least) the following columns: As we can see from the above, the data frame returned by `delphi.epidata::covidcast()` has the columns required for the `epi_archive` format, with `issue` playing the role of `version`. We can now use -`as_epi_archive()` to bring it into `epi_archive` format. +`as_epi_archive()` to bring it into `epi_archive` format. For removal of +LOCF values using `as_epi_archive` using compactify, please refer to the +compactify vignette. ```{r, eval=FALSE} x <- dv %>% @@ -124,36 +126,6 @@ To make a copy, we can use the `clone()` method for an R6 class, as in `y <- x$clone()`. You can read more about reference semantics in Hadley Wickham's [Advanced R](https://adv-r.hadley.nz/r6.html#r6-semantics) book. -## Removing LOCF data to save space - -We need not even store rows that look like the last observation carried forward, -as they use up extra space. Furthermore, we already apply LOCF in the -epi_archive-related functions, such that the results should be the same with or -without the rows, aas long as use code does not rely on directly modifying -epi_archive's fields in a way that expects all the original records and breaks -if they are trimmed down. - -There are three -different values that can be asssigned to `compactify`: - -* No argument: Does not put in LOCF values, and prints a list of LOCF values -that have been omitted but could have been placed. -* `TRUE`: Does not put in LOCF values, but doesn't print anything relating -to which values have been omitted. -* `FALSE`: Includes LOCF values. 
- -For this example, we have one chart using LOCF values, while another doesn't -use them to illustrate LOCF. Notice how the head of the first dataset differs -from the second from the third value included. - -```{r} -locf_omitted <- as_epi_archive(archive_cases_dv$DT) -locf_included <- as_epi_archive(archive_cases_dv$DT,compactify = FALSE) - -head(locf_omitted$DT) -head(locf_included$DT) -``` - ## Some details on metadata The following pieces of metadata are included as fields in an `epi_archive` diff --git a/vignettes/compactify.Rmd b/vignettes/compactify.Rmd index 576a3650..3379de64 100644 --- a/vignettes/compactify.Rmd +++ b/vignettes/compactify.Rmd @@ -7,6 +7,49 @@ vignette: > %\VignetteEncoding{UTF-8} --- +## Removing LOCF data to save space + +We need not even store rows that look like the last observation carried forward, +as they use up extra space. Furthermore, we already apply LOCF in the +epi_archive-related functions, such that the results should be the same with or +without the rows, as long as user code does not rely on directly modifying +epi_archive's fields in a way that expects all the original records and breaks +if they are trimmed down. + +There are three +different values that can be assigned to `compactify`: + +* No argument: Does not put in LOCF values, and prints the first six LOCF values +that have been omitted but could have been placed. +* `TRUE`: Does not put in LOCF values, but doesn't print anything relating +to which values have been omitted. +* `FALSE`: Includes LOCF values.
+ +```{r} +dt <- archive_cases_dv_subset$DT + +locf_omitted <- as_epi_archive(dt) +locf_included <- as_epi_archive(dt,compactify = FALSE) + +head(locf_omitted$DT) +head(locf_included$DT) +``` + +LOCF can mar the performance of dataset operations, as evidenced with the +system time. + ```{r} -print("Hello world!") -``` \ No newline at end of file +iterate_test <- function(my_dt) { + for (i in 1:200) { + filter(dt,version >= as.Date("2020-01-01") + i) + } +} + +system.time(iterate_test(locf_included$DT)) + +system.time(iterate_test(locf_omitted$DT)) +``` From a309f0b3ac998bcd6d5406572b625a398a66ecd7 Mon Sep 17 00:00:00 2001 From: kenmawer Date: Tue, 21 Jun 2022 16:23:05 -0700 Subject: [PATCH 56/96] Updated incorrect code and vignette for more details. --- vignettes/compactify.Rmd | 35 ++++++++++++++++++++++++++++------- 1 file changed, 28 insertions(+), 7 deletions(-) diff --git a/vignettes/compactify.Rmd b/vignettes/compactify.Rmd index 3379de64..c2a04196 100644 --- a/vignettes/compactify.Rmd +++ b/vignettes/compactify.Rmd @@ -39,17 +39,38 @@ head(locf_omitted$DT) head(locf_included$DT) ``` -LOCF can mar the performance of dataset operations, as evidenced with the -system time.
```{r} -iterate_test <- function(my_dt) { - for (i in 1:200) { - filter(dt,version >= as.Date("2020-01-01") + i) +dt2 <- select(dt,-percent_cli) + +locf_included_2 <- as_epi_archive(dt2,compactify=FALSE) +locf_omitted_2 <- as_epi_archive(dt2,compactify=TRUE) + +nrow(locf_included_2$DT) +nrow(locf_omitted_2$DT) + +iterate_filter <- function(my_ea) { + for (i in 1:500) { + dplyr::filter(my_ea$DT,version >= as.Date("2020-01-01") + i) + } +} + +# Performance of filtering +system.time(iterate_filter(locf_included_2)) +system.time(iterate_filter(locf_omitted_2)) + +iterate_as_of <- function(my_ea) { + for (i in 1:500) { + my_ea$as_of(min(my_ea$DT$time_value) + i) } } -system.time(iterate_test(locf_included$DT)) +# Performance of as_of +system.time(iterate_as_of(locf_included_2)) +system.time(iterate_as_of(locf_omitted_2)) + -system.time(iterate_test(locf_omitted$DT)) ``` From 78f3dcaabeccd1be6caf6ef2811ea8daed8673f5 Mon Sep 17 00:00:00 2001 From: kenmawer Date: Tue, 21 Jun 2022 16:36:17 -0700 Subject: [PATCH 57/96] Removed comment. --- tests/testthat/test-compactify.R | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/testthat/test-compactify.R b/tests/testthat/test-compactify.R index 15bace1d..41e5fa69 100644 --- a/tests/testthat/test-compactify.R +++ b/tests/testthat/test-compactify.R @@ -2,8 +2,6 @@ library(epiprocess) library(data.table) library(dplyr) -# RECTIFY TESTS FOR NEW DATASETS AND TO ENSURE THEY PASS!!! - dt <- archive_cases_dv_subset$DT dt <- filter(dt,geo_value == "ca") %>% filter(version <= "2020-06-15") %>% From a24dc213a595c5f5f52f8a78a8285ccae6f75194 Mon Sep 17 00:00:00 2001 From: kenmawer Date: Tue, 21 Jun 2022 16:54:46 -0700 Subject: [PATCH 58/96] Replaced incorrect dataset. 
--- tests/testthat/test-compactify.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/testthat/test-compactify.R b/tests/testthat/test-compactify.R index 41e5fa69..51037f96 100644 --- a/tests/testthat/test-compactify.R +++ b/tests/testthat/test-compactify.R @@ -8,7 +8,7 @@ dt <- filter(dt,geo_value == "ca") %>% select(-case_rate_7d_av) test_that("Input for compactify must be NULL or a boolean", { - expect_error(as_epi_archive(dv_duplicated,compactify="no")) + expect_error(as_epi_archive(dt,compactify="no")) }) dt$percent_cli <- c(1:80) From d7c56d3d3305adbf3d5eee4b9240f7e8fefc7092 Mon Sep 17 00:00:00 2001 From: kenmawer Date: Wed, 22 Jun 2022 11:59:58 -0700 Subject: [PATCH 59/96] Updated documentation. --- R/archive.R | 4 +-- vignettes/compactify.Rmd | 57 +++++++++++++++++++++++++++++++--------- 2 files changed, 46 insertions(+), 15 deletions(-) diff --git a/R/archive.R b/R/archive.R index 863accea..94420898 100644 --- a/R/archive.R +++ b/R/archive.R @@ -180,12 +180,12 @@ epi_archive = # Checks for LOCF's in a data frame rm_locf <- function(df) { - filter(df,if_any(c(everything(),-version),~ !is_locf(.))) + dplyr::filter(df,if_any(c(everything(),-version),~ !is_locf(.))) } # Keeps LOCF values, such as to be printed keep_locf <- function(df) { - filter(df,if_all(c(everything(),-version),~ is_locf(.))) + dplyr::filter(df,if_all(c(everything(),-version),~ is_locf(.))) } # Runs compactify on data frame diff --git a/vignettes/compactify.Rmd b/vignettes/compactify.Rmd index c2a04196..b0b755d4 100644 --- a/vignettes/compactify.Rmd +++ b/vignettes/compactify.Rmd @@ -39,38 +39,69 @@ head(locf_omitted$DT) head(locf_included$DT) ``` -LOCF can mar the performance of dataset operations. As we can see, both 500 -iterations of `filter` and 500 iterations of `as_of` are both faster when -the LOCF values are omitted. +LOCF can mar the performance of dataset operations. 
As the column +`case_rate_7d_av` has many more LOCF values than `percent_cli`, we will omit the +`percent_cli` column for comparing performance. ```{r} dt2 <- select(dt,-percent_cli) locf_included_2 <- as_epi_archive(dt2,compactify=FALSE) locf_omitted_2 <- as_epi_archive(dt2,compactify=TRUE) +``` + +We can see how large each dataset is to better understand why LOCF uses up +space that may slow down performance. +```{r} nrow(locf_included_2$DT) nrow(locf_omitted_2$DT) +``` + +As we can see, performing 200 iterations of `dplyr::filter` is faster when the +LOCF values are omitted. What would happen if we filter the data with base R's +`subset` instead? Again, 200 iterations would happen more quickly without the +LOCF values. **NOTE:** While this is omitted here, one may also be willing to +compare performance of filtering with `[]` that would be similar to how one +would filter data in Python. + +```{r} +# Performance of filtering iterate_filter <- function(my_ea) { - for (i in 1:500) { + for (i in 1:1000) { dplyr::filter(my_ea$DT,version >= as.Date("2020-01-01") + i) } } -# Performance of filtering -system.time(iterate_filter(locf_included_2)) -system.time(iterate_filter(locf_omitted_2)) +elapsed_time <- function(fx) c(system.time(fx))[[3]] +speed_test <- function(f,name) { + data.frame( + operation = name, + locf=elapsed_time(f(locf_included_2)), + no_locf=elapsed_time(f(locf_omitted_2)) + ) +} + +speed_test(iterate_filter,"filter_1000x") + +``` + +```{r} +# Performance of as_of iterated 200 times iterate_as_of <- function(my_ea) { - for (i in 1:500) { - my_ea$as_of(min(my_ea$DT$time_value) + i) + for (i in 1:1000) { + my_ea$as_of(min(my_ea$DT$time_value) + i - 1000) } } -# Performance of as_of -system.time(iterate_as_of(locf_included_2)) -system.time(iterate_as_of(locf_omitted_2)) +speed_test(iterate_as_of,"as_of_1000x") +# Performance of slide +slide_median <- function(my_ea) { + my_ea$slide(median = median(case_rate_7d_av)) +} -``` 
+speed_test(slide_median,"slide_median") +``` \ No newline at end of file From 2f8735a4a303bb83ce40619df00491f4b4adbfaf Mon Sep 17 00:00:00 2001 From: kenmawer Date: Wed, 22 Jun 2022 13:56:28 -0700 Subject: [PATCH 60/96] Updated this to show a plot of times with and without LOCF. --- vignettes/compactify.Rmd | 30 ++++++++++++++++++++---------- 1 file changed, 20 insertions(+), 10 deletions(-) diff --git a/vignettes/compactify.Rmd b/vignettes/compactify.Rmd index b0b755d4..3cedf332 100644 --- a/vignettes/compactify.Rmd +++ b/vignettes/compactify.Rmd @@ -30,6 +30,8 @@ use them to illustrate LOCF. Notice how the head of the first dataset differs from the second from the third value included. ```{r} +library(dplyr) + dt <- archive_cases_dv_subset$DT locf_omitted <- as_epi_archive(dt) @@ -60,17 +62,13 @@ nrow(locf_omitted_2$DT) As we can see, performing 200 iterations of `dplyr::filter` is faster when the -LOCF values are omitted. What would happen if we filter the data with base R's -`subset` instead? Again, 200 iterations would happen more quickly without the -LOCF values. **NOTE:** While this is omitted here, one may also be willing to -compare performance of filtering with `[]` that would be similar to how one -would filter data in Python. +LOCF values are omitted. ```{r} # Performance of filtering iterate_filter <- function(my_ea) { for (i in 1:1000) { - dplyr::filter(my_ea$DT,version >= as.Date("2020-01-01") + i) + filter(my_ea$DT,version >= as.Date("2020-01-01") + i) } } @@ -84,10 +82,12 @@ speed_test <- function(f,name) { ) } -speed_test(iterate_filter,"filter_1000x") +speeds <- speed_test(iterate_filter,"filter_1000x") ``` +We would also like to measure the speed of `epi_archive` methods. 
+ ```{r} # Performance of as_of iterated 200 times iterate_as_of <- function(my_ea) { @@ -96,12 +96,22 @@ iterate_as_of <- function(my_ea) { } } -speed_test(iterate_as_of,"as_of_1000x") +speeds <- rbind(speeds, speed_test(iterate_as_of,"as_of_1000x")) # Performance of slide slide_median <- function(my_ea) { my_ea$slide(median = median(case_rate_7d_av)) } -speed_test(slide_median,"slide_median") -``` \ No newline at end of file +speeds <- rbind(speeds, speed_test(slide_median,"slide_median")) +``` +Here is a detailed performance comparison: + +```{r} +speeds_tidy <- gather(speeds,key="is_locf",value="time_in_s",locf,no_locf) + +library(ggplot2) + +ggplot(speeds_tidy) + + geom_bar(aes(x=is_locf,y=time_in_s,fill=operation),stat = "identity") +``` From 4ac9413f620fd8ececa38041b9778f15b31a3936 Mon Sep 17 00:00:00 2001 From: kenmawer Date: Wed, 29 Jun 2022 16:54:09 -0700 Subject: [PATCH 61/96] km-compactifify_rectify withouot any errors. --- R/methods-epi_archive.R | 14 +++++++------- man/epix_as_of.Rd | 8 ++++---- man/epix_merge.Rd | 6 +++--- 3 files changed, 14 insertions(+), 14 deletions(-) diff --git a/R/methods-epi_archive.R b/R/methods-epi_archive.R index 876510f8..14888e08 100644 --- a/R/methods-epi_archive.R +++ b/R/methods-epi_archive.R @@ -35,9 +35,9 @@ #' @export #' @examples #' -#' range(archive_cases_dv$DT$version) # 2020-06-02 -- 2020-06-15 +#' range(archive_cases_dv_subset$DT$version) # 2020-06-02 -- 2021-12-01 #' -#' epix_as_of(x = archive_cases_dv, +#' epix_as_of(x = archive_cases_dv_subset, #' max_version = as.Date("2020-06-12")) #' #' # When fetching a snapshot as of the latest version with update data in the @@ -49,8 +49,8 @@ #' # but previous versions should be finalized). 
We can muffle such warnings with #' # the following pattern: #' withCallingHandlers({ -#' epix_as_of(x = archive_cases_dv, -#' max_version = max(archive_cases_dv$DT$version)) +#' epix_as_of(x = archive_cases_dv_subset, +#' max_version = max(archive_cases_dv_subset$DT$version)) #' }, epiprocess__snapshot_as_of_last_update_version = function(wrn) invokeRestart("muffleWarning")) epix_as_of = function(x, max_version, min_time_value = -Inf) { if (!inherits(x, "epi_archive")) Abort("`x` must be of class `epi_archive`.") @@ -94,10 +94,10 @@ epix_as_of = function(x, max_version, min_time_value = -Inf) { #' @export #' @examples #' # create two example epi_archive datasets -#' x <- archive_cases_dv$DT %>% -#' dplyr::select(geo_value,time_value,version,case_rate) %>% +#' x <- archive_cases_dv_subset$DT %>% +#' dplyr::select(geo_value,time_value,version,case_rate_7d_av) %>% #' as_epi_archive(compactify=TRUE) -#' y <- archive_cases_dv$DT %>% +#' y <- archive_cases_dv_subset$DT %>% #' dplyr::select(geo_value,time_value,version,percent_cli) %>% #' as_epi_archive(compactify=TRUE) #' # a full join stored in x diff --git a/man/epix_as_of.Rd b/man/epix_as_of.Rd index f4f02996..e8ca2f47 100644 --- a/man/epix_as_of.Rd +++ b/man/epix_as_of.Rd @@ -45,9 +45,9 @@ epix_as_of(x = archive_cases_dv_subset, max_version = max(archive_cases_dv_subset$DT$version)) -range(archive_cases_dv$DT$version) # 2020-06-02 -- 2020-06-15 +range(archive_cases_dv_subset$DT$version) # 2020-06-02 -- 2021-12-01 -epix_as_of(x = archive_cases_dv, +epix_as_of(x = archive_cases_dv_subset, max_version = as.Date("2020-06-12")) # When fetching a snapshot as of the latest version with update data in the @@ -59,7 +59,7 @@ epix_as_of(x = archive_cases_dv, # but previous versions should be finalized). 
We can muffle such warnings with # the following pattern: withCallingHandlers({ - epix_as_of(x = archive_cases_dv, - max_version = max(archive_cases_dv$DT$version)) + epix_as_of(x = archive_cases_dv_subset, + max_version = max(archive_cases_dv_subset$DT$version)) }, epiprocess__snapshot_as_of_last_update_version = function(wrn) invokeRestart("muffleWarning")) } diff --git a/man/epix_merge.Rd b/man/epix_merge.Rd index 1c601161..2df640e0 100644 --- a/man/epix_merge.Rd +++ b/man/epix_merge.Rd @@ -47,10 +47,10 @@ is equivalent to: } \examples{ # create two example epi_archive datasets -x <- archive_cases_dv$DT \%>\% - dplyr::select(geo_value,time_value,version,case_rate) \%>\% +x <- archive_cases_dv_subset$DT \%>\% + dplyr::select(geo_value,time_value,version,case_rate_7d_av) \%>\% as_epi_archive(compactify=TRUE) -y <- archive_cases_dv$DT \%>\% +y <- archive_cases_dv_subset$DT \%>\% dplyr::select(geo_value,time_value,version,percent_cli) \%>\% as_epi_archive(compactify=TRUE) # a full join stored in x From 929c3003114adeaf31bf82c440d86df0e44e2b88 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Thu, 30 Jun 2022 14:52:13 -0700 Subject: [PATCH 62/96] Clarify compactify testing code - Be explicit about `filter` -> `dplyr::filter` - Have `row_replace` fetch `dt` from its args, not global, making it resemble a dplyr verb more closely. Note in comment that this appears to use copy on write despite assigning into a `data.table` which sometimes uses reference semantics. 
--- tests/testthat/test-compactify.R | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/tests/testthat/test-compactify.R b/tests/testthat/test-compactify.R index 7bcb0fe6..1861979f 100644 --- a/tests/testthat/test-compactify.R +++ b/tests/testthat/test-compactify.R @@ -6,19 +6,21 @@ dt <- archive_cases_dv$DT test_that("Input for compactify must be NULL or a boolean", { expect_error(as_epi_archive(dv_duplicated,compactify="no")) }) - -dt <- filter(dt,geo_value == "ca") + +dt <- dplyr::filter(dt, geo_value == "ca") dt$percent_cli <- c(1:80) dt$case_rate <- c(1:80) -row_replace <- function(row,x,y) { +row_replace <- function(dt,row,x,y) { + # (This way of "replacing" elements appears to use copy-on-write even though + # we are working with a data.table.) dt[row,4] <- x dt[row,5] <- y dt } # Rows 1 should not be eliminated even if NA -dt <- row_replace(1,NA,NA) # Not LOCF +dt <- row_replace(dt,1,NA,NA) # Not LOCF # NOTE! We are assuming that there are no NA's in geo_value, time_value, # and version. 
Even though compactify may erroneously remove the first row @@ -26,25 +28,25 @@ dt <- row_replace(1,NA,NA) # Not LOCF # has problems beyond the scope of this test # Rows 11 and 12 correspond to different time_values -dt <- row_replace(12,11,11) # Not LOCF +dt <- row_replace(dt,12,11,11) # Not LOCF # Rows 20 and 21 only differ in version -dt <- row_replace(21,20,20) # LOCF +dt <- row_replace(dt,21,20,20) # LOCF # Rows 21 and 22 only differ in version -dt <- row_replace(22,20,20) # LOCF +dt <- row_replace(dt,22,20,20) # LOCF # Row 39 comprises the first NA's -dt <-row_replace(39,NA,NA) # Not LOCF +dt <- row_replace(dt,39,NA,NA) # Not LOCF # Row 40 has two NA's, just like its lag, row 39 -dt <- row_replace(40,NA,NA) # LOCF +dt <- row_replace(dt,40,NA,NA) # LOCF # Row 62's values already exist in row 15, but row 15 is not a preceding row -dt <- row_replace(62,15,15) # Not LOCF +dt <- row_replace(dt,62,15,15) # Not LOCF # Row 73 only has one value carried over -dt <- row_replace(74,73,74) # Not LOCF +dt <- row_replace(dt,74,73,74) # Not LOCF dt_true <- as_tibble(as_epi_archive(dt,compactify=TRUE)$DT) dt_false <- as_tibble(as_epi_archive(dt,compactify=FALSE)$DT) From 4047e3dd92289b30c14eb436700e14e213ba2c43 Mon Sep 17 00:00:00 2001 From: kenmawer Date: Mon, 4 Jul 2022 11:04:45 -0700 Subject: [PATCH 63/96] Fixed broken code. 
--- vignettes/archive.Rmd | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vignettes/archive.Rmd b/vignettes/archive.Rmd index 1390914d..99d3d676 100644 --- a/vignettes/archive.Rmd +++ b/vignettes/archive.Rmd @@ -146,8 +146,8 @@ the dataset, without being stored in the end: ```{r,message=FALSE} # `dt` here is taken from the tests -as_epi_archive(archive_cases_dv$DT,compactify=TRUE)$geo_type # "state" -as_epi_archive(archive_cases_dv$DT,compactify=TRUE)$compactify # NULL +as_epi_archive(archive_cases_dv_subset$DT,compactify=TRUE)$geo_type # "state" +as_epi_archive(archive_cases_dv_subset$DT,compactify=TRUE)$compactify # NULL ``` ## Producing snapshots in `epi_df` form From 3f276f9248fd3e86a3ffd27e52fbfb98518adf72 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Fri, 1 Jul 2022 09:09:06 -0700 Subject: [PATCH 64/96] Apply code formatting to an `epi_archive` example --- R/archive.R | 26 +++++++++++++------------- man/as_epi_archive.Rd | 24 ++++++++++++------------ 2 files changed, 25 insertions(+), 25 deletions(-) diff --git a/R/archive.R b/R/archive.R index 4f137f39..55a76b66 100644 --- a/R/archive.R +++ b/R/archive.R @@ -524,23 +524,23 @@ epi_archive = #' ``` #' #' @export -#' @examples -#' df <- data.frame (geo_value = c(rep("ca", 2), rep("fl", 2)), +#' @examples +#' df <- data.frame(geo_value = c(rep("ca", 2), rep("fl", 2)), #' county = c(1, 3, 2, 5), -#' time_value = c("2020-06-01", -#' "2020-06-02", -#' "2020-06-01", -#' "2020-06-02"), -#' version = c("2020-06-02", -#' "2020-06-03", -#' "2020-06-02", -#' "2020-06-03"), +#' time_value = c("2020-06-01", +#' "2020-06-02", +#' "2020-06-01", +#' "2020-06-02"), +#' version = c("2020-06-02", +#' "2020-06-03", +#' "2020-06-02", +#' "2020-06-03"), #' cases = c(1, 2, 3, 4), #' cases_rate = c(0.01, 0.02, 0.01, 0.05)) #' -#' x <- df %>% as_epi_archive(geo_type = "state", -#' time_type = "day", -#' other_keys = "county") +#' x <- df %>% as_epi_archive(geo_type = "state", +#' time_type = "day", +#' 
other_keys = "county") as_epi_archive = function(x, geo_type, time_type, other_keys, additional_metadata = list(),compactify = NULL) { epi_archive$new(x, geo_type, time_type, other_keys, additional_metadata, diff --git a/man/as_epi_archive.Rd b/man/as_epi_archive.Rd index 04b8cff7..c94a6027 100644 --- a/man/as_epi_archive.Rd +++ b/man/as_epi_archive.Rd @@ -58,20 +58,20 @@ would be equivalent to: }\if{html}{\out{}} } \examples{ -df <- data.frame (geo_value = c(rep("ca", 2), rep("fl", 2)), +df <- data.frame(geo_value = c(rep("ca", 2), rep("fl", 2)), county = c(1, 3, 2, 5), - time_value = c("2020-06-01", - "2020-06-02", - "2020-06-01", - "2020-06-02"), - version = c("2020-06-02", - "2020-06-03", - "2020-06-02", - "2020-06-03"), + time_value = c("2020-06-01", + "2020-06-02", + "2020-06-01", + "2020-06-02"), + version = c("2020-06-02", + "2020-06-03", + "2020-06-02", + "2020-06-03"), cases = c(1, 2, 3, 4), cases_rate = c(0.01, 0.02, 0.01, 0.05)) -x <- df \%>\% as_epi_archive(geo_type = "state", - time_type = "day", - other_keys = "county") +x <- df \%>\% as_epi_archive(geo_type = "state", + time_type = "day", + other_keys = "county") } From be4a0cd9eb8061309a9dd852a8d99a5428c58976 Mon Sep 17 00:00:00 2001 From: "Logan C. 
Brooks" Date: Fri, 1 Jul 2022 09:10:16 -0700 Subject: [PATCH 65/96] Revise `compactify` warning text and assign it a(nother) class --- R/archive.R | 33 +++++++++++++++------------------ 1 file changed, 15 insertions(+), 18 deletions(-) diff --git a/R/archive.R b/R/archive.R index 55a76b66..d1cf4ac4 100644 --- a/R/archive.R +++ b/R/archive.R @@ -201,28 +201,25 @@ epi_archive = # Warns about redundant rows if (is.null(compactify) && nrow(elim) > 0) { - warning_intro <- paste("LOCF rows found;", - "these have been removed: \n") + warning_intro <- break_str(paste( + 'Found rows that appear redundant based on', + 'last (version of an) observation carried forward;', + 'these rows have been removed to "compactify" and save space:' + )) - # elim size capped at 6 - len <- nrow(elim) - elim <- elim[1:min(6,len),] + warning_data <- paste(collapse="\n", capture.output(print(elim, topn=3L, nrows=7L))) - warning_data <- paste(collapse="\n",capture.output(print(elim))) + warning_outro <- break_str(paste( + "Built-in `epi_archive` functionality should be unaffected,", + "but results may change if you work directly with its fields (such as `DT`).", + "See `?as_epi_archive` for details.", + "To silence this warning but keep compactification,", + "you can pass `compactify=TRUE` when constructing the archive." + )) - warning_message <- paste(warning_intro,warning_data) - if (len > 6) { - warning_message <- paste0(warning_message,"\n", - "Only the first 6 LOCF rows are ", - "printed. There are more than 6 LOCF", - " rows.") - } - - warning_message <- paste0(warning_message,"\n", - "To disable warning but still remove ", - "LOCF rows, set compactify=FALSE.") + warning_message <- paste(sep="\n", warning_intro, warning_data, warning_outro) - rlang::warn(warning_message) + rlang::warn(warning_message, class="epiprocess__compactify_default_removed_rows") } # Instantiate all self variables From 8a9326cbfe1438ebb257a8f4b394846e5aaab1d9 Mon Sep 17 00:00:00 2001 From: "Logan C. 
Brooks" Date: Mon, 4 Jul 2022 12:58:22 -0700 Subject: [PATCH 66/96] Emit some help on `archive_cases_dv` eval errors during `unregister` Package reloading can/will force package-level promises, so if we introduce a bug in package-level promises, we can get "trapped" and unable to reload without using `unloadNamespace` or more drastic measures like restarting the R session. But we may be confused at what's going on in the first place when this happens, so build the promise to emit a helpful message if it looks like we're in that situation. --- R/data.R | 114 ++++++++++++++++++++++++++++++++++++- tests/testthat/test-data.R | 68 ++++++++++++++++++++++ 2 files changed, 179 insertions(+), 3 deletions(-) create mode 100644 tests/testthat/test-data.R diff --git a/R/data.R b/R/data.R index 0222d98b..1957da36 100644 --- a/R/data.R +++ b/R/data.R @@ -45,6 +45,114 @@ #' @export "archive_cases_dv" -# Like normal data objects, set archive_cases_dv up as a promise, so it doesn't -# take unnecessary space. This also avoids a need for @include tags. -delayedAssign("archive_cases_dv", as_epi_archive(archive_cases_dv_dt, compactify=FALSE)) +#' Detect whether `pkgload` is unregistering a package (with some unlikely false positives) +#' +#' More precisely, detects the presence of a call to an `unregister` or +#' `unregister_namespace` function from any package in the indicated part of the +#' function call stack. +#' +#' @param parent_n optional, single non-`NA` non-negative integer; how many +#' "parent"/"ancestor" calls should we skip inspecting? Default of `0L` will +#' check everything up to, but not including the call to this function. If +#' building wrappers or utilities around this function it may be useful to use +#' this default to ignore those wrappers, especially if they might trigger +#' false positives now or in some future version of this function with a looser +#' function name test. 
+#' +#' @return Boolean +#' +#' @noRd +some_package_is_being_unregistered = function(parent_n = 0L) { + calls = sys.calls() + # `calls` will include the call to this function; strip out this call plus + # `parent_n` additional requested calls to make it like we're reasoning about + # the desired call. This could prevent potential false positives from + # triggering if, in a later version, we decide to loosen the `call_name` + # checks below to something that would be `TRUE` for the name of this function + # or one of the undesired call ancestors. + calls_to_inspect = head(calls, n = -(parent_n + 1L)) + # Note that `head(sys.calls(), n=-1L)` isn't equivalent, due to lazy argument + # evaluation. Note that copy-pasting the body of this function without this + # `head` operation isn't always equivalent to calling it; e.g., within the + # `value` argument of a package-level `delayedAssign`, `sys.calls()` will + # return `NULL` is some or all cases, including when its evaluation has been + # triggered via `unregister`. + simple_call_names = purrr::map_chr(calls_to_inspect, function(call) { + maybe_simple_call_name = rlang::call_name(call) + if (is.null(maybe_simple_call_name)) NA_character_ else maybe_simple_call_name + }) + # `pkgload::unregister` is an (the?) exported function that forces + # package-level promises, while `pkgload:::unregister_namespace` is the + # internal function that does this package-level promise. Check for both just + # in case there's another exported function that calls `unregister_namespace` + # or other `pkgload` versions don't use the `unregister_namespace` internal. + # (Note that `NA_character_ %in% ` is `FALSE` rather + # than `NA`, giving the desired semantics and avoiding potential `NA`s in the + # argument to `any`.) 
+ any(simple_call_names %in% c("unregister", "unregister_namespace")) +} + +#' [`base::delayedAssign`] with [`pkgload::unregister`] awareness, injection support +#' +#' Provides better feedback on errors during promise evaluation while a package +#' is being unregistered, to help package developers escape from a situation +#' where a buggy promise prevents package reloading. Also provide `rlang` +#' injection support (like [`rlang::env_bind_lazy`]). The call stack will look +#' different than when using `delayedAssign` directly. +#' +#' @noRd +delayed_assign_with_unregister_awareness = function(x, value, + eval.env = rlang::caller_env(), + assign.env = rlang::caller_env()) { + value_quosure = rlang::as_quosure(rlang::enexpr(value), eval.env) + this_env = environment() + delayedAssign(x, eval.env = this_env, assign.env = assign.env, value = { + if (some_package_is_being_unregistered()) { + withCallingHandlers( + # `rlang::eval_tidy(value_quosure)` is shorter and would sort of work, + # but doesn't give the same `ls`, `rm`, and top-level `<-` behavior as + # we'd have with `delayedAssign`; it doesn't seem to actually evaluate + # quosure's expr in the quosure's env. Using `rlang::eval_bare` instead + # seems to do the trick. (We also could have just used a `value_expr` + # and `eval.env` together rather than introducing `value_quosure` at + # all.) 
+ rlang::eval_bare(rlang::quo_get_expr(value_quosure), rlang::quo_get_env(value_quosure)), + error = function(err) { + Abort(paste("An error was raised while attempting to evaluate a promise", + "(prepared with `delayed_assign_with_unregister_awareness`)", + "while an `unregister` or `unregister_namespace` call", + "was being evaluated.", + "This can happen, for example, when `devtools::load_all`", + "reloads a package that contains a buggy promise,", + "because reloading can cause old package-level promises to", + "be forced via `pkgload::unregister` and", + "`pkgload:::unregister_namespace`, due to", + "https://github.com/r-lib/pkgload/pull/157.", + "If this is the current situation, you might be able to", + "be successfully reload the package again after", + "`unloadNamespace`-ing it (but this situation will", + "keep re-occurring every other `devtools::load`", + "and every `devtools:document` until the bug or situation", + "generating the promise's error has been resolved)." + ), + class = "epiprocess__promise_evaluation_error_during_unregister", + parent = err) + }) + } else { + rlang::eval_bare(rlang::quo_get_expr(value_quosure), rlang::quo_get_env(value_quosure)) + } + }) +} + +# Like normal data objects, set `archive_cases_dv` up as a promise, so it +# doesn't take unnecessary space before it's evaluated. This also avoids a need +# for @include tags. However, this pattern will use unnecessary space after this +# promise is evaluated, because `as_epi_archive` clones `archive_cases_dv_dt` +# and `archive_cases_dv_dt` will stick around along with `archive_cases_dv` +# after they have been evaluated. We may want to add an option to avoid cloning +# in `as_epi_archive` and make use of it here. 
But we may also want to change +# this into an active binding that clones every time, unless we can hide the +# `DT` field from the user (make it non-`public` in general) or make it +# read-only (in this specific case), so that the user cannot modify the `DT` +# here and potentially mess up examples that they refer to later on. +delayed_assign_with_unregister_awareness("archive_cases_dv", as_epi_archive(archive_cases_dv_dt, compactify=FALSE)) diff --git a/tests/testthat/test-data.R b/tests/testthat/test-data.R new file mode 100644 index 00000000..d30612cb --- /dev/null +++ b/tests/testthat/test-data.R @@ -0,0 +1,68 @@ +test_that("`archive_cases_dv` is formed successfully", { + expect_true(is_epi_archive(archive_cases_dv)) +}) + +test_that("`delayed_assign_with_unregister_awareness` works as expected on good promises", { + # Since we're testing environment stuff, use some "my_" prefixes to try to + # prevent naming coincidences from changing behavior. + my_eval_env = rlang::new_environment(list(x=40L, n_evals=0L), parent=rlang::base_env()) + my_assign_env = rlang::new_environment() + delayed_assign_with_unregister_awareness("good1", { + n_evals <- n_evals + 1L + x + 2L + }, my_eval_env, my_assign_env) + force(my_assign_env[["good1"]]) + force(my_assign_env[["good1"]]) + force(my_assign_env[["good1"]]) + expect_identical(my_assign_env[["good1"]], 42L) + expect_identical(my_eval_env[["n_evals"]], 1L) +}) + +test_that("original `delayedAssign` works as expected on good promises", { + my_eval_env = rlang::new_environment(list(x=40L, n_evals=0L), parent=rlang::base_env()) + my_assign_env = rlang::new_environment() + delayedAssign("good1", { + n_evals <- n_evals + 1L + x + 2L + }, my_eval_env, my_assign_env) + force(my_assign_env[["good1"]]) + force(my_assign_env[["good1"]]) + force(my_assign_env[["good1"]]) + expect_identical(my_assign_env[["good1"]], 42L) + expect_identical(my_eval_env[["n_evals"]], 1L) +}) + +test_that("`delayed_assign_with_unregister_awareness` 
doesn't wrap a buggy promise if not unregistering", { + delayed_assign_with_unregister_awareness("x", Abort("msg", class="original_error_class")) + expect_error(force(x), class="original_error_class") +}) + +test_that("`delayed_assign_with_unregister_awareness` doesn't wrap a buggy promise if not unregistering", { + delayed_assign_with_unregister_awareness("x", Abort("msg", class="original_error_class")) + # Take advantage of a false positive / hedge against package renaming: make + # our own `unregister` function to trigger the special error message. + unregister = function(y) y + expect_error(unregister(force(x)), class="epiprocess__promise_evaluation_error_during_unregister") +}) + +test_that("`delayed_assign_with_unregister_awareness` injection support works", { + my_exprs = rlang::exprs(a = b + c, d = e) + delayed_assign_with_unregister_awareness( + "good2", list(!!!my_exprs), + eval.env=rlang::new_environment(list(b=2L, c=3L, e=4L), rlang::base_env()) + ) + force(good2) + expect_identical(good2, list(a=5L, d=4L)) +}) + +test_that("`some_package_is_being_unregistered` doesn't fail in response to non-simple calls", { + # Prerequisite for current implementation to work (testing here to help debug + # in case some R version doesn't obey): + expect_false(NA_character_ %in% letters) + f = function() function() some_package_is_being_unregistered() + my_expr = rlang::expr(f()()) + # Prerequisite for this to test to actually be testing on non-simple calls: + expect_false(rlang::is_call_simple(my_expr)) + # Actual test (`FALSE` is correct; `NA` or error is not): + expect_false(rlang::eval_bare(my_expr)) +}) From b3ddc3d007751908816c87bca6941cdb02d3bf35 Mon Sep 17 00:00:00 2001 From: "Logan C. 
Brooks" Date: Tue, 5 Jul 2022 16:48:25 -0700 Subject: [PATCH 67/96] Add `clobberable_versions_start`, `observed_versions_end` params --- DESCRIPTION | 3 +- NAMESPACE | 1 + R/archive.R | 206 ++++++++++++++++--- man/as_epi_archive.Rd | 20 +- man/epi_archive.Rd | 55 ++++- man/max_version_with_row_in.Rd | 17 ++ tests/testthat/test-archive-version-bounds.R | 119 +++++++++++ tests/testthat/test-compactify.R | 34 ++- 8 files changed, 409 insertions(+), 46 deletions(-) create mode 100644 man/max_version_with_row_in.Rd create mode 100644 tests/testthat/test-archive-version-bounds.R diff --git a/DESCRIPTION b/DESCRIPTION index ceb5b29c..cfd1fe32 100755 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -33,7 +33,8 @@ Imports: tidyr, tidyselect, tsibble, - utils + utils, + vctrs Suggests: covidcast, delphi.epidata, diff --git a/NAMESPACE b/NAMESPACE index bbed96cc..887c77fe 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -40,6 +40,7 @@ export(group_modify) export(growth_rate) export(is_epi_archive) export(is_epi_df) +export(max_version_with_row_in) export(mutate) export(relocate) export(rename) diff --git a/R/archive.R b/R/archive.R index d1cf4ac4..cfc4496b 100644 --- a/R/archive.R +++ b/R/archive.R @@ -6,6 +6,79 @@ # `data.table::` everywhere and not importing things. .datatable.aware = TRUE +#' Validate a version bound arg +#' +#' Expected to be used on `clobberable_versions_start` and +#' `observed_versions_end`. Some additional checks are needed. +#' +#' @param version_bound the version bound to validate +#' @param x a data frame containing a version column with which to check +#' compatibility +#' @param null_ok Boolean; is `NULL` an acceptable "bound"? (If so, `NULL` will +#' have a special context-dependent meaning.) 
+#' @param version_bound_arg optional string; what to call the version bound in +#' error messages +#' +#' @section Side effects: raises an error if version bound appears invalid +#' +#' @noRd +validate_version_bound = function(version_bound, x, null_ok, + version_bound_arg=rlang::caller_arg(version_bound)) { + if (is.null(version_bound)) { + # Check for NULL (length 0) before length-1-ness. + if (null_ok) { + # Looks like a valid version bound; return: + return(invisible(NULL)) + } else { + Abort(sprintf("%s must not be NULL", version_bound_arg), + class=sprintf("epiprocess__%s_is_null", version_bound_arg)) + } + } else if (length(version_bound) != 1L) { + # Check for length-1-ness fairly early so we don't have to worry as much + # about our `if`s receiving non-length-1 "Boolean"s. + Abort(sprintf("`version_bound` must%s have length 1, but instead was length %d", + if (null_ok) " be `NULL` or" else "", + length(version_bound)), + class=sprintf("epiprocess__%s_is_not_length_1", version_bound_arg)) + } else if (is.na(version_bound)) { + # Check for NA-ness before class check, as it's probably the more relevant + # error message, as using an NA of a different class is easy to do + # accidentally and for some classes is "necessary" as they don't have + # built-in NAs. 
+ Abort(sprintf( + '`%s` must not satisfy `is.na`%s', + version_bound_arg, + if (null_ok) "; if you were trying to disable a version-related feature, `NULL` may work instead" else "" + ), class=sprintf("epiprocess__%s_is_na", version_bound_arg)) + } else if (!identical(class(version_bound), class(x[["version"]])) || + !identical(typeof(version_bound), typeof(x[["version"]]))) { + Abort(sprintf('`class(%1$s)` must be identical to `class(%2$s)` and `typeof(%1$s)` must be identical to `typeof(%2$s)`', + version_bound_arg, 'x[["version"]]'), + class=sprintf("epiprocess__%s_has_invalid_class_or_typeof", version_bound_arg)) + } else { + # Looks like a valid version bound; return: + return(invisible(NULL)) + } +} + +#' Default arg helper: `max(x$version)`, with error if `x` has 0 rows +#' +#' Exported to make defaults more easily copyable. +#' +#' @param x `x` argument of [`as_epi_archive`] +#' +#' @return `max(x$version)` if it has any rows; raises error if it has 0 rows +#' +#' @export +max_version_with_row_in = function(x) { + if (nrow(x) == 0L) { + Abort(sprintf("`nrow(x)==0L`, representing a data set history with no row up through the latest observed version, but we don't have a sensible guess at what version that is, or whether any of the empty versions might be clobbered in the future; if we use `x` to form an `epi_archive`, then `clobberable_versions_start` and `observed_versions_end` must be manually specified."), + class="epiprocess__max_version_cannot_be_used") + } else { + version_bound <- max(x[["version"]]) + } +} + #' @title `epi_archive` object #' #' @description An `epi_archive` is an R6 class which contains a data table @@ -87,6 +160,8 @@ epi_archive = geo_type = NULL, time_type = NULL, additional_metadata = NULL, + clobberable_versions_start = NULL, + observed_versions_end = NULL, #' @description Creates a new `epi_archive` object. 
#' @param x A data frame, data table, or tibble, with columns `geo_value`, #' `time_value`, `version`, and then any additional number of columns. @@ -102,19 +177,53 @@ epi_archive = #' @param additional_metadata List of additional metadata to attach to the #' `epi_archive` object. The metadata will have `geo_type` and `time_type` #' fields; named entries from the passed list or will be included as well. -#' @param compactify Optional, Boolean: should we remove rows that are +#' @param compactify Optional; Boolean or `NULL`: should we remove rows that are #' considered redundant for the purposes of `epi_archive`'s built-in methods -#' such as `$as_of`? As these methods use the last (version of an) -#' observation carried forward (LOCF) to interpolate between the version data -#' provided, rows that won't change these LOCF results can potentially be -#' omitted to save space. Generally, this can be set to `TRUE`, but if you -#' directly inspect or edit the fields of the `epi_archive` such as the `$DT`, -#' you will have to determine whether `compactify=TRUE` will still produce -#' equivalent results. +#' such as `as_of`? As these methods use the last (version of an) observation +#' carried forward (LOCF) to interpolate between the version data provided, +#' rows that don't change these LOCF results can potentially be omitted to +#' save space while maintaining the same behavior (with the help of the +#' `clobberable_versions_start` and `observed_versions_end` fields in some +#' edge cases). `TRUE` will remove these rows, `FALSE` will not, and missing +#' or `NULL` will remove these rows and issue a warning. Generally, this can +#' be set to `TRUE`, but if you directly inspect or edit the fields of the +#' `epi_archive` such as its `DT`, you will have to determine whether +#' `compactify=TRUE` will produce the desired results. 
If compactification +#' here is removing a large proportion of the rows, this may indicate a +#' potential for space, time, or bandwidth savings upstream the data pipeline, +#' e.g., when fetching, storing, or preparing the input data `x` +#' @param clobberable_versions_start Optional; `length`-1; either a value of the +#' same `class` and `typeof` as `x$version`, or `NULL`: specifically, either +#' (a) the earliest version that could be subject to "clobbering" (being +#' overwritten with different update data using the same version tag as the +#' old update data) or (b), `NULL` if no versions are clobberable. The default +#' value is `max_version_with_row_in(x)`; this default assumes that (i) there +#' is at least one row in `x`, (ii) if an update row (even a +#' compactify-redundant update row) is present with version `ver`, then all +#' previous versions must be finalized and non-clobberable, although `ver` +#' (and onward) might still be modified, (iii) even if we have "observed" +#' empty updates for some versions beyond `max(x$version)` (as indicated by +#' `observed_versions_end`; see below), we can't assume `max(x$version)` has +#' been finalized, because we might see a nonfinalized version + empty +#' subsequent versions due to upstream database replication delays in +#' combination with the upstream replicas using last-version-carried-forward +#' to extrapolate that there were no updates, and (iv) "redundant" update rows +#' that would be removed by `compactify` are not redundant, and actually come +#' from an explicit version release that indicates that preceding versions are +#' finalized. +#' @param observed_versions_end Optional; a value of the same `class` and +#' `typeof` as `x$version`: what is the last version we have observed? 
The +#' default is `max_version_with_row_in(x)`, but values greater than this could +#' also be valid, and would indicate that we observed additional versions of +#' the data beyond `max(x$version)`, but they all contained empty updates. +#' (The default value of `clobberable_versions_start` does not fully trust +#' these empty updates, and assumes that any version `>= max(x$version)` could +#' be clobbered.) If `nrow(x) == 0`, then this argument is mandatory. #' @return An `epi_archive` object. #' @importFrom data.table as.data.table key setkeyv initialize = function(x, geo_type, time_type, other_keys, - additional_metadata, compactify) { + additional_metadata, compactify, + clobberable_versions_start, observed_versions_end) { # Check that we have a data frame if (!is.data.frame(x)) { Abort("`x` must be a data frame.") @@ -130,6 +239,9 @@ epi_archive = if (!("version" %in% names(x))) { Abort("`x` must contain a `version` column.") } + if (anyNA(x$version)) { + Abort("`x$version` must not contain `NA`s") + } # If geo type is missing, then try to guess it if (missing(geo_type)) { @@ -154,15 +266,40 @@ epi_archive = c("geo_type", "time_type"))) { Warn("`additional_metadata` names overlap with existing metadata fields \"geo_type\", \"time_type\".") } - - # Finish off with compactify + + # Conduct checks and apply defaults for `compactify` if (missing(compactify)) { compactify = NULL } else if (!rlang::is_bool(compactify) && - !rlang::is_null(compactify)) { + !rlang::is_null(compactify)) { Abort("compactify must be boolean or null.") } - + + # Apply defaults and conduct checks and apply defaults for + # `clobberable_versions_start`, `observed_versions_end`: + if (missing(clobberable_versions_start)) { + clobberable_versions_start <- max_version_with_row_in(x) + } + if (missing(observed_versions_end)) { + observed_versions_end <- max_version_with_row_in(x) + } + validate_version_bound(clobberable_versions_start, x, null_ok=TRUE) + 
validate_version_bound(observed_versions_end, x, null_ok=FALSE) + if (nrow(x) > 0L && observed_versions_end < max(x[["version"]])) { + Abort(sprintf("`observed_versions_end` was %s, but `x` contained + updates for a later version or versions, up through %s", + observed_versions_end, max(x[["version"]])), + class="epiprocess__observed_versions_end_earlier_than_updates") + } + if (!is.na(clobberable_versions_start) && clobberable_versions_start > observed_versions_end) { + Abort(sprintf("`observed_versions_end` was %s, but a `clobberable_versions_start` + of %s indicated that there were later observed versions", + observed_versions_end, clobberable_versions_start), + class="epiprocess__observed_versions_end_earlier_than_clobberable_versions_start") + } + + # --- End of validation and replacing missing args with defaults --- + # Create the data table; if x was an un-keyed data.table itself, # then the call to as.data.table() will fail to set keys, so we # need to check this, then do it manually if needed @@ -227,6 +364,8 @@ epi_archive = self$geo_type = geo_type self$time_type = time_type self$additional_metadata = additional_metadata + self$clobberable_versions_start = clobberable_versions_start + self$observed_versions_end = observed_versions_end }, print = function() { cat("An `epi_archive` object, with metadata:\n") @@ -261,23 +400,27 @@ epi_archive = #' @importFrom data.table between key as_of = function(max_version, min_time_value = -Inf) { # Self max version and other keys - self_max = max(self$DT$version) other_keys = setdiff(key(self$DT), c("geo_value", "time_value", "version")) if (length(other_keys) == 0) other_keys = NULL # Check a few things on max_version - if (!identical(class(max_version), class(self$DT$version))) { - Abort("`max_version` and `DT$version` must have same class.") + if (!identical(class(max_version), class(self$DT$version)) || + !identical(typeof(max_version), typeof(self$DT$version))) { + Abort("`max_version` and `DT$version` must have 
same `class` and `typeof`.") } if (length(max_version) != 1) { Abort("`max_version` cannot be a vector.") } - if (max_version > self_max) { - Abort("`max_version` must be at most `max(DT$max_version)`.") + if (is.na(max_version)) { + Abort("`max_version` must not be NA.") + } + if (max_version > self$observed_versions_end) { + Abort("`max_version` must be at most `self$observed_versions_end`.") } - if (max_version == self_max) { - Warn("Getting data as of the latest version possible. For a variety of reasons, it is possible that we only have a preliminary picture of this version (e.g., the upstream source has updated it but we have not seen it due to latency in synchronization). Thus, the snapshot that we produce here might not be reproducible at a later time (e.g., when the archive has caught up in terms of synchronization).", class="epiprocess__snapshot_as_of_last_update_version") + if (!is.null(self$clobberable_versions_start) && max_version >= self$clobberable_versions_start) { + Warn('Getting data as of some "clobberable" version (`>= self$clobberable_versions_start`). For a variety of reasons, it is possible that we only have a preliminary picture of this version (e.g., the upstream source has updated it but we have not seen it due to latency in synchronization). Thus, the snapshot that we produce here might not be reproducible at a later time (e.g., when the archive has caught up in terms of synchronization and "clobbered" the current picture of this version). (You can muffle this warning with `withCallingHandlers({}, "epiprocess__snapshot_as_of_clobberable_version"=function(wrn) invokeRestart("muffleWarning"))`.)', + class="epiprocess__snapshot_as_of_clobberable_version") } # Filter by version and return @@ -505,9 +648,19 @@ epi_archive = #' @param additional_metadata List of additional metadata to attach to the #' `epi_archive` object. 
The metadata will have `geo_type` and `time_type` #' fields; named entries from the passed list or will be included as well. -#' @param compactify By default, removes LOCF rows and warns the user about -#' them. Optionally, one can input a Boolean: TRUE eliminates LOCF rows, -#' while FALSE keeps them. +#' @param compactify Optional; Boolean or `NULL`: should we remove rows that are +#' considered redundant for the purposes of `epi_archive`'s built-in methods +#' such as `as_of`? As these methods use the last (version of an) observation +#' carried forward (LOCF) to interpolate between the version data provided, +#' rows that don't change these LOCF results can potentially be omitted to +#' save space. `TRUE` will remove these rows, `FALSE` will not, and missing or +#' `NULL` will remove these rows and issue a warning. Generally, this can be +#' set to `TRUE`, but if you directly inspect or edit the fields of the +#' `epi_archive` such as its `DT`, you will have to determine whether +#' `compactify=TRUE` will produce the desired results. If compactification +#' here is removing a large proportion of the rows, this may indicate a +#' potential for space, time, or bandwidth savings upstream the data pipeline, +#' e.g., when fetching, storing, or preparing the input data `x` #' @return An `epi_archive` object. 
#' #' @details This simply a wrapper around the `new()` method of the `epi_archive` @@ -539,9 +692,12 @@ epi_archive = #' time_type = "day", #' other_keys = "county") as_epi_archive = function(x, geo_type, time_type, other_keys, - additional_metadata = list(),compactify = NULL) { + additional_metadata = list(), + compactify = NULL, + clobberable_versions_start = max_version_with_row_in(x), + observed_versions_end = max_version_with_row_in(x)) { epi_archive$new(x, geo_type, time_type, other_keys, additional_metadata, - compactify) + compactify, clobberable_versions_start, observed_versions_end) } #' Test for `epi_archive` format diff --git a/man/as_epi_archive.Rd b/man/as_epi_archive.Rd index c94a6027..5bed8410 100644 --- a/man/as_epi_archive.Rd +++ b/man/as_epi_archive.Rd @@ -10,7 +10,9 @@ as_epi_archive( time_type, other_keys, additional_metadata = list(), - compactify = NULL + compactify = NULL, + clobberable_versions_start = max_version_with_row_in(x), + observed_versions_end = max_version_with_row_in(x) ) } \arguments{ @@ -33,9 +35,19 @@ apart from "geo_value", "time_value", and "version".} \code{epi_archive} object. The metadata will have \code{geo_type} and \code{time_type} fields; named entries from the passed list or will be included as well.} -\item{compactify}{By default, removes LOCF rows and warns the user about -them. Optionally, one can input a Boolean: TRUE eliminates LOCF rows, -while FALSE keeps them.} +\item{compactify}{Optional; Boolean or \code{NULL}: should we remove rows that are +considered redundant for the purposes of \code{epi_archive}'s built-in methods +such as \code{as_of}? As these methods use the last (version of an) observation +carried forward (LOCF) to interpolate between the version data provided, +rows that don't change these LOCF results can potentially be omitted to +save space. \code{TRUE} will remove these rows, \code{FALSE} will not, and missing or +\code{NULL} will remove these rows and issue a warning. 
Generally, this can be +set to \code{TRUE}, but if you directly inspect or edit the fields of the +\code{epi_archive} such as its \code{DT}, you will have to determine whether +\code{compactify=TRUE} will produce the desired results. If compactification +here is removing a large proportion of the rows, this may indicate a +potential for space, time, or bandwidth savings upstream the data pipeline, +e.g., when fetching, storing, or preparing the input data \code{x}} } \value{ An \code{epi_archive} object. diff --git a/man/epi_archive.Rd b/man/epi_archive.Rd index 161f4a85..e1b72919 100644 --- a/man/epi_archive.Rd +++ b/man/epi_archive.Rd @@ -104,7 +104,9 @@ Creates a new \code{epi_archive} object. time_type, other_keys, additional_metadata, - compactify + compactify, + clobberable_versions_start, + observed_versions_end )}\if{html}{\out{}} } @@ -130,15 +132,50 @@ apart from "geo_value", "time_value", and "version".} \code{epi_archive} object. The metadata will have \code{geo_type} and \code{time_type} fields; named entries from the passed list or will be included as well.} -\item{\code{compactify}}{Optional, Boolean: should we remove rows that are +\item{\code{compactify}}{Optional; Boolean or \code{NULL}: should we remove rows that are considered redundant for the purposes of \code{epi_archive}'s built-in methods -such as \verb{$as_of}? As these methods use the last (version of an) -observation carried forward (LOCF) to interpolate between the version data -provided, rows that won't change these LOCF results can potentially be -omitted to save space. Generally, this can be set to \code{TRUE}, but if you -directly inspect or edit the fields of the \code{epi_archive} such as the \verb{$DT}, -you will have to determine whether \code{compactify=TRUE} will still produce -equivalent results.} +such as \code{as_of}? 
As these methods use the last (version of an) observation +carried forward (LOCF) to interpolate between the version data provided, +rows that don't change these LOCF results can potentially be omitted to +save space while maintaining the same behavior (with the help of the +\code{clobberable_versions_start} and \code{observed_versions_end} fields in some +edge cases). \code{TRUE} will remove these rows, \code{FALSE} will not, and missing +or \code{NULL} will remove these rows and issue a warning. Generally, this can +be set to \code{TRUE}, but if you directly inspect or edit the fields of the +\code{epi_archive} such as its \code{DT}, you will have to determine whether +\code{compactify=TRUE} will produce the desired results. If compactification +here is removing a large proportion of the rows, this may indicate a +potential for space, time, or bandwidth savings upstream the data pipeline, +e.g., when fetching, storing, or preparing the input data \code{x}} + +\item{\code{clobberable_versions_start}}{Optional; \code{length}-1; either a value of the +same \code{class} and \code{typeof} as \code{x$version}, or \code{NULL}: specifically, either +(a) the earliest version that could be subject to "clobbering" (being +overwritten with different update data using the same version tag as the +old update data) or (b), \code{NULL} if no versions are clobberable. 
The default +value is \code{max_version_with_row_in(x)}; this default assumes that (i) there +is at least one row in \code{x}, (ii) if an update row (even a +compactify-redundant update row) is present with version \code{ver}, then all +previous versions must be finalized and non-clobberable, although \code{ver} +(and onward) might still be modified, (iii) even if we have "observed" +empty updates for some versions beyond \code{max(x$version)} (as indicated by +\code{observed_versions_end}; see below), we can't assume \code{max(x$version)} has +been finalized, because we might see a nonfinalized version + empty +subsequent versions due to upstream database replication delays in +combination with the upstream replicas using last-version-carried-forward +to extrapolate that there were no updates, and (iv) "redundant" update rows +that would be removed by \code{compactify} are not redundant, and actually come +from an explicit version release that indicates that preceding versions are +finalized.} + +\item{\code{observed_versions_end}}{Optional; a value of the same \code{class} and +\code{typeof} as \code{x$version}: what is the last version we have observed? The +default is \code{max_version_with_row_in(x)}, but values greater than this could +also be valid, and would indicate that we observed additional versions of +the data beyond \code{max(x$version)}, but they all contained empty updates. +(The default value of \code{clobberable_versions_start} does not fully trust +these empty updates, and assumes that any version \verb{>= max(x$version)} could +be clobbered.) 
If \code{nrow(x) == 0}, then this argument is mandatory.} } \if{html}{\out{}} } diff --git a/man/max_version_with_row_in.Rd b/man/max_version_with_row_in.Rd new file mode 100644 index 00000000..cce97f9d --- /dev/null +++ b/man/max_version_with_row_in.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/archive.R +\name{max_version_with_row_in} +\alias{max_version_with_row_in} +\title{Default arg helper: \code{max(x$version)}, with error if \code{x} has 0 rows} +\usage{ +max_version_with_row_in(x) +} +\arguments{ +\item{x}{\code{x} argument of \code{\link{as_epi_archive}}} +} +\value{ +\code{max(x$version)} if it has any rows; raises error if it has 0 rows +} +\description{ +Exported to make defaults more easily copyable. +} diff --git a/tests/testthat/test-archive-version-bounds.R b/tests/testthat/test-archive-version-bounds.R new file mode 100644 index 00000000..b687cfd4 --- /dev/null +++ b/tests/testthat/test-archive-version-bounds.R @@ -0,0 +1,119 @@ +test_that("`validate_version_bound` allows/catches `NULL` as requested", { + my_version_bound = NULL + validate_version_bound(my_version_bound, null_ok=TRUE) + expect_error(validate_version_bound(my_version_bound, null_ok=FALSE), + class="epiprocess__my_version_bound_is_null") + # Note that if the error class name changes, this test may produce some + # confusing output along the following lines: + # + # > Error in `$<-.data.frame`(`*tmp*`, "call_text", value = c("testthat::expect_error(...)", : + # > replacement has 4 rows, data has 3 +}) + +test_that("`validate_version_bound` catches bounds that are the wrong length", { + # Even if NULL is allowed, we should disallow other length-0 things: + my_version_bound1 = integer(0L) + expect_error(validate_version_bound(my_version_bound1, null_ok=TRUE), + class="epiprocess__my_version_bound1_is_not_length_1") + # And length > 1 things: + my_version_bound2 = c(2, 10) + expect_error(validate_version_bound(my_version_bound2, 
null_ok=TRUE), + class="epiprocess__my_version_bound2_is_not_length_1") +}) + +test_that("`validate_version_bound` catches NA and outputs a sensible error message", { + my_version_bound1 = as.Date(NA) + expect_error(validate_version_bound(my_version_bound1, null_ok=FALSE), + regexp = "satisfy `is.na`$", + class = "epiprocess__my_version_bound1_is_na") + my_version_bound2 = NA_integer_ + expect_error(validate_version_bound(my_version_bound2, null_ok=FALSE), + regexp = "satisfy `is.na`$", + class = "epiprocess__my_version_bound2_is_na") +}) + +test_that("`validate_version_bound` validate and class checks together allow and catch as intended", { + my_int = 5L + my_dbl = 5 + my_list = list(5L) + my_date = as.Date("2000-01-01") + my_datetime = vctrs::vec_cast(my_date, as.POSIXct(as.Date("1900-01-01"))) + # When first drafted, this validate function was a (validate+)cast function, + # which used vctrs::vec_cast inside. However, the initial implementation + # didn't actually allow casting to occur, and it was easier to change to the + # current stringent validation than to think about what exactly casts to + # allow. Some of the tests here were motivated based on that setup and have + # been kept around. For example, we wouldn't want to allow casts between dates + # and POSIXct's, because there are tz gotchas; these first couple of checks + # detect that we have a validate-compatible date and datetime to make sure we + # can properly help ward off the gotchas if switching to a cast rather than a + # validate. 
+ expect_identical(vctrs::vec_cast(my_datetime, my_date), my_date) + expect_identical(vctrs::vec_cast(my_date, my_datetime), my_datetime) + # + x_int = tibble::tibble(version = my_int) + x_dbl = tibble::tibble(version = my_dbl) + x_list = tibble::tibble(version = my_list) + x_date = tibble::tibble(version = my_date) + x_datetime = tibble::tibble(version = my_datetime) + # Custom classes matter (test vectors and non-vctrs-specialized lists separately): + my_version_bound1 = `class<-`(24, "c1") + expect_error(validate_version_bound(my_version_bound1, x_int, null_ok=FALSE), + class="epiprocess__my_version_bound1_has_invalid_class_or_typeof") + my_version_bound2 = `class<-`(list(12), c("c2a","c2b","c2c")) + expect_error(validate_version_bound(my_version_bound2, x_list, null_ok=FALSE), + class="epiprocess__my_version_bound2_has_invalid_class_or_typeof") + # Want no error matching date to date or datetime to datetime, but no interop due to tz issues: + validate_version_bound(my_date, x_date, version_bound_arg="vb") + validate_version_bound(my_datetime, x_datetime, version_bound_arg="vb") + expect_error(validate_version_bound(my_datetime, x_date, null_ok=TRUE, version_bound_arg="vb"), + class="epiprocess__vb_has_invalid_class_or_typeof") + expect_error(validate_version_bound(my_date, x_datetime, null_ok=TRUE, version_bound_arg="vb"), + class="epiprocess__vb_has_invalid_class_or_typeof") + # Bad: + expect_error(validate_version_bound(3.5, x_int, TRUE, "vb")) + expect_error(validate_version_bound(.Machine$integer.max, x_dbl, TRUE, "vb")) + expect_error(validate_version_bound(`class<-`(list(2), "clazz"), + tibble::tibble(version=`class<-`(5L, "clazz")), TRUE, "vb")) + # Maybe questionable: + expect_error(validate_version_bound(3, x_int, TRUE, "vb")) + expect_error(validate_version_bound(3L, x_dbl, TRUE, "vb")) + # Good: + validate_version_bound(my_int, x_int, TRUE, "vb") + validate_version_bound(my_dbl, x_dbl, TRUE, "vb") + validate_version_bound(my_list, x_list, TRUE, "vb") 
+ validate_version_bound(my_date, x_date, TRUE, "vb") + validate_version_bound(my_datetime, x_datetime, TRUE, "vb") +}) + +test_that("archive version bounds args work as intended", { + measurement_date = as.Date("2000-01-01") + update_tbl = tibble::tibble( + geo_value = "g1", + time_value = measurement_date, + version = measurement_date + 1:5, + value = 1:5 + ) + expect_error(as_epi_archive(update_tbl, + clobberable_versions_start = 1241, + observed_versions_end = measurement_date), + class="epiprocess__clobberable_versions_start_has_invalid_class_or_typeof") + expect_error(as_epi_archive(update_tbl[integer(0L),]), + class="epiprocess__max_version_cannot_be_used") + expect_error(as_epi_archive(update_tbl, + clobberable_versions_start = NULL, + observed_versions_end = measurement_date), + class="epiprocess__observed_versions_end_earlier_than_updates") + expect_error(as_epi_archive(update_tbl, + clobberable_versions_start=measurement_date+6L, + observed_versions_end = measurement_date+5L), + class="epiprocess__observed_versions_end_earlier_than_clobberable_versions_start") + expect_error(as_epi_archive(update_tbl, observed_versions_end = NULL), + regexp="observed_versions_end.*must not be NULL") + ea_default = as_epi_archive(update_tbl) + ea_default$as_of(measurement_date+4L) + expect_warning(ea_default$as_of(measurement_date+5L), + class = "epiprocess__snapshot_as_of_clobberable_version") + expect_error(ea_default$as_of(measurement_date+6L), + regexp = "max_version.*at most.*observed_versions_end") +}) diff --git a/tests/testthat/test-compactify.R b/tests/testthat/test-compactify.R index 1861979f..40802881 100644 --- a/tests/testthat/test-compactify.R +++ b/tests/testthat/test-compactify.R @@ -71,18 +71,38 @@ test_that("LOCF values are taken out with compactify=TRUE", { expect_identical(dt_null,dt_test) }) -test_that("as_of utilizes LOCF even after removal of LOCF values",{ +test_that("as_of produces the same results with compactify=TRUE as with compactify=FALSE", { 
ea_true <- as_epi_archive(dt,compactify=TRUE) ea_false <- as_epi_archive(dt,compactify=FALSE) - # Row 22, an LOCF row corresponding to the latest version, but for the - # date 2020-06-02, is omitted in ea_true + # Row 22, an LOCF row corresponding to the latest version, is omitted in + # ea_true + latest_version = max(ea_false$DT$version) expect_warning({ - as_of_true <- ea_true$as_of(max(ea_true$DT$version)) - }, class = "epiprocess__snapshot_as_of_last_update_version") + as_of_true <- ea_true$as_of(latest_version) + }, class = "epiprocess__snapshot_as_of_clobberable_version") expect_warning({ - as_of_false <- ea_false$as_of(max(ea_false$DT$version)) - }, class = "epiprocess__snapshot_as_of_last_update_version") + as_of_false <- ea_false$as_of(latest_version) + }, class = "epiprocess__snapshot_as_of_clobberable_version") expect_identical(as_of_true,as_of_false) }) + +test_that("compactify does not alter the default clobberable and observed version bounds", { + x = tibble::tibble( + geo_value = "geo1", + time_value = as.Date("2000-01-01"), + version = as.Date("2000-01-01") + 1:5, + value = 42L + ) + ea_true <- as_epi_archive(x, compactify=TRUE) + ea_false <- as_epi_archive(x, compactify=FALSE) + # We say that we base the bounds on the user's `x` arg. We might mess up or + # change our minds and base things on the `DT` field (or a temporary `DT` + # variable, post-compactify) instead. Check that this test would trigger + # in that case: + expect_true(max(ea_true$DT$version) != max(ea_false$DT$version)) + # The actual test: + expect_identical(ea_true$clobberable_versions_start, ea_false$clobberable_versions_start) + expect_identical(ea_true$observed_versions_end, ea_false$observed_versions_end) +}) From 2a20004539c041ae7dd4dcfeceffaa7c467d8e26 Mon Sep 17 00:00:00 2001 From: "Logan C. 
Brooks" Date: Tue, 5 Jul 2022 19:21:55 -0700 Subject: [PATCH 68/96] Add back explanation about reconstructing sample archive from stored DT --- data-raw/archive_cases_dv_subset.R | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/data-raw/archive_cases_dv_subset.R b/data-raw/archive_cases_dv_subset.R index 115a4342..e3c0001f 100644 --- a/data-raw/archive_cases_dv_subset.R +++ b/data-raw/archive_cases_dv_subset.R @@ -31,6 +31,9 @@ case_rate_subset <- covidcast( epix_merge(archive_cases_dv_subset, case_rate_subset, all = TRUE) +# If we directly store an epi_archive R6 object as data, it will store its class +# implementation there as well. To prevent mismatches between these stored +# implementations and the latest class definition, don't store them as R6 +# objects; store the DT and construct the R6 object on request. archive_cases_dv_subset_dt = archive_cases_dv_subset$DT - usethis::use_data(archive_cases_dv_subset_dt, overwrite = TRUE, internal = TRUE) From bcb48ff492e1a66ba97dcb6478f8189bc934c0b5 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Wed, 6 Jul 2022 14:34:41 -0700 Subject: [PATCH 69/96] Go back to using NA to indicate no clobberable versions While drafting, NA was used originally, but this ran the danger of the user using `max` on a version column with 0 entries or an `NA`. However, NA versions are disallowed now, so only the 0-entry case is to be worried about, and this is only if the user uses `max` themselves instead of relying on our checked default or the function we use to prepare this default. `NULL` runs a much greater danger: it appears that R6 does not add any extra checks to field/method access to make sure that things actually exist, and, like base environments, returns NULL in the case of certain typos or an object that was invalidly constructed without the expected field. A third option, a custom sentinel value, would be the most robust, but would also pollute the namespace. Try to avoid this for now. 
--- R/archive.R | 154 +++++++++++-------- tests/testthat/test-archive-version-bounds.R | 50 +++--- 2 files changed, 112 insertions(+), 92 deletions(-) diff --git a/R/archive.R b/R/archive.R index 01742aa0..b1734572 100644 --- a/R/archive.R +++ b/R/archive.R @@ -14,7 +14,7 @@ #' @param version_bound the version bound to validate #' @param x a data frame containing a version column with which to check #' compatibility -#' @param null_ok Boolean; is `NULL` an acceptable "bound"? (If so, `NULL` will +#' @param na_ok Boolean; is `NULL` an acceptable "bound"? (If so, `NULL` will #' have a special context-dependent meaning.) #' @param version_bound_arg optional string; what to call the version bound in #' error messages @@ -22,41 +22,41 @@ #' @section Side effects: raises an error if version bound appears invalid #' #' @noRd -validate_version_bound = function(version_bound, x, null_ok, - version_bound_arg=rlang::caller_arg(version_bound)) { - if (is.null(version_bound)) { - # Check for NULL (length 0) before length-1-ness. - if (null_ok) { - # Looks like a valid version bound; return: - return(invisible(NULL)) - } else { - Abort(sprintf("%s must not be NULL", version_bound_arg), - class=sprintf("epiprocess__%s_is_null", version_bound_arg)) - } - } else if (length(version_bound) != 1L) { +validate_version_bound = function(version_bound, x, na_ok, + version_bound_arg = rlang::caller_arg(version_bound), + x_arg = rlang::caller_arg(version_bound)) { + # We might want some (optional?) validation here to detect internal bugs. + if (length(version_bound) != 1L) { # Check for length-1-ness fairly early so we don't have to worry as much # about our `if`s receiving non-length-1 "Boolean"s. 
- Abort(sprintf("`version_bound` must%s have length 1, but instead was length %d", - if (null_ok) " be `NULL` or" else "", + Abort(sprintf("`version_bound` must have length 1, but instead was length %d", length(version_bound)), class=sprintf("epiprocess__%s_is_not_length_1", version_bound_arg)) } else if (is.na(version_bound)) { - # Check for NA-ness before class check, as it's probably the more relevant - # error message, as using an NA of a different class is easy to do - # accidentally and for some classes is "necessary" as they don't have - # built-in NAs. - Abort(sprintf( - '`%s` must not satisfy `is.na`%s', - version_bound_arg, - if (null_ok) "; if you were trying to disable a version-related feature, `NULL` may work instead" else "" - ), class=sprintf("epiprocess__%s_is_na", version_bound_arg)) + # Check for NA before class&type, as any-class&type NA should be fine for + # our purposes, and some version classes&types might not have their own NA + # value to pass in. + if (na_ok) { + # Looks like a valid version bound; exit without error. 
+ return(invisible(NULL)) + } else { + Abort(sprintf( + '`%s` must not satisfy `is.na` (NAs are not allowed for this kind of version bound)', + version_bound_arg + ), class=sprintf("epiprocess__%s_is_na", version_bound_arg)) + } } else if (!identical(class(version_bound), class(x[["version"]])) || !identical(typeof(version_bound), typeof(x[["version"]]))) { - Abort(sprintf('`class(%1$s)` must be identical to `class(%2$s)` and `typeof(%1$s)` must be identical to `typeof(%2$s)`', - version_bound_arg, 'x[["version"]]'), - class=sprintf("epiprocess__%s_has_invalid_class_or_typeof", version_bound_arg)) + Abort(sprintf( + '`class(%1$s)` must be identical to `class(%2$s)` and `typeof(%1$s)` must be identical to `typeof(%2$s)`', + version_bound_arg, + # '{x_arg}[["version"]]' except adding parentheses if needed: + rlang::expr_deparse(rlang::new_call( + quote(`[[`), rlang::pairlist2(rlang::parse_expr(x_arg), "version") + )) + ), class=sprintf("epiprocess__%s_has_invalid_class_or_typeof", version_bound_arg)) } else { - # Looks like a valid version bound; return: + # Looks like a valid version bound; exit without error. 
return(invisible(NULL)) } } @@ -67,7 +67,8 @@ validate_version_bound = function(version_bound, x, null_ok, #' #' @param x `x` argument of [`as_epi_archive`] #' -#' @return `max(x$version)` if it has any rows; raises error if it has 0 rows +#' @return `max(x$version)` if it has any rows; raises error if it has 0 rows or +#' an `NA` version value #' #' @export max_version_with_row_in = function(x) { @@ -75,10 +76,30 @@ max_version_with_row_in = function(x) { Abort(sprintf("`nrow(x)==0L`, representing a data set history with no row up through the latest observed version, but we don't have a sensible guess at what version that is, or whether any of the empty versions might be clobbered in the future; if we use `x` to form an `epi_archive`, then `clobberable_versions_start` and `observed_versions_end` must be manually specified."), class="epiprocess__max_version_cannot_be_used") } else { - version_bound <- max(x[["version"]]) + version_col = purrr::pluck(x, "version") # error not NULL if doesn't exist + if (anyNA(version_col)) { + Abort("version values cannot be NA", + class="epiprocess__version_values_must_not_be_na") + } else { + version_bound <- max(version_col) + } } } +#' Get the next possible value greater than `x` of the same type +#' +#' @param x the starting "value"(s) +#' @return same class, typeof, and length as `x` +#' +#' @export +next_after = function(x) UseMethod("next_after") + +#' @export +next_after.integer = function(x) x + 1L + +#' @export +next_after.Date = function(x) x + 1L + #' @title `epi_archive` object #' #' @description An `epi_archive` is an R6 class which contains a data table @@ -193,32 +214,32 @@ epi_archive = #' potential for space, time, or bandwidth savings upstream the data pipeline, #' e.g., when fetching, storing, or preparing the input data `x` #' @param clobberable_versions_start Optional; `length`-1; either a value of the -#' same `class` and `typeof` as `x$version`, or `NULL`: specifically, either -#' (a) the earliest version 
that could be subject to "clobbering" (being -#' overwritten with different update data using the same version tag as the -#' old update data) or (b), `NULL` if no versions are clobberable. The default -#' value is `max_version_with_row_in(x)`; this default assumes that (i) there -#' is at least one row in `x`, (ii) if an update row (even a -#' compactify-redundant update row) is present with version `ver`, then all -#' previous versions must be finalized and non-clobberable, although `ver` -#' (and onward) might still be modified, (iii) even if we have "observed" -#' empty updates for some versions beyond `max(x$version)` (as indicated by -#' `observed_versions_end`; see below), we can't assume `max(x$version)` has -#' been finalized, because we might see a nonfinalized version + empty -#' subsequent versions due to upstream database replication delays in -#' combination with the upstream replicas using last-version-carried-forward -#' to extrapolate that there were no updates, and (iv) "redundant" update rows -#' that would be removed by `compactify` are not redundant, and actually come -#' from an explicit version release that indicates that preceding versions are -#' finalized. -#' @param observed_versions_end Optional; a value of the same `class` and -#' `typeof` as `x$version`: what is the last version we have observed? The -#' default is `max_version_with_row_in(x)`, but values greater than this could -#' also be valid, and would indicate that we observed additional versions of -#' the data beyond `max(x$version)`, but they all contained empty updates. -#' (The default value of `clobberable_versions_start` does not fully trust -#' these empty updates, and assumes that any version `>= max(x$version)` could -#' be clobbered.) If `nrow(x) == 0`, then this argument is mandatory. 
+#' same `class` and `typeof` as `x$version`, or an `NA` of any `class` and +#' `typeof`: specifically, either (a) the earliest version that could be +#' subject to "clobbering" (being overwritten with different update data using +#' the same version tag as the old update data), or (b) `NA`, to indicate that +#' no versions are clobberable. The default value is +#' `max_version_with_row_in(x)`; this default assumes that (i) there is at +#' least one row in `x`, (ii) if an update row (even a compactify-redundant +#' update row) is present with version `ver`, then all previous versions must +#' be finalized and non-clobberable, although `ver` (and onward) might still +#' be modified, (iii) even if we have "observed" empty updates for some +#' versions beyond `max(x$version)` (as indicated by `observed_versions_end`; +#' see below), we can't assume `max(x$version)` has been finalized, because we +#' might see a nonfinalized version + empty subsequent versions due to +#' upstream database replication delays in combination with the upstream +#' replicas using last-version-carried-forward to extrapolate that there were +#' no updates, and (iv) "redundant" update rows that would be removed by +#' `compactify` are not redundant, and actually come from an explicit version +#' release that indicates that preceding versions are finalized. +#' @param observed_versions_end Optional; length-1, same `class` and `typeof` as +#' `x$version`: what is the last version we have observed? The default is +#' `max_version_with_row_in(x)`, but values greater than this could also be +#' valid, and would indicate that we observed additional versions of the data +#' beyond `max(x$version)`, but they all contained empty updates. (The default +#' value of `clobberable_versions_start` does not fully trust these empty +#' updates, and assumes that any version `>= max(x$version)` could be +#' clobbered.) If `nrow(x) == 0`, then this argument is mandatory. #' @return An `epi_archive` object. 
#' @importFrom data.table as.data.table key setkeyv initialize = function(x, geo_type, time_type, other_keys, @@ -240,7 +261,8 @@ epi_archive = Abort("`x` must contain a `version` column.") } if (anyNA(x$version)) { - Abort("`x$version` must not contain `NA`s") + Abort("`x$version` must not contain `NA`s", + class = "epiprocess__version_values_must_not_be_na") } # If geo type is missing, then try to guess it @@ -283,8 +305,8 @@ epi_archive = if (missing(observed_versions_end)) { observed_versions_end <- max_version_with_row_in(x) } - validate_version_bound(clobberable_versions_start, x, null_ok=TRUE) - validate_version_bound(observed_versions_end, x, null_ok=FALSE) + validate_version_bound(clobberable_versions_start, x, na_ok=TRUE) + validate_version_bound(observed_versions_end, x, na_ok=FALSE) if (nrow(x) > 0L && observed_versions_end < max(x[["version"]])) { Abort(sprintf("`observed_versions_end` was %s, but `x` contained updates for a later version or versions, up through %s", @@ -381,10 +403,18 @@ epi_archive = min(self$DT$time_value))) cat(sprintf("* %-14s = %s\n", "max time value", max(self$DT$time_value))) - cat(sprintf("* %-14s = %s\n", "min version", + cat(sprintf("* %-14s = %s\n", "first version with update", min(self$DT$version))) - cat(sprintf("* %-14s = %s\n", "max version", + cat(sprintf("* %-14s = %s\n", "last version with update", max(self$DT$version))) + if (is.na(self$clobberable_versions_start)) { + cat("No clobberable versions\n") + } else { + cat(sprintf("* %-14s = %s\n", "clobberable versions start", + self$clobberable_versions_start)) + } + cat(sprintf("* %-14s = %s\n", "observed versions end", + self$observed_versions_end)) cat("----------\n") cat(sprintf("Data archive (stored in DT field): %i x %i\n", nrow(self$DT), ncol(self$DT))) @@ -423,7 +453,7 @@ epi_archive = if (max_version > self$observed_versions_end) { Abort("`max_version` must be at most `self$observed_versions_end`.") } - if (!is.null(self$clobberable_versions_start) && 
max_version >= self$clobberable_versions_start) { + if (!is.na(self$clobberable_versions_start) && max_version >= self$clobberable_versions_start) { Warn('Getting data as of some "clobberable" version (`>= self$clobberable_versions_start`). For a variety of reasons, it is possible that we only have a preliminary picture of this version (e.g., the upstream source has updated it but we have not seen it due to latency in synchronization). Thus, the snapshot that we produce here might not be reproducible at a later time (e.g., when the archive has caught up in terms of synchronization and "clobbered" the current picture of this version). (You can muffle this warning with `withCallingHandlers({}, "epiprocess__snapshot_as_of_clobberable_version"=function(wrn) invokeRestart("muffleWarning"))`.)', class="epiprocess__snapshot_as_of_clobberable_version") } diff --git a/tests/testthat/test-archive-version-bounds.R b/tests/testthat/test-archive-version-bounds.R index b687cfd4..8223d64f 100644 --- a/tests/testthat/test-archive-version-bounds.R +++ b/tests/testthat/test-archive-version-bounds.R @@ -1,37 +1,27 @@ -test_that("`validate_version_bound` allows/catches `NULL` as requested", { - my_version_bound = NULL - validate_version_bound(my_version_bound, null_ok=TRUE) - expect_error(validate_version_bound(my_version_bound, null_ok=FALSE), - class="epiprocess__my_version_bound_is_null") +test_that("`validate_version_bound` allows/catches `NA` as requested", { + my_version_bound = NA + validate_version_bound(my_version_bound, na_ok=TRUE) + expect_error(validate_version_bound(my_version_bound, na_ok=FALSE), + class="epiprocess__my_version_bound_is_na") # Note that if the error class name changes, this test may produce some # confusing output along the following lines: # # > Error in `$<-.data.frame`(`*tmp*`, "call_text", value = c("testthat::expect_error(...)", : - # > replacement has 4 rows, data has 3 + # > replacement has 5 rows, data has 3 }) test_that("`validate_version_bound` 
catches bounds that are the wrong length", { - # Even if NULL is allowed, we should disallow other length-0 things: - my_version_bound1 = integer(0L) - expect_error(validate_version_bound(my_version_bound1, null_ok=TRUE), - class="epiprocess__my_version_bound1_is_not_length_1") - # And length > 1 things: + my_version_bound1a = NULL + expect_error(validate_version_bound(my_version_bound1a, na_ok=TRUE), + class="epiprocess__my_version_bound1a_is_not_length_1") + my_version_bound1b = integer(0L) + expect_error(validate_version_bound(my_version_bound1b, na_ok=TRUE), + class="epiprocess__my_version_bound1b_is_not_length_1") my_version_bound2 = c(2, 10) - expect_error(validate_version_bound(my_version_bound2, null_ok=TRUE), + expect_error(validate_version_bound(my_version_bound2, na_ok=TRUE), class="epiprocess__my_version_bound2_is_not_length_1") }) -test_that("`validate_version_bound` catches NA and outputs a sensible error message", { - my_version_bound1 = as.Date(NA) - expect_error(validate_version_bound(my_version_bound1, null_ok=FALSE), - regexp = "satisfy `is.na`$", - class = "epiprocess__my_version_bound1_is_na") - my_version_bound2 = NA_integer_ - expect_error(validate_version_bound(my_version_bound2, null_ok=FALSE), - regexp = "satisfy `is.na`$", - class = "epiprocess__my_version_bound2_is_na") -}) - test_that("`validate_version_bound` validate and class checks together allow and catch as intended", { my_int = 5L my_dbl = 5 @@ -58,17 +48,17 @@ test_that("`validate_version_bound` validate and class checks together allow and x_datetime = tibble::tibble(version = my_datetime) # Custom classes matter (test vectors and non-vctrs-specialized lists separately): my_version_bound1 = `class<-`(24, "c1") - expect_error(validate_version_bound(my_version_bound1, x_int, null_ok=FALSE), + expect_error(validate_version_bound(my_version_bound1, x_int, na_ok=FALSE), class="epiprocess__my_version_bound1_has_invalid_class_or_typeof") my_version_bound2 = `class<-`(list(12), 
c("c2a","c2b","c2c")) - expect_error(validate_version_bound(my_version_bound2, x_list, null_ok=FALSE), + expect_error(validate_version_bound(my_version_bound2, x_list, na_ok=FALSE), class="epiprocess__my_version_bound2_has_invalid_class_or_typeof") # Want no error matching date to date or datetime to datetime, but no interop due to tz issues: validate_version_bound(my_date, x_date, version_bound_arg="vb") validate_version_bound(my_datetime, x_datetime, version_bound_arg="vb") - expect_error(validate_version_bound(my_datetime, x_date, null_ok=TRUE, version_bound_arg="vb"), + expect_error(validate_version_bound(my_datetime, x_date, na_ok=TRUE, version_bound_arg="vb"), class="epiprocess__vb_has_invalid_class_or_typeof") - expect_error(validate_version_bound(my_date, x_datetime, null_ok=TRUE, version_bound_arg="vb"), + expect_error(validate_version_bound(my_date, x_datetime, na_ok=TRUE, version_bound_arg="vb"), class="epiprocess__vb_has_invalid_class_or_typeof") # Bad: expect_error(validate_version_bound(3.5, x_int, TRUE, "vb")) @@ -101,15 +91,15 @@ test_that("archive version bounds args work as intended", { expect_error(as_epi_archive(update_tbl[integer(0L),]), class="epiprocess__max_version_cannot_be_used") expect_error(as_epi_archive(update_tbl, - clobberable_versions_start = NULL, + clobberable_versions_start = NA, observed_versions_end = measurement_date), class="epiprocess__observed_versions_end_earlier_than_updates") expect_error(as_epi_archive(update_tbl, clobberable_versions_start=measurement_date+6L, observed_versions_end = measurement_date+5L), class="epiprocess__observed_versions_end_earlier_than_clobberable_versions_start") - expect_error(as_epi_archive(update_tbl, observed_versions_end = NULL), - regexp="observed_versions_end.*must not be NULL") + expect_error(as_epi_archive(update_tbl, observed_versions_end = NA), + regexp="observed_versions_end.*must not satisfy.*is.na") ea_default = as_epi_archive(update_tbl) ea_default$as_of(measurement_date+4L) 
expect_warning(ea_default$as_of(measurement_date+5L), From 6f5b177b2696098aebe5610bf27543d4be5a72b6 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Wed, 6 Jul 2022 14:43:23 -0700 Subject: [PATCH 70/96] Re-`document` --- man/epi_archive.Rd | 52 +++++++++++++++++----------------- man/max_version_with_row_in.Rd | 3 +- 2 files changed, 28 insertions(+), 27 deletions(-) diff --git a/man/epi_archive.Rd b/man/epi_archive.Rd index e1b72919..b7d44acb 100644 --- a/man/epi_archive.Rd +++ b/man/epi_archive.Rd @@ -149,33 +149,33 @@ potential for space, time, or bandwidth savings upstream the data pipeline, e.g., when fetching, storing, or preparing the input data \code{x}} \item{\code{clobberable_versions_start}}{Optional; \code{length}-1; either a value of the -same \code{class} and \code{typeof} as \code{x$version}, or \code{NULL}: specifically, either -(a) the earliest version that could be subject to "clobbering" (being -overwritten with different update data using the same version tag as the -old update data) or (b), \code{NULL} if no versions are clobberable. 
The default -value is \code{max_version_with_row_in(x)}; this default assumes that (i) there -is at least one row in \code{x}, (ii) if an update row (even a -compactify-redundant update row) is present with version \code{ver}, then all -previous versions must be finalized and non-clobberable, although \code{ver} -(and onward) might still be modified, (iii) even if we have "observed" -empty updates for some versions beyond \code{max(x$version)} (as indicated by -\code{observed_versions_end}; see below), we can't assume \code{max(x$version)} has -been finalized, because we might see a nonfinalized version + empty -subsequent versions due to upstream database replication delays in -combination with the upstream replicas using last-version-carried-forward -to extrapolate that there were no updates, and (iv) "redundant" update rows -that would be removed by \code{compactify} are not redundant, and actually come -from an explicit version release that indicates that preceding versions are -finalized.} +same \code{class} and \code{typeof} as \code{x$version}, or an \code{NA} of any \code{class} and +\code{typeof}: specifically, either (a) the earliest version that could be +subject to "clobbering" (being overwritten with different update data using +the same version tag as the old update data), or (b) \code{NA}, to indicate that +no versions are clobberable. 
The default value is +\code{max_version_with_row_in(x)}; this default assumes that (i) there is at +least one row in \code{x}, (ii) if an update row (even a compactify-redundant +update row) is present with version \code{ver}, then all previous versions must +be finalized and non-clobberable, although \code{ver} (and onward) might still +be modified, (iii) even if we have "observed" empty updates for some +versions beyond \code{max(x$version)} (as indicated by \code{observed_versions_end}; +see below), we can't assume \code{max(x$version)} has been finalized, because we +might see a nonfinalized version + empty subsequent versions due to +upstream database replication delays in combination with the upstream +replicas using last-version-carried-forward to extrapolate that there were +no updates, and (iv) "redundant" update rows that would be removed by +\code{compactify} are not redundant, and actually come from an explicit version +release that indicates that preceding versions are finalized.} -\item{\code{observed_versions_end}}{Optional; a value of the same \code{class} and -\code{typeof} as \code{x$version}: what is the last version we have observed? The -default is \code{max_version_with_row_in(x)}, but values greater than this could -also be valid, and would indicate that we observed additional versions of -the data beyond \code{max(x$version)}, but they all contained empty updates. -(The default value of \code{clobberable_versions_start} does not fully trust -these empty updates, and assumes that any version \verb{>= max(x$version)} could -be clobbered.) If \code{nrow(x) == 0}, then this argument is mandatory.} +\item{\code{observed_versions_end}}{Optional; length-1, same \code{class} and \code{typeof} as +\code{x$version}: what is the last version we have observed? 
The default is +\code{max_version_with_row_in(x)}, but values greater than this could also be +valid, and would indicate that we observed additional versions of the data +beyond \code{max(x$version)}, but they all contained empty updates. (The default +value of \code{clobberable_versions_start} does not fully trust these empty +updates, and assumes that any version \verb{>= max(x$version)} could be +clobbered.) If \code{nrow(x) == 0}, then this argument is mandatory.} } \if{html}{\out{}} } diff --git a/man/max_version_with_row_in.Rd b/man/max_version_with_row_in.Rd index cce97f9d..0b2c6deb 100644 --- a/man/max_version_with_row_in.Rd +++ b/man/max_version_with_row_in.Rd @@ -10,7 +10,8 @@ max_version_with_row_in(x) \item{x}{\code{x} argument of \code{\link{as_epi_archive}}} } \value{ -\code{max(x$version)} if it has any rows; raises error if it has 0 rows +\code{max(x$version)} if it has any rows; raises error if it has 0 rows or +an \code{NA} version value } \description{ Exported to make defaults more easily copyable. From c65c538ef1104246c4e8eb6f8953c0ee500f48f6 Mon Sep 17 00:00:00 2001 From: kenmawer Date: Mon, 11 Jul 2022 10:19:04 -0700 Subject: [PATCH 71/96] This still won't pass checks. 
--- NAMESPACE | 6 ++++-- man/as_epi_archive.Rd | 12 ++++++++---- man/epi_archive.Rd | 36 ++++++++++++++++++------------------ man/epi_slide.Rd | 12 ++++++++---- man/epix_as_of.Rd | 12 ++++++++---- man/epix_merge.Rd | 12 ++++++++---- man/epix_slide.Rd | 12 ++++++++---- man/next_after.Rd | 17 +++++++++++++++++ 8 files changed, 79 insertions(+), 40 deletions(-) create mode 100644 man/next_after.Rd diff --git a/NAMESPACE b/NAMESPACE index 4aeee63c..ac8e6632 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,5 +1,6 @@ # Generated by roxygen2: do not edit by hand +S3method("[",epi_df) S3method(arrange,epi_df) S3method(as_epi_df,data.frame) S3method(as_epi_df,epi_df) @@ -10,6 +11,8 @@ S3method(filter,epi_df) S3method(group_by,epi_df) S3method(group_modify,epi_df) S3method(mutate,epi_df) +S3method(next_after,Date) +S3method(next_after,integer) S3method(print,epi_df) S3method(relocate,epi_df) S3method(rename,epi_df) @@ -41,6 +44,7 @@ export(is_epi_df) export(max_version_with_row_in) export(mutate) export(new_epi_df) +export(next_after) export(relocate) export(rename) export(slice) @@ -81,5 +85,3 @@ importFrom(tidyr,unnest) importFrom(tidyselect,eval_select) importFrom(tidyselect,starts_with) importFrom(tsibble,as_tsibble) -importFrom(utils,head) -importFrom(utils,tail) diff --git a/man/as_epi_archive.Rd b/man/as_epi_archive.Rd index f10cc3b6..b9f30dcc 100644 --- a/man/as_epi_archive.Rd +++ b/man/as_epi_archive.Rd @@ -59,11 +59,15 @@ examples. } \details{ This simply a wrapper around the \code{new()} method of the \code{epi_archive} -class, so for example:\preformatted{x <- as_epi_archive(df, geo_type = "state", time_type = "day") -} +class, so for example: -would be equivalent to:\preformatted{x <- epi_archive$new(df, geo_type = "state", time_type = "day") -} +\if{html}{\out{
}}\preformatted{x <- as_epi_archive(df, geo_type = "state", time_type = "day") +}\if{html}{\out{
}} + +would be equivalent to: + +\if{html}{\out{
}}\preformatted{x <- epi_archive$new(df, geo_type = "state", time_type = "day") +}\if{html}{\out{
}} } \examples{ # Simple ex. with necessary keys diff --git a/man/epi_archive.Rd b/man/epi_archive.Rd index ed57842d..83af9248 100644 --- a/man/epi_archive.Rd +++ b/man/epi_archive.Rd @@ -100,17 +100,17 @@ toy_epi_archive \section{Methods}{ \subsection{Public methods}{ \itemize{ -\item \href{#method-new}{\code{epi_archive$new()}} -\item \href{#method-print}{\code{epi_archive$print()}} -\item \href{#method-as_of}{\code{epi_archive$as_of()}} -\item \href{#method-merge}{\code{epi_archive$merge()}} -\item \href{#method-slide}{\code{epi_archive$slide()}} -\item \href{#method-clone}{\code{epi_archive$clone()}} +\item \href{#method-epi_archive-new}{\code{epi_archive$new()}} +\item \href{#method-epi_archive-print}{\code{epi_archive$print()}} +\item \href{#method-epi_archive-as_of}{\code{epi_archive$as_of()}} +\item \href{#method-epi_archive-merge}{\code{epi_archive$merge()}} +\item \href{#method-epi_archive-slide}{\code{epi_archive$slide()}} +\item \href{#method-epi_archive-clone}{\code{epi_archive$clone()}} } } \if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-new}{}}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-epi_archive-new}{}}} \subsection{Method \code{new()}}{ Creates a new \code{epi_archive} object. \subsection{Usage}{ @@ -205,8 +205,8 @@ An \code{epi_archive} object. } } \if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-print}{}}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-epi_archive-print}{}}} \subsection{Method \code{print()}}{ \subsection{Usage}{ \if{html}{\out{
}}\preformatted{epi_archive$print()}\if{html}{\out{
}} @@ -214,8 +214,8 @@ An \code{epi_archive} object. } \if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-as_of}{}}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-epi_archive-as_of}{}}} \subsection{Method \code{as_of()}}{ Generates a snapshot in \code{epi_df} format as of a given version. See the documentation for the wrapper function \code{epix_as_of()} for details. @@ -225,8 +225,8 @@ See the documentation for the wrapper function \code{epix_as_of()} for details. } \if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-merge}{}}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-epi_archive-merge}{}}} \subsection{Method \code{merge()}}{ Merges another \code{data.table} with the current one, and allows for a post-filling of \code{NA} values by last observation carried forward (LOCF). @@ -237,8 +237,8 @@ See the documentation for the wrapper function \code{epix_merge()} for details. } \if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-slide}{}}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-epi_archive-slide}{}}} \subsection{Method \code{slide()}}{ Slides a given function over variables in an \code{epi_archive} object. See the documentation for the wrapper function \code{epix_as_of()} for @@ -260,8 +260,8 @@ details. } \if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-clone}{}}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-epi_archive-clone}{}}} \subsection{Method \code{clone()}}{ The objects of this class are cloneable with this method. \subsection{Usage}{ diff --git a/man/epi_slide.Rd b/man/epi_slide.Rd index 7e76edc6..b5098c19 100644 --- a/man/epi_slide.Rd +++ b/man/epi_slide.Rd @@ -104,12 +104,16 @@ incomplete windows) is therefore left up to the user, either through the specified function or formula \code{f}, or through post-processing. If \code{f} is missing, then an expression for tidy evaluation can be specified, -for example, as in:\preformatted{epi_slide(x, cases_7dav = mean(cases), n = 7) -} +for example, as in: + +\if{html}{\out{
}}\preformatted{epi_slide(x, cases_7dav = mean(cases), n = 7) +}\if{html}{\out{
}} -which would be equivalent to:\preformatted{epi_slide(x, function(x, ...) mean(x$cases), n = 7, +which would be equivalent to: + +\if{html}{\out{
}}\preformatted{epi_slide(x, function(x, ...) mean(x$cases), n = 7, new_col_name = "cases_7dav") -} +}\if{html}{\out{
}} Thus, to be clear, when the computation is specified via an expression for tidy evaluation (first example, above), then the name for the new column is diff --git a/man/epix_as_of.Rd b/man/epix_as_of.Rd index 117135d6..e8ca2f47 100644 --- a/man/epix_as_of.Rd +++ b/man/epix_as_of.Rd @@ -29,11 +29,15 @@ examples. } \details{ This is simply a wrapper around the \code{as_of()} method of the -\code{epi_archive} class, so if \code{x} is an \code{epi_archive} object, then:\preformatted{epix_as_of(x, max_version = v) -} +\code{epi_archive} class, so if \code{x} is an \code{epi_archive} object, then: -is equivalent to:\preformatted{x$as_of(max_version = v) -} +\if{html}{\out{
}}\preformatted{epix_as_of(x, max_version = v) +}\if{html}{\out{
}} + +is equivalent to: + +\if{html}{\out{
}}\preformatted{x$as_of(max_version = v) +}\if{html}{\out{
}} } \examples{ # warning message of data latency shown diff --git a/man/epix_merge.Rd b/man/epix_merge.Rd index 330e7615..2df640e0 100644 --- a/man/epix_merge.Rd +++ b/man/epix_merge.Rd @@ -35,11 +35,15 @@ examples. } \details{ This is simply a wrapper around the \code{merge()} method of the -\code{epi_archive} class, so if \code{x} and \code{y} are an \code{epi_archive} objects, then:\preformatted{epix_merge(x, y) -} +\code{epi_archive} class, so if \code{x} and \code{y} are an \code{epi_archive} objects, then: -is equivalent to:\preformatted{x$merge(y) -} +\if{html}{\out{
}}\preformatted{epix_merge(x, y) +}\if{html}{\out{
}} + +is equivalent to: + +\if{html}{\out{
}}\preformatted{x$merge(y) +}\if{html}{\out{
}} } \examples{ # create two example epi_archive datasets diff --git a/man/epix_slide.Rd b/man/epix_slide.Rd index f01a0a71..b6f7a323 100644 --- a/man/epix_slide.Rd +++ b/man/epix_slide.Rd @@ -115,11 +115,15 @@ should never be used in place of \code{epi_slide()}, and only used when version-aware sliding is necessary (as it its purpose). Finally, this is simply a wrapper around the \code{slide()} method of the -\code{epi_archive} class, so if \code{x} is an \code{epi_archive} object, then:\preformatted{epix_slide(x, new_var = comp(old_var), n = 120) -} +\code{epi_archive} class, so if \code{x} is an \code{epi_archive} object, then: -is equivalent to:\preformatted{x$slide(x, new_var = comp(old_var), n = 120) -} +\if{html}{\out{
}}\preformatted{epix_slide(x, new_var = comp(old_var), n = 120) +}\if{html}{\out{
}} + +is equivalent to: + +\if{html}{\out{
}}\preformatted{x$slide(x, new_var = comp(old_var), n = 120) +}\if{html}{\out{
}} } \examples{ # these dates are reference time points for the 3 day average sliding window diff --git a/man/next_after.Rd b/man/next_after.Rd new file mode 100644 index 00000000..5170e8d9 --- /dev/null +++ b/man/next_after.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/archive.R +\name{next_after} +\alias{next_after} +\title{Get the next possible value greater than \code{x} of the same type} +\usage{ +next_after(x) +} +\arguments{ +\item{x}{the starting "value"(s)} +} +\value{ +same class, typeof, and length as \code{x} +} +\description{ +Get the next possible value greater than \code{x} of the same type +} From f82621813138124901c2fe5b842fce1c227f860d Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Thu, 21 Jul 2022 13:01:30 -0700 Subject: [PATCH 72/96] Add epix_fill_through_version, overhaul epix_merge; needs extra work Extra work: finish writing tests, review and fixup vignettes, rebuild the example archive's DT and make sure it doesn't differ in content from the old version minus the clobberable versions addition, then commit the new version. 
--- DESCRIPTION | 3 +- NAMESPACE | 8 +- R/archive.R | 146 ++++++--- R/data.R | 10 +- R/methods-epi_archive.R | 301 +++++++++++++++--- data-raw/archive_cases_dv_subset.R | 10 +- man/epi_archive.Rd | 82 +++-- man/epix_as_of.Rd | 4 +- man/epix_fill_through_version.Rd | 44 +++ man/epix_merge.Rd | 76 +++-- man/next_after.Rd | 17 + .../testthat/test-epix_fill_through_version.R | 80 +++++ tests/testthat/test-epix_merge.R | 56 ++++ 13 files changed, 692 insertions(+), 145 deletions(-) create mode 100644 man/epix_fill_through_version.Rd create mode 100644 man/next_after.Rd create mode 100644 tests/testthat/test-epix_fill_through_version.R create mode 100644 tests/testthat/test-epix_merge.R diff --git a/DESCRIPTION b/DESCRIPTION index 00b78130..b0b83ca0 100755 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -46,7 +46,8 @@ Suggests: knitr, outbreaks, rmarkdown, - testthat (>= 3.0.0) + testthat (>= 3.0.0), + waldo (>= 0.3.1) VignetteBuilder: knitr Remotes: diff --git a/NAMESPACE b/NAMESPACE index b45e8285..8cda18b8 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -11,6 +11,8 @@ S3method(group_by,epi_df) S3method(group_modify,epi_df) S3method(head,epi_df) S3method(mutate,epi_df) +S3method(next_after,Date) +S3method(next_after,integer) S3method(print,epi_df) S3method(relocate,epi_df) S3method(rename,epi_df) @@ -43,17 +45,18 @@ export(is_epi_df) export(max_version_with_row_in) export(mutate) export(new_epi_df) +export(next_after) export(relocate) export(rename) export(slice) export(ungroup) export(unnest) importFrom(R6,R6Class) +importFrom(data.table,":=") importFrom(data.table,as.data.table) importFrom(data.table,between) importFrom(data.table,key) -importFrom(data.table,merge.data.table) -importFrom(data.table,nafill) +importFrom(data.table,set) importFrom(data.table,setkeyv) importFrom(dplyr,arrange) importFrom(dplyr,filter) @@ -72,6 +75,7 @@ importFrom(rlang,"!!!") importFrom(rlang,"!!") importFrom(rlang,.data) importFrom(rlang,.env) +importFrom(rlang,arg_match) importFrom(rlang,enquo) 
importFrom(rlang,enquos) importFrom(rlang,is_quosure) diff --git a/R/archive.R b/R/archive.R index b1734572..d9aff82e 100644 --- a/R/archive.R +++ b/R/archive.R @@ -8,8 +8,8 @@ #' Validate a version bound arg #' -#' Expected to be used on `clobberable_versions_start` and -#' `observed_versions_end`. Some additional checks are needed. +#' Expected to be used on `clobberable_versions_start`, `observed_versions_end`, +#' and similar arguments. Some additional context-specific checks may be needed. #' #' @param version_bound the version bound to validate #' @param x a data frame containing a version column with which to check @@ -136,7 +136,11 @@ next_after.Date = function(x) x + 1L #' **A word of caution:** R6 objects, unlike most other objects in R, have #' reference semantics. A primary consequence of this is that objects are not #' copied when modified. You can read more about this in Hadley Wickham's -#' [Advanced R](https://adv-r.hadley.nz/r6.html#r6-semantics) book. +#' [Advanced R](https://adv-r.hadley.nz/r6.html#r6-semantics) book. In order +#' to construct a modified archive while keeping the original intact, first +#' make a clone using the `$clone` method, then overwrite the clone's `DT` +#' field with `data.table::copy(clone$DT)`, and finally perform the +#' modifications on the clone. #' #' @section Metadata: #' The following pieces of metadata are included as fields in an `epi_archive` @@ -158,7 +162,7 @@ next_after.Date = function(x) x + 1L #' `epi_df` format, which represents the most up-to-date values of the signal #' variables, as of the specified version. This is accomplished by calling the #' `as_of()` method for an `epi_archive` object `x`. More details on this -#' method are documented in the wrapper function `epix_as_of()`. +#' method are documented in the wrapper function [`epix_as_of()`]. 
#' #' @section Sliding Computations: #' We can run a sliding computation over an `epi_archive` object, much like @@ -168,8 +172,8 @@ next_after.Date = function(x) x + 1L #' difference: it is version-aware. That is, for an `epi_archive` object, the #' sliding computation at any given reference time point t is performed on #' **data that would have been available as of t**. More details on `slide()` -#' are documented in the wrapper function `epix_slide()`. -#' +#' are documented in the wrapper function [`epix_slide()`]. +#' #' @importFrom R6 R6Class #' @export epi_archive = @@ -216,22 +220,26 @@ epi_archive = #' @param clobberable_versions_start Optional; `length`-1; either a value of the #' same `class` and `typeof` as `x$version`, or an `NA` of any `class` and #' `typeof`: specifically, either (a) the earliest version that could be -#' subject to "clobbering" (being overwritten with different update data using -#' the same version tag as the old update data), or (b) `NA`, to indicate that -#' no versions are clobberable. The default value is -#' `max_version_with_row_in(x)`; this default assumes that (i) there is at -#' least one row in `x`, (ii) if an update row (even a compactify-redundant -#' update row) is present with version `ver`, then all previous versions must +#' subject to "clobbering" (being overwritten with different update data, but +#' using the same version tag as the old update data), or (b) `NA`, to +#' indicate that no versions are clobberable. There are a variety of reasons +#' why versions could be clobberable, such as upstream hotfixes to the latest +#' version, or delays in data synchronization that were mistaken for versions +#' with no updates; potential causes vary between different data pipelines. 
+#' The default value is `max_version_with_row_in(x)`; this default assumes +#' that (i) if a row in `x` (even one that `compactify` would consider +#' redundant) is present with version `ver`, then all previous versions must #' be finalized and non-clobberable, although `ver` (and onward) might still -#' be modified, (iii) even if we have "observed" empty updates for some +#' be modified, (ii) even if we have "observed" empty updates for some #' versions beyond `max(x$version)` (as indicated by `observed_versions_end`; #' see below), we can't assume `max(x$version)` has been finalized, because we #' might see a nonfinalized version + empty subsequent versions due to #' upstream database replication delays in combination with the upstream #' replicas using last-version-carried-forward to extrapolate that there were -#' no updates, and (iv) "redundant" update rows that would be removed by +#' no updates, (iii) "redundant" update rows that would be removed by #' `compactify` are not redundant, and actually come from an explicit version -#' release that indicates that preceding versions are finalized. +#' release that indicates that preceding versions are finalized. If `nrow(x) +#' == 0`, then this argument is mandatory. #' @param observed_versions_end Optional; length-1, same `class` and `typeof` as #' `x$version`: what is the last version we have observed? The default is #' `max_version_with_row_in(x)`, but values greater than this could also be @@ -431,7 +439,7 @@ epi_archive = }, ##### #' @description Generates a snapshot in `epi_df` format as of a given version. -#' See the documentation for the wrapper function `epix_as_of()` for details. +#' See the documentation for the wrapper function [`epix_as_of()`] for details. 
#' @importFrom data.table between key as_of = function(max_version, min_time_value = -Inf) { # Self max version and other keys @@ -454,7 +462,7 @@ epi_archive = Abort("`max_version` must be at most `self$observed_versions_end`.") } if (!is.na(self$clobberable_versions_start) && max_version >= self$clobberable_versions_start) { - Warn('Getting data as of some "clobberable" version (`>= self$clobberable_versions_start`). For a variety of reasons, it is possible that we only have a preliminary picture of this version (e.g., the upstream source has updated it but we have not seen it due to latency in synchronization). Thus, the snapshot that we produce here might not be reproducible at a later time (e.g., when the archive has caught up in terms of synchronization and "clobbered" the current picture of this version). (You can muffle this warning with `withCallingHandlers({}, "epiprocess__snapshot_as_of_clobberable_version"=function(wrn) invokeRestart("muffleWarning"))`.)', + Warn('Getting data as of some "clobberable" version that might be hotfixed, synced, or otherwise replaced later with different data using the same version tag. Thus, the snapshot that we produce here might not be reproducible later. See `?epi_archive` for more info and `?epix_as_of` on how to muffle.', class="epiprocess__snapshot_as_of_clobberable_version") } @@ -475,40 +483,88 @@ epi_archive = ) }, ##### -#' @description Merges another `data.table` with the current one, and allows for -#' a post-filling of `NA` values by last observation carried forward (LOCF). -#' See the documentation for the wrapper function `epix_merge()` for details. 
-#' @importFrom data.table key merge.data.table nafill - merge = function(y, ..., locf = TRUE, nan = NA) { - # FIXME need to do something with the version bounds - - # Check we have a `data.table` object - if (!(inherits(y, "data.table") || inherits(y, "epi_archive"))) { - Abort("`y` must be of class `data.table` or `epi_archive`.") +#' @description Fill in unobserved history using requested scheme by mutating +#' `self` and potentially reseating its fields. See +#' [`epix_fill_through_version`] for a full description of the non-R6-method +#' version, which doesn't mutate the input archive but might alias its fields. +#' +#' @param fill_versions_end as in [`epix_fill_through_version`] +#' @param how as in [`epix_fill_through_version`] +#' +#' @importFrom data.table key setkeyv := +#' @importFrom rlang arg_match + fill_through_version = function(fill_versions_end, + how=c("na", "lvcf")) { + validate_version_bound(fill_versions_end, self$DT, na_ok=FALSE) + how <- arg_match(how) + if (self$observed_versions_end < fill_versions_end) { + new_DT = switch( + how, + "na" = { + # old DT + a version consisting of all NA observations + # immediately after the last currently/actually-observed + # version. Note that this NA-observation version must only be + # added if `self` is outdated. + nonversion_key_cols = setdiff(key(self$DT), "version") + nonkey_cols = setdiff(names(self$DT), key(self$DT)) + next_version_tag = next_after(self$observed_versions_end) + if (next_version_tag > fill_versions_end) { + Abort(sprintf(paste( + "Apparent problem with `next_after` implementation:", + "archive contained observations through version %s", + "and the next possible version was supposed to be %s,", + "but this appeared to jump from a version < %3$s", + "to one > %3$s, implying at least one version in between." 
+ ), self$observed_versions_end, next_version_tag, fill_versions_end)) + } + next_version_DT = unique(self$DT, by=nonversion_key_cols)[ + , version := next_version_tag][ + , (nonkey_cols) := NA] + setkeyv(rbind(self$DT, next_version_DT), key(self$DT))[] + }, + "lvcf" = { + # just the old DT; LVCF is built into other methods: + self$DT + } + ) + new_observed_versions_end = fill_versions_end + # Update `self` all at once with simple, error-free operations + + # return below: + self$DT <- new_DT + self$observed_versions_end <- new_observed_versions_end + } else { + # Already sufficiently up to date; nothing to do. } + return (invisible(self)) + }, + ##### +#' @description Merges another `epi_archive` with the current one, overwriting +#' fields of `self`, reseating its fields, and returning invisibly. See +#' [`epix_merge`] for a full description of the non-R6-method version, which +#' does not overwrite, and does not alias either archive's `DT`. +#' @param y as in [`epix_merge`] +#' @param observed_versions_end_conflict as in [`epix_merge`] +#' @param compactify as in [`epix_merge`] + merge = function(y, observed_versions_end_conflict = c("stop","na","lvcf","truncate"), compactify=TRUE) { + result = epix_merge(self, y, + observed_versions_end_conflict = observed_versions_end_conflict, + compactify = compactify) - # Use the data.table merge function, carrying through ... args - if (inherits(y, "data.table")) self$DT = merge(self$DT, y, ...) - else self$DT = merge(self$DT, y$DT, ...) 
+ if (length(epi_archive$private_fields) != 0L) { + Abort("expected no private fields in epi_archive", + internal=TRUE) + } - # Now use the data.table locf function, if we're asked to - if (locf) { - key_vars = key(self$DT) - cols = setdiff(names(self$DT), key_vars) - by = setdiff(key_vars, "version") + # Mutate fields all at once, trying to avoid any potential errors: + for (field_name in names(epi_archive$public_fields)) { + self[[field_name]] <- result[[field_name]] + } - # Important: use nafill and not setnafill because the latter - # returns the entire data frame by reference, and the former can - # be set to act on particular columns by reference using := - self$DT[, - (cols) := nafill(.SD, type = "locf", nan = nan), - .SDcols = cols, - by = by] - } - }, + return (invisible(self)) + }, ##### #' @description Slides a given function over variables in an `epi_archive` -#' object. See the documentation for the wrapper function `epix_as_of()` for +#' object. See the documentation for the wrapper function [`epix_slide()`] for #' details. #' @importFrom data.table key #' @importFrom rlang !! !!! enquo enquos is_quosure sym syms diff --git a/R/data.R b/R/data.R index d97e751d..c15bb96d 100644 --- a/R/data.R +++ b/R/data.R @@ -170,7 +170,7 @@ delayed_assign_with_unregister_awareness = function(x, value, }) } -# Like normal data objects, set `archive_cases_dv` up as a promise, so it +# Like normal data objects, set `archive_cases_dv_subset` up as a promise, so it # doesn't take unnecessary space before it's evaluated. This also avoids a need # for @include tags. 
However, this pattern will use unnecessary space after this # promise is evaluated, because `as_epi_archive` clones `archive_cases_dv_subset_dt` @@ -181,6 +181,14 @@ delayed_assign_with_unregister_awareness = function(x, value, # `DT` field from the user (make it non-`public` in general) or make it # read-only (in this specific case), so that the user cannot modify the `DT` # here and potentially mess up examples that they refer to later on. +# +# During development, note that reloading the package and re-evaluating this +# promise should prepare the archive from the DT using any changes that have +# been made to `as_epi_archive`; however, if earlier, any field of +# `archive_cases_dv_subset` was modified using `<-`, a global environment +# binding may have been created with the same name as the package promise, and +# this binding will stick around even when the package is reloaded, and will +# need to be `rm`-d to easily access the refreshed package promise. delayed_assign_with_unregister_awareness("archive_cases_dv_subset", as_epi_archive(archive_cases_dv_subset_dt, compactify=FALSE)) #' Subset of JHU daily cases from California and Florida diff --git a/R/methods-epi_archive.R b/R/methods-epi_archive.R index 14888e08..a89cbf96 100644 --- a/R/methods-epi_archive.R +++ b/R/methods-epi_archive.R @@ -51,60 +51,281 @@ #' withCallingHandlers({ #' epix_as_of(x = archive_cases_dv_subset, #' max_version = max(archive_cases_dv_subset$DT$version)) -#' }, epiprocess__snapshot_as_of_last_update_version = function(wrn) invokeRestart("muffleWarning")) +#' }, epiprocess__snapshot_as_of_clobberable_version = function(wrn) invokeRestart("muffleWarning")) +#' # Since R 4.0, there is a `globalCallingHandlers` function that can be used +#' # to globally toggle these warnings. 
epix_as_of = function(x, max_version, min_time_value = -Inf) {
 if (!inherits(x, "epi_archive")) Abort("`x` must be of class `epi_archive`.")
 return(x$as_of(max_version, min_time_value))
}
 
+#' `epi_archive` with unobserved history filled in (won't mutate, might alias)
+#'
+#' Sometimes, due to upstream data pipeline issues, we have to work with a
+#' version history that isn't completely up to date, but with functions that
+#' expect archives that are completely up to date, or equally as up-to-date as
+#' another archive. This function provides one way to approach such mismatches:
+#' pretend that we've "observed" additional versions, filling in these versions
+#' with NAs or extrapolated values.
+#'
+#' `epix_fill_through_version` will not mutate its `x` argument, but its result
+#' might alias fields of `x` (e.g., mutating the result's `DT` might mutate
+#' `x$DT`). The R6 method variant, `x$fill_through_version`, will mutate `x` to
+#' give the result, but might reseat its fields (e.g., references to the old
+#' `x$DT` might not be updated by this function or subsequent operations on
+#' `x`), and returns the updated `x` \link[base:invisible]{invisibly}.
+#'
+#' @param x An `epi_archive`
+#' @param fill_versions_end Length-1, same class&type as `x$version`: the
+#' version through which to fill in missing version history; this will be the
+#' result's `$observed_versions_end` unless it already had a later
+#' `$observed_versions_end`. 
+#' @param how Optional; `"na"` or `"lvcf"`: `"na"` will fill in any missing +#' required version history with `NA`s, by inserting (if necessary) an update +#' immediately after the current `$observed_versions_end` that revises all +#' existing measurements to be `NA` (this is only supported for `version` +#' classes with a `next_after` implementation); `"lvcf"` will fill in missing +#' version history with the last version of each observation carried forward +#' (LVCF), by leaving the update `$DT` alone (other `epi_archive` methods are +#' based on LVCF). Default is `"na"`. +#' @return An `epi_archive` +epix_fill_through_version = function(x, fill_versions_end, + how=c("na", "lvcf")) { + if (!inherits(x, "epi_archive")) Abort("`x` must be of class `epi_archive`.") + # Enclosing parentheses drop the invisibility flag. See description above of + # potential mutation and aliasing behavior. + ( x$clone()$fill_through_version(fill_versions_end, how=how) ) +} + #' Merge two `epi_archive` objects #' -#' Merges the underlying data tables in two `epi_archive` objects, allows for -#' post-filling of `NA` values by last observation carried forward (LOCF), and -#' **overwrites** the first data table with the merged one. See the [archive -#' vignette](https://cmu-delphi.github.io/epiprocess/articles/archive.html) for -#' examples. +#' Merges two `epi_archive`s that share a common `geo_value`, `time_value`, and +#' set of key columns. When they also share a common `observed_versions_end`, +#' using `$as_of` on the result should be the same as using `$as_of` on `x` and +#' `y` individually, then performing a full join of the `DT`s on the non-version +#' key columns (potentially consolidating multiple warnings about clobberable +#' versions). If the `observed_versions_end` values differ, the +#' `observed_versions_end_conflict` parameter controls what is done. 
#' -#' @param x,y Two `epi_archive` objects to join together, or more specifically, -#' whose underlying data tables are to be joined together. The data table in -#' `x` will be overwritten with the joined data table. For convenience, we -#' also allow `y` to be passed in directly as a `data.table` (need not be an -#' `epi_archive` object). -#' @param ... Named arguments to pass to `data.table::merge.data.table()`, which -#' is used for the join (with all default settings as in this function). For -#' example, passing `all = TRUE` will perform a full join. -#' @param locf Should LOCF be used after joining on all non-key columns? This -#' will take the latest version of each signal value and propogate it forward -#' to fill in gaps that appear after merging. Default is `TRUE`. -#' @param nan Should `NaN` values be treated as `NA` values in the post-filling -#' step? Default is `NA`, which means that they are treated as `NA` values; if -# `NaN`, then they are treated as distinct. -#' @return Nothing; the data table in `x` is overwritten with the merged one. -#' -#' @details This is simply a wrapper around the `merge()` method of the -#' `epi_archive` class, so if `x` and `y` are an `epi_archive` objects, then: -#' ``` -#' epix_merge(x, y) -#' ``` -#' is equivalent to: -#' ``` -#' x$merge(y) -#' ``` +#' This function, [`epix_merge`], does not mutate its inputs and will not alias +#' either archive's `DT`, but may alias other fields; `x$merge` will overwrite +#' `x` with the result of the merge, reseating its fields. +#' +#' @param x,y Two `epi_archive` objects to join together. 
+#' @param observed_versions_end_conflict Optional; `"stop"`, `"na"`, `"lvcf"`,
+#' or `"truncate"`; in the case that `x$observed_versions_end` doesn't match
+#' `y$observed_versions_end`, what do we do?: `"stop"`: emit an error; "na":
+#' use `max(x$observed_versions_end, y$observed_versions_end)`, but in the
+#' less up-to-date input archive, imagine there was an update immediately
+#' after its last observed version which revised all observations to be `NA`;
+#' `"lvcf"`: use `max(x$observed_versions_end, y$observed_versions_end)`, and
+#' last-version-carried-forward extrapolation to invent update data for the
+#' less up-to-date input archive; or `"truncate"`: use
+#' `min(x$observed_versions_end, y$observed_versions_end)` and discard any
+#' rows containing update rows for later versions.
+#' @param compactify Optional; `TRUE`, `FALSE`, or `NULL`; should the result be
+#' compactified? See [`as_epi_archive`] for an explanation of what this means.
+#' Default here is `TRUE`.
+#' @return the resulting `epi_archive`
+#'
+#' @details In all cases, `additional_metadata` will be an empty list, and
+#' `clobberable_versions_start` will be set to the earliest version that could
+#' be clobbered in either input archive. The result's `DT` will always be
+#' compactified. 
#' -#' @export #' @examples #' # create two example epi_archive datasets -#' x <- archive_cases_dv_subset$DT %>% -#' dplyr::select(geo_value,time_value,version,case_rate_7d_av) %>% +#' x <- archive_cases_dv_subset$DT %>% +#' dplyr::select(geo_value,time_value,version,case_rate_7d_av) %>% #' as_epi_archive(compactify=TRUE) -#' y <- archive_cases_dv_subset$DT %>% -#' dplyr::select(geo_value,time_value,version,percent_cli) %>% +#' y <- archive_cases_dv_subset$DT %>% +#' dplyr::select(geo_value,time_value,version,percent_cli) %>% #' as_epi_archive(compactify=TRUE) #' # a full join stored in x -#' epix_merge(x, y, all = TRUE) -epix_merge = function(x, y, ..., locf = TRUE, nan = NA) { - if (!inherits(x, "epi_archive")) Abort("`x` must be of class `epi_archive`.") - return(x$merge(y, ..., locf = locf, nan = nan)) +#' epix_merge(x, y) +#' +#' @importFrom data.table key set +#' @export +epix_merge = function(x, y, + observed_versions_end_conflict = c("stop","na","lvcf","truncate"), + compactify = TRUE) { + if (!inherits(x, "epi_archive")) { + Abort("`x` must be of class `epi_archive`.") + } + + if (!inherits(y, "epi_archive")) { + Abort("`y` must be of class `epi_archive`.") + } + + observed_versions_end_conflict <- rlang::arg_match(observed_versions_end_conflict) + + if (!identical(x$geo_type, y$geo_type)) { + Abort("`x` and `y` must have the same `$geo_type`") + } + + if (!identical(x$time_type, y$time_type)) { + Abort("`x` and `y` must have the same `$time_type`") + } + + if (length(x$additional_metadata) != 0L) { + Warn("x$additional_metadata will be dropped", + class = "epiprocess__epix_merge_drops_additional_metadata") + } + if (length(y$additional_metadata) != 0L) { + Warn("y$additional_metadata will be dropped", + class = "epiprocess__epix_merge_drops_additional_metadata") + } + result_additional_metadata = list() + + result_clobberable_versions_start = + if (all(is.na(c(x$clobberable_versions_start, y$clobberable_versions_start)))) { + NA # (any type of NA is fine 
here) + } else { + Min(c(x$clobberable_versions_start, y$clobberable_versions_start)) + } + + # The actual merge below may not succeed 100% of the time, so do this + # preprocessing using non-mutating (but potentially aliasing) functions. This + # approach potentially uses more memory, but won't leave behind a + # partially-mutated `x` on failure. + if (observed_versions_end_conflict == "stop") { + if (!identical(x$observed_versions_end, y$observed_versions_end)) { + Abort(paste( + "`x` and `y` were not equally up to date version-wise:", + "`x$observed_versions_end` was not identical to `y$observed_versions_end`;", + "either ensure that `x` and `y` are equally up to date before merging,", + "or specify how to deal with this using `observed_versions_end_conflict`" + )) + } else { + new_observed_versions_end = x$observed_versions_end + x_DT = x$DT + y_DT = y$DT + } + } else if (observed_versions_end_conflict %in% c("na", "lvcf")) { + new_observed_versions_end = max(x$observed_versions_end, y$observed_versions_end) + x_DT = epix_fill_through_version(x, new_observed_versions_end, observed_versions_end_conflict)$DT + y_DT = epix_fill_through_version(y, new_observed_versions_end, observed_versions_end_conflict)$DT + } else if (observed_versions_end_conflict == "truncate") { + new_observed_versions_end = min(x$observed_versions_end, y$observed_versions_end) + x_DT = x$DT[version <= ..new_observed_versions_end] + y_DT = y$DT[version <= ..new_observed_versions_end] + } else Abort("unimplemented") + + if (!identical(key(x$DT), key(x_DT)) || !identical(key(y$DT), key(y_DT))) { + Abort("preprocessing of data tables in merge changed the key unexpectedly", + internal=TRUE) + } + ## key(x_DT) should be the same as key(x$DT) and key(y_DT) should be the same + ## as key(y$DT). If we want to break this function into parts it makes sense + ## to use {x,y}_DT below, but this makes the error checks and messages look a + ## little weird and rely on the key-matching assumption above. 
+ if (!identical(sort(key(x_DT)), sort(key(y_DT)))) {
+ Abort("
+ The archives must have the same set of key column names; if the
+ key columns represent the same things, just with different
+ names, please retry after manually renaming to match; if they
+ represent different things (e.g., x has an age breakdown
+ but y does not), please retry after processing them to share
+ the same key (e.g., by summarizing x to remove the age breakdown,
+ or by applying a static age breakdown to y).
+ ", class="epiprocess__epix_merge_x_y_must_have_same_key_set")
+ # `by` cols = result (and each input's) `key` cols, and determine
+ # the row set, determined using a full join via `merge`
+ #
+ # non-`by` cols = "value"-ish cols, and are looked up with last
+ # version carried forward via rolling joins
+ by = key(x_DT) # = some perm of key(y_DT)
+ if (!all(c("geo_value","time_value","version") %in% key(x_DT))) {
+ Abort('Invalid `by`; `by` is currently set to the common `key` of
+ the two archives, and is expected to contain
+ "geo_value", "time_value", and "version".',
+ class="epiprocess__epi_archive_must_have_required_key_cols")
+ }
+ if (length(by) < 1L || tail(by, 1L) != "version") {
+ Abort('Invalid `by`; `by` is currently set to the common `key` of
+ the two archives, and is expected to have a "version" as
+ the last key col.',
+ class="epiprocess__epi_archive_must_have_version_at_end_of_key")
+ }
+ x_nonby_colnames = setdiff(names(x_DT), by)
+ y_nonby_colnames = setdiff(names(y_DT), by)
+ if (length(intersect(x_nonby_colnames, y_nonby_colnames)) != 0L) {
+ Abort("
+ `x` and `y` DTs have overlapping non-by column names;
+ this is currently not supported; please manually fix up first:
+ any overlapping columns that are key-like should be
+ incorporated into the key, and other columns should be renamed. 
+ ", class="epiprocess__epix_merge_x_y_must_not_have_overlapping_nonby_colnames") + } + x_by_vals = x_DT[, ..by] + if (anyDuplicated(x_by_vals) != 0L) { + Abort(" + The `by` columns must uniquely determine rows of `x$DT`; + the `by` is currently set to the common `key` of the two + archives, so this can be resolved by adding key-like columns + to `x`'s key (to get a unique key). + ", class="epiprocess__epix_merge_by_cols_must_act_as_unique_key") + } + y_by_vals = y_DT[, ..by] + if (anyDuplicated(y_by_vals) != 0L) { + Abort(" + The `by` columns must uniquely determine rows of `y$DT`; + the `by` is currently set to the common `key` of the two + archives, so this can be resolved by adding key-like columns + to `y`'s key (to get a unique key). + ", class="epiprocess__epix_merge_by_cols_must_act_as_unique_key") + } + result_DT = merge(x_by_vals, y_by_vals, by=by, + # We must have `all=TRUE` or we may skip updates + # from x and/or y and corrupt the history + all=TRUE, + # We don't want Cartesian products, but the + # by-is-unique-key check above already ensures + # this. (Note that `allow.cartesian=FALSE` doesn't + # actually catch all Cartesian products anyway.) + # Disable superfluous check: + allow.cartesian=TRUE) + set(result_DT,, x_nonby_colnames, + x_DT[result_DT[, ..by], ..x_nonby_colnames, + # It's good practice to specify `on`, and we must + # explicitly specify `on` if there's a potential key vs. + # by order mismatch (not possible currently for x + # with by = key(x$DT), but possible for y): + on = by, + # last version carried forward: + roll=TRUE, + # requesting non-version key that doesn't exist in the other archive, + # or before its first version, should result in NA + nomatch=NA, + # see note on `allow.cartesian` above; currently have a + # similar story here. 
+ allow.cartesian=TRUE]) + set(result_DT,, y_nonby_colnames, + y_DT[result_DT[, ..by], ..y_nonby_colnames, + on = by, + roll=TRUE, + nomatch=NA, + allow.cartesian=TRUE]) + # The key could be unset in case of a key vs. by order mismatch as + # noted above. Ensure that we keep it: + setkeyv(result_DT, by) + + return (as_epi_archive( + result_DT[], # clear data.table internal invisibility flag if set + geo_type = x$geo_type, + time_type = x$time_type, + other_keys = setdiff(key(result_DT), c("geo_value","time_value","version")), + additional_metadata = result_additional_metadata, + # it'd probably be better to pre-compactify before the merge, and might be + # guaranteed not to be necessary to compactify the merge result if the + # inputs are already compactified, but at time of writing we don't have + # compactify in its own method or field, and it seems like it should be + # pretty fast anyway. + compactify=compactify, + clobberable_versions_start = result_clobberable_versions_start, + observed_versions_end = new_observed_versions_end + )) } #' Slide a function over variables in an `epi_archive` object diff --git a/data-raw/archive_cases_dv_subset.R b/data-raw/archive_cases_dv_subset.R index e3c0001f..1af33e1d 100644 --- a/data-raw/archive_cases_dv_subset.R +++ b/data-raw/archive_cases_dv_subset.R @@ -3,7 +3,7 @@ library(epiprocess) library(data.table) library(dplyr) -archive_cases_dv_subset <- covidcast( +dv_subset <- covidcast( data_source = "doctor-visits", signals = "smoothed_adj_cli", time_type = "day", @@ -11,9 +11,11 @@ archive_cases_dv_subset <- covidcast( time_values = epirange(20200601, 20211201), geo_values = "ca,fl,ny,tx", issues = epirange(20200601, 20211201) -) %>% +) %>% fetch_tbl() %>% select(geo_value, time_value, version = issue, percent_cli = value) %>% + # We're using compactify=FALSE here and below to avoid some testthat test + # failures on tests that were based on a non-compactified version. 
as_epi_archive(compactify=FALSE) case_rate_subset <- covidcast( @@ -29,7 +31,9 @@ case_rate_subset <- covidcast( select(geo_value, time_value, version = issue, case_rate_7d_av = value) %>% as_epi_archive(compactify=FALSE) -epix_merge(archive_cases_dv_subset, case_rate_subset, all = TRUE) +archive_cases_dv_subset = epix_merge(dv_subset, case_rate_subset, + observed_versions_end_conflict="na", + compactify=FALSE) # If we directly store an epi_archive R6 object as data, it will store its class # implementation there as well. To prevent mismatches between these stored diff --git a/man/epi_archive.Rd b/man/epi_archive.Rd index b7d44acb..27b3c960 100644 --- a/man/epi_archive.Rd +++ b/man/epi_archive.Rd @@ -40,7 +40,11 @@ state is. \strong{A word of caution:} R6 objects, unlike most other objects in R, have reference semantics. A primary consequence of this is that objects are not copied when modified. You can read more about this in Hadley Wickham's -\href{https://adv-r.hadley.nz/r6.html#r6-semantics}{Advanced R} book. +\href{https://adv-r.hadley.nz/r6.html#r6-semantics}{Advanced R} book. In order +to construct a modified archive while keeping the original intact, first +make a clone using the \verb{$clone} method, then overwrite the clone's \code{DT} +field with \code{data.table::copy(clone$DT)}, and finally perform the +modifications on the clone. } \section{Metadata}{ @@ -66,7 +70,7 @@ An \code{epi_archive} object can be used to generate a snapshot of the data in \code{epi_df} format, which represents the most up-to-date values of the signal variables, as of the specified version. This is accomplished by calling the \code{as_of()} method for an \code{epi_archive} object \code{x}. More details on this -method are documented in the wrapper function \code{epix_as_of()}. +method are documented in the wrapper function \code{\link[=epix_as_of]{epix_as_of()}}. 
} \section{Sliding Computations}{ @@ -78,7 +82,7 @@ the way \code{epi_slide()} works for an \code{epi_df} object, but with one key difference: it is version-aware. That is, for an \code{epi_archive} object, the sliding computation at any given reference time point t is performed on \strong{data that would have been available as of t}. More details on \code{slide()} -are documented in the wrapper function \code{epix_slide()}. +are documented in the wrapper function \code{\link[=epix_slide]{epix_slide()}}. } \section{Methods}{ @@ -87,6 +91,7 @@ are documented in the wrapper function \code{epix_slide()}. \item \href{#method-epi_archive-new}{\code{epi_archive$new()}} \item \href{#method-epi_archive-print}{\code{epi_archive$print()}} \item \href{#method-epi_archive-as_of}{\code{epi_archive$as_of()}} +\item \href{#method-epi_archive-fill_through_version}{\code{epi_archive$fill_through_version()}} \item \href{#method-epi_archive-merge}{\code{epi_archive$merge()}} \item \href{#method-epi_archive-slide}{\code{epi_archive$slide()}} \item \href{#method-epi_archive-clone}{\code{epi_archive$clone()}} @@ -151,22 +156,25 @@ e.g., when fetching, storing, or preparing the input data \code{x}} \item{\code{clobberable_versions_start}}{Optional; \code{length}-1; either a value of the same \code{class} and \code{typeof} as \code{x$version}, or an \code{NA} of any \code{class} and \code{typeof}: specifically, either (a) the earliest version that could be -subject to "clobbering" (being overwritten with different update data using -the same version tag as the old update data), or (b) \code{NA}, to indicate that -no versions are clobberable. 
The default value is -\code{max_version_with_row_in(x)}; this default assumes that (i) there is at -least one row in \code{x}, (ii) if an update row (even a compactify-redundant -update row) is present with version \code{ver}, then all previous versions must +subject to "clobbering" (being overwritten with different update data, but +using the same version tag as the old update data), or (b) \code{NA}, to +indicate that no versions are clobberable. There are a variety of reasons +why versions could be clobberable, such as upstream hotfixes to the latest +version, or delays in data synchronization that were mistaken for versions +with no updates; potential causes vary between different data pipelines. +The default value is \code{max_version_with_row_in(x)}; this default assumes +that (i) if a row in \code{x} (even one that \code{compactify} would consider +redundant) is present with version \code{ver}, then all previous versions must be finalized and non-clobberable, although \code{ver} (and onward) might still -be modified, (iii) even if we have "observed" empty updates for some +be modified, (ii) even if we have "observed" empty updates for some versions beyond \code{max(x$version)} (as indicated by \code{observed_versions_end}; see below), we can't assume \code{max(x$version)} has been finalized, because we might see a nonfinalized version + empty subsequent versions due to upstream database replication delays in combination with the upstream replicas using last-version-carried-forward to extrapolate that there were -no updates, and (iv) "redundant" update rows that would be removed by +no updates, (iii) "redundant" update rows that would be removed by \code{compactify} are not redundant, and actually come from an explicit version -release that indicates that preceding versions are finalized.} +release that indicates that preceding versions are finalized. 
If \code{nrow(x) == 0}, then this argument is mandatory.} \item{\code{observed_versions_end}}{Optional; length-1, same \code{class} and \code{typeof} as \code{x$version}: what is the last version we have observed? The default is @@ -197,30 +205,68 @@ An \code{epi_archive} object. \if{latex}{\out{\hypertarget{method-epi_archive-as_of}{}}} \subsection{Method \code{as_of()}}{ Generates a snapshot in \code{epi_df} format as of a given version. -See the documentation for the wrapper function \code{epix_as_of()} for details. +See the documentation for the wrapper function \code{\link[=epix_as_of]{epix_as_of()}} for details. \subsection{Usage}{ \if{html}{\out{
}}\preformatted{epi_archive$as_of(max_version, min_time_value = -Inf)}\if{html}{\out{
}} } +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-epi_archive-fill_through_version}{}}} +\subsection{Method \code{fill_through_version()}}{ +Fill in unobserved history using requested scheme by mutating +\code{self} and potentially reseating its fields. See +\code{\link{epix_fill_through_version}} for a full description of the non-R6-method +version, which doesn't mutate the input archive but might alias its fields. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{epi_archive$fill_through_version(fill_versions_end, how = c("na", "lvcf"))}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{fill_versions_end}}{as in \code{\link{epix_fill_through_version}}} + +\item{\code{how}}{as in \code{\link{epix_fill_through_version}}} +} +\if{html}{\out{
}} +} } \if{html}{\out{
}} \if{html}{\out{}} \if{latex}{\out{\hypertarget{method-epi_archive-merge}{}}} \subsection{Method \code{merge()}}{ -Merges another \code{data.table} with the current one, and allows for -a post-filling of \code{NA} values by last observation carried forward (LOCF). -See the documentation for the wrapper function \code{epix_merge()} for details. +Merges another \code{epi_archive} with the current one, overwriting +fields of \code{self}, reseating its fields, and returning invisibly. See +\code{\link{epix_merge}} for a full description of the non-R6-method version, which +does not overwrite, and does not alias either archive's \code{DT}. \subsection{Usage}{ -\if{html}{\out{
}}\preformatted{epi_archive$merge(y, ..., locf = TRUE, nan = NA)}\if{html}{\out{
}} +\if{html}{\out{
}}\preformatted{epi_archive$merge( + y, + observed_versions_end_conflict = c("stop", "na", "lvcf", "truncate"), + compactify = TRUE +)}\if{html}{\out{
}} } +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{y}}{as in \code{\link{epix_merge}}} + +\item{\code{observed_versions_end_conflict}}{as in \code{\link{epix_merge}}} + +\item{\code{compactify}}{as in \code{\link{epix_merge}}} +} +\if{html}{\out{
}} +} } \if{html}{\out{
}} \if{html}{\out{}} \if{latex}{\out{\hypertarget{method-epi_archive-slide}{}}} \subsection{Method \code{slide()}}{ Slides a given function over variables in an \code{epi_archive} -object. See the documentation for the wrapper function \code{epix_as_of()} for +object. See the documentation for the wrapper function \code{\link[=epix_slide]{epix_slide()}} for details. \subsection{Usage}{ \if{html}{\out{
}}\preformatted{epi_archive$slide( diff --git a/man/epix_as_of.Rd b/man/epix_as_of.Rd index e8ca2f47..214fb680 100644 --- a/man/epix_as_of.Rd +++ b/man/epix_as_of.Rd @@ -61,5 +61,7 @@ epix_as_of(x = archive_cases_dv_subset, withCallingHandlers({ epix_as_of(x = archive_cases_dv_subset, max_version = max(archive_cases_dv_subset$DT$version)) -}, epiprocess__snapshot_as_of_last_update_version = function(wrn) invokeRestart("muffleWarning")) +}, epiprocess__snapshot_as_of_clobberable_version = function(wrn) invokeRestart("muffleWarning")) +# Since R 4.0, there is a `globalCallingHandlers` function that can be used +# to globally toggle these warnings. } diff --git a/man/epix_fill_through_version.Rd b/man/epix_fill_through_version.Rd new file mode 100644 index 00000000..929dfda5 --- /dev/null +++ b/man/epix_fill_through_version.Rd @@ -0,0 +1,44 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/methods-epi_archive.R +\name{epix_fill_through_version} +\alias{epix_fill_through_version} +\title{\code{epi_archive} with unobserved history filled in (won't mutate, might alias)} +\usage{ +epix_fill_through_version(x, fill_versions_end, how = c("na", "lvcf")) +} +\arguments{ +\item{x}{An \code{epi_archive}} + +\item{fill_versions_end}{Length-1, same class&type as \verb{\%s$version}: the +version through which to fill in missing version history; this will be the +result's \verb{$observed_versions_end} unless it already had a later +\verb{$observed_versions_end}.} + +\item{how}{Optional; \code{"na"} or \code{"lvcf"}: \code{"na"} will fill in any missing +required version history with \code{NA}s, by inserting (if necessary) an update +immediately after the current \verb{$observed_versions_end} that revises all +existing measurements to be \code{NA} (this is only supported for \code{version} +classes with a \code{next_after} implementation); \code{"lvcf"} will fill in missing +version history with the last version of each observation carried forward 
+(LVCF), by leaving the update \verb{$DT} alone (other \code{epi_archive} methods are
+based on LVCF). Default is \code{"na"}.}
+}
+\value{
+An \code{epi_archive}
+}
+\description{
+Sometimes, due to upstream data pipeline issues, we have to work with a
+version history that isn't completely up to date, but with functions that
+expect archives that are completely up to date, or equally as up-to-date as
+another archive. This function provides one way to approach such mismatches:
+pretend that we've "observed" additional versions, filling in these versions
+with NAs or extrapolated values.
+}
+\details{
+\code{epix_fill_through_version} will not mutate its \code{x} argument, but its result
+might alias fields of \code{x} (e.g., mutating the result's \code{DT} might mutate
+\code{x$DT}). The R6 method variant, \code{x$fill_through_version}, will mutate \code{x} to
+give the result, but might reseat its fields (e.g., references to the old
+\code{x$DT} might not be updated by this function or subsequent operations on
+\code{x}), and returns the updated \code{x} \link[base:invisible]{invisibly}.
+}
diff --git a/man/epix_merge.Rd b/man/epix_merge.Rd
index 2df640e0..bef4cccf 100644
--- a/man/epix_merge.Rd
+++ b/man/epix_merge.Rd
@@ -4,55 +4,63 @@
 \alias{epix_merge}
 \title{Merge two \code{epi_archive} objects}
 \usage{
-epix_merge(x, y, ..., locf = TRUE, nan = NA)
+epix_merge(
+  x,
+  y,
+  observed_versions_end_conflict = c("stop", "na", "lvcf", "truncate"),
+  compactify = TRUE
+)
 }
 \arguments{
-\item{x, y}{Two \code{epi_archive} objects to join together, or more specifically,
-whose underlying data tables are to be joined together. The data table in
-\code{x} will be overwritten with the joined data table. 
For convenience, we
-also allow \code{y} to be passed in directly as a \code{data.table} (need not be an
-\code{epi_archive} object).}
+\item{x, y}{Two \code{epi_archive} objects to join together.}
 
-\item{...}{Named arguments to pass to \code{data.table::merge.data.table()}, which
-is used for the join (with all default settings as in this function). For
-example, passing \code{all = TRUE} will perform a full join.}
+\item{observed_versions_end_conflict}{Optional; \code{"stop"}, \code{"na"}, \code{"lvcf"},
+or \code{"truncate"}; in the case that \code{x$observed_versions_end} doesn't match
+\code{y$observed_versions_end}, what do we do?: \code{"stop"}: emit an error; \code{"na"}:
+use \code{max(x$observed_versions_end, y$observed_versions_end)}, but in the
+less up-to-date input archive, imagine there was an update immediately
+after its last observed version which revised all observations to be \code{NA};
+\code{"lvcf"}: use \code{max(x$observed_versions_end, y$observed_versions_end)}, and
+last-version-carried-forward extrapolation to invent update data for the
+less up-to-date input archive; or \code{"truncate"}: use
+\code{min(x$observed_versions_end, y$observed_versions_end)} and discard any
+rows containing update rows for later versions.}
 
-\item{locf}{Should LOCF be used after joining on all non-key columns? This
-will take the latest version of each signal value and propogate it forward
-to fill in gaps that appear after merging. Default is \code{TRUE}.}
-
-\item{nan}{Should \code{NaN} values be treated as \code{NA} values in the post-filling
-step? Default is \code{NA}, which means that they are treated as \code{NA} values; if}
+\item{compactify}{Optional; \code{TRUE}, \code{FALSE}, or \code{NULL}; should the result be
+compactified? See \code{\link{as_epi_archive}} for an explanation of what this means.
+Default here is \code{TRUE}.}
 }
 \value{
-Nothing; the data table in \code{x} is overwritten with the merged one. 
+the resulting \code{epi_archive} } \description{ -Merges the underlying data tables in two \code{epi_archive} objects, allows for -post-filling of \code{NA} values by last observation carried forward (LOCF), and -\strong{overwrites} the first data table with the merged one. See the \href{https://cmu-delphi.github.io/epiprocess/articles/archive.html}{archive vignette} for -examples. +Merges two \code{epi_archive}s that share a common \code{geo_value}, \code{time_value}, and +set of key columns. When they also share a common \code{observed_versions_end}, +using \verb{$as_of} on the result should be the same as using \verb{$as_of} on \code{x} and +\code{y} individually, then performing a full join of the \code{DT}s on the non-version +key columns (potentially consolidating multiple warnings about clobberable +versions). If the \code{observed_versions_end} values differ, the +\code{observed_versions_end_conflict} parameter controls what is done. } \details{ -This is simply a wrapper around the \code{merge()} method of the -\code{epi_archive} class, so if \code{x} and \code{y} are an \code{epi_archive} objects, then: - -\if{html}{\out{
}}\preformatted{epix_merge(x, y) -}\if{html}{\out{
}} +This function, \code{\link{epix_merge}}, does not mutate its inputs and will not alias +either archive's \code{DT}, but may alias other fields; \code{x$merge} will overwrite +\code{x} with the result of the merge, reseating its fields. -is equivalent to: - -\if{html}{\out{
}}\preformatted{x$merge(y) -}\if{html}{\out{
}} +In all cases, \code{additional_metadata} will be an empty list, and +\code{clobberable_versions_start} will be set to the earliest version that could +be clobbered in either input archive. The result's \code{DT} will always be +compactified. } \examples{ # create two example epi_archive datasets -x <- archive_cases_dv_subset$DT \%>\% - dplyr::select(geo_value,time_value,version,case_rate_7d_av) \%>\% +x <- archive_cases_dv_subset$DT \%>\% + dplyr::select(geo_value,time_value,version,case_rate_7d_av) \%>\% as_epi_archive(compactify=TRUE) -y <- archive_cases_dv_subset$DT \%>\% - dplyr::select(geo_value,time_value,version,percent_cli) \%>\% +y <- archive_cases_dv_subset$DT \%>\% + dplyr::select(geo_value,time_value,version,percent_cli) \%>\% as_epi_archive(compactify=TRUE) # a full join stored in x -epix_merge(x, y, all = TRUE) +epix_merge(x, y) + } diff --git a/man/next_after.Rd b/man/next_after.Rd new file mode 100644 index 00000000..5170e8d9 --- /dev/null +++ b/man/next_after.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/archive.R +\name{next_after} +\alias{next_after} +\title{Get the next possible value greater than \code{x} of the same type} +\usage{ +next_after(x) +} +\arguments{ +\item{x}{the starting "value"(s)} +} +\value{ +same class, typeof, and length as \code{x} +} +\description{ +Get the next possible value greater than \code{x} of the same type +} diff --git a/tests/testthat/test-epix_fill_through_version.R b/tests/testthat/test-epix_fill_through_version.R new file mode 100644 index 00000000..265ebc08 --- /dev/null +++ b/tests/testthat/test-epix_fill_through_version.R @@ -0,0 +1,80 @@ + +test_that("epix_fill_through_version mirrors input when it is sufficiently up to date", { + ea_orig = as_epi_archive(data.table::data.table(geo_value = "g1", time_value = as.Date("2020-01-01"), + version = 1:5, value = 1:5)) + some_earlier_observed_version = 2L + ea_trivial_fill_na1 = 
epix_fill_through_version(ea_orig, some_earlier_observed_version, "na") + ea_trivial_fill_na2 = epix_fill_through_version(ea_orig, ea_orig$observed_versions_end, "na") + ea_trivial_fill_lvcf = epix_fill_through_version(ea_orig, some_earlier_observed_version, "lvcf") + # Below, we want R6 objects to be compared based on contents rather than + # addresses. We appear to get this with `expect_identical` in `testthat` + # edition 3, which is based on `waldo::compare` rather than `base::identical`; + # `waldo::compare` in waldo >=0.3.1 appears (as of 0.4.0) to compare R6 + # objects by contents rather than address (in a way that is tested but maybe + # not guaranteed via user docs). Use `local_edition` to ensure we use edition + # 3 here. + local_edition(3) + expect_identical(ea_orig, ea_trivial_fill_na1) + expect_identical(ea_orig, ea_trivial_fill_na2) + expect_identical(ea_orig, ea_trivial_fill_lvcf) +}) + +test_that("epix_fill_through_version can extend observed versions, gives expected `as_of`s", { + ea_orig = as_epi_archive(data.table::data.table( + geo_value = "g1", + time_value = as.Date("2020-01-01") + c(rep(0L,5L), 1L), + version = c(1:5, 2L), + value = 1:6)) + first_unobserved_version = 6L + later_unobserved_version = 10L + ea_fill_na = epix_fill_through_version(ea_orig, later_unobserved_version, "na") + ea_fill_lvcf = epix_fill_through_version(ea_orig, later_unobserved_version, "lvcf") + + # We use edition 3 features here, passing `ignore_attr` to `waldo::compare`. 
+ # Ensure we are using edition 3: + local_edition(3) + withCallingHandlers({ + expect_identical(ea_fill_na$observed_versions_end, later_unobserved_version) + expect_identical(tibble::as_tibble(ea_fill_na$as_of(first_unobserved_version)), + tibble::tibble(geo_value="g1", time_value=as.Date("2020-01-01")+0:1, value=rep(NA_integer_, 2L)), + ignore_attr = TRUE) + expect_identical(ea_fill_lvcf$observed_versions_end, later_unobserved_version) + expect_identical(ea_fill_lvcf$as_of(first_unobserved_version), + ea_fill_lvcf$as_of(ea_orig$observed_versions_end) %>% + {attr(., "metadata")$as_of <- first_unobserved_version; .}) + }, epiprocess__snapshot_as_of_clobberable_version = function(wrn) invokeRestart("muffleWarning")) +}) + +test_that("epix_fill_through_version does not mutate x", { + ea_orig = as_epi_archive(data.table::data.table(geo_value = "g1", time_value = as.Date("2020-01-01"), + version = 1:5, value = 1:5)) + # We want to perform a strict comparison of the contents of `ea_orig` before + # and `ea_orig` after. `clone` + `expect_identical` based on waldo would sort + # of work, but we might want something stricter. `as.list` + `identical` seems + # to do the trick + ea_orig_before_as_list = as.list(ea_orig) + some_unobserved_version = 8L + ea_fill_na = epix_fill_through_version(ea_orig, some_unobserved_version, "na") + ea_orig_after_as_list = as.list(ea_orig) + # use identical, not expect_identical; latter isn't as strict + expect_true(identical(ea_orig_before_as_list, ea_orig_after_as_list)) +}) + +test_that("x$fill_through_version mutates x (if needed)", { + ea = as_epi_archive(data.table::data.table(geo_value = "g1", time_value = as.Date("2020-01-01"), + version = 1:5, value = 1:5)) + # We want the contents to change in a substantial way that makes waldo compare + # different (if the contents need to change). 
+ ea_before_copies_as_list = lapply(ea, data.table::copy) + some_unobserved_version = 8L + ea$fill_through_version(some_unobserved_version, "na") + ea_after_copies_as_list = lapply(ea, data.table::copy) + expect_failure(expect_identical(ea_before_copies_as_list, ea_after_copies_as_list)) +}) + +test_that("{epix_,$}fill_through_version return with expected visibility", { + ea = as_epi_archive(data.table::data.table(geo_value = "g1", time_value = as.Date("2020-01-01"), + version = 1:5, value = 1:5)) + expect_true(withVisible(epix_fill_through_version(ea, 10L, "na"))[["visible"]]) + expect_false(withVisible(ea$fill_through_version(15L, "na"))[["visible"]]) +}) diff --git a/tests/testthat/test-epix_merge.R b/tests/testthat/test-epix_merge.R new file mode 100644 index 00000000..f6e2fc14 --- /dev/null +++ b/tests/testthat/test-epix_merge.R @@ -0,0 +1,56 @@ + +test_that("epix_merge merges and carries forward updates properly", { + x = as_epi_archive( + data.table::as.data.table( + tibble::tribble(~geo_value, ~time_value, ~version, ~x_value, + # same version set for x and y + "g1", 1L, 1:3, paste0("XA", 1:3), + # versions of x surround those of y + this measurement has + # max update version beyond some others + "g1", 2L, 1:5, paste0("XB", 1:5), + # mirror case + "g1", 3L, 2L, paste0("XC", 2L), + # x has 1 version, y has 0 + "g1", 4L, 1L, paste0("XD", 1L), + # values that should be LVCF'd + NAs that should be LVCF's as NA + "g1", 6L, c(1L,3L,5L), paste0("XE", c(1L, NA, 5L)) + ) %>% + tidyr::unchop(c(version, x_value)) %>% + dplyr::mutate(dplyr::across(c(x_value), ~ dplyr::if_else(grepl("NA", .x), NA_character_, .x))) + ) + ) + y = as_epi_archive( + data.table::as.data.table( + tibble::tribble(~geo_value, ~time_value, ~version, ~y_value, + "g1", 1L, 1:3, paste0("YA", 1:3), + "g1", 2L, 2L, paste0("YB", 2L), + "g1", 3L, 1:5, paste0("YC", 1:5), + "g1", 5L, 1L, paste0("YD", 1L), + "g1", 6L, 1:5, paste0("YE", 1:5), + ) %>% + tidyr::unchop(c(version, y_value)) %>% + 
dplyr::mutate(dplyr::across(c(y_value), ~ dplyr::if_else(grepl("NA", .x), NA_character_, .x))) + ) + ) + xy = epix_merge(x, y) + xy_expected = as_epi_archive( + data.table::as.data.table( + tibble::tribble(~geo_value, ~time_value, ~version, ~x_value, ~y_value, + "g1", 1L, 1:3, paste0("XA", 1:3), paste0("YA", 1:3), + "g1", 2L, 1:5, paste0("XB", 1:5), paste0("YB", c(NA,2L,2L,2L,2L)), + "g1", 3L, 1:5, paste0("XC", c(NA,2L,2L,2L,2L)), paste0("YC", 1:5), + "g1", 4L, 1L, paste0("XD", 1L), paste0("YD", NA), + "g1", 5L, 1L, paste0("XD", NA), paste0("YD", 1L), + "g1", 6L, 1:5, paste0("XE", c(1L,1L,NA,NA,5L)), paste0("YE", 1:5), + ) %>% + tidyr::unchop(c(version, x_value, y_value)) %>% + dplyr::mutate(dplyr::across(c(x_value, y_value), ~ dplyr::if_else(grepl("NA", .x), NA_character_, .x))) + ) + ) + # We rely on testthat edition 3 expect_identical using waldo, not identical. See + # test-epix_fill_through_version.R comments for details. + local_edition(3) + expect_identical(xy, xy_expected) +}) + +# TODO test other behaviors From 35956073889d53c125d3cc45c44a6cb4296a2ef8 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Thu, 21 Jul 2022 18:53:05 -0700 Subject: [PATCH 73/96] Fix "always compactifies" epix_merge doc, be precise about mutation in docs --- R/archive.R | 10 ++++++---- R/methods-epi_archive.R | 8 +++++--- man/epi_archive.Rd | 10 ++++++---- man/epix_merge.Rd | 8 +++++--- 4 files changed, 22 insertions(+), 14 deletions(-) diff --git a/R/archive.R b/R/archive.R index 3f8bd86b..fa60ddf6 100644 --- a/R/archive.R +++ b/R/archive.R @@ -557,10 +557,12 @@ epi_archive = return (invisible(self)) }, ##### -#' @description Merges another `epi_archive` with the current one, overwriting -#' fields of `self`, reseating its fields, and returning invisibly. See -#' [`epix_merge`] for a full description of the non-R6-method version, which -#' does not overwrite, and does not alias either archive's `DT`. 
+#' @description Merges another `epi_archive` with the current one, mutating the
+#' current one by reseating its `DT` and several other fields, but avoiding
+#' mutation of the old `DT`; returns the current archive
+#' \link[base:invisible]{invisibly}. See [`epix_merge`] for a full description
+#' of the non-R6-method version, which does not overwrite, and does not alias
+#' either archive's `DT`.
 #' @param y as in [`epix_merge`]
 #' @param observed_versions_end_conflict as in [`epix_merge`]
 #' @param compactify as in [`epix_merge`]
diff --git a/R/methods-epi_archive.R b/R/methods-epi_archive.R
index a89cbf96..f72a3f9a 100644
--- a/R/methods-epi_archive.R
+++ b/R/methods-epi_archive.R
@@ -109,7 +109,10 @@ epix_fill_through_version = function(x, fill_versions_end,
 #'
 #' This function, [`epix_merge`], does not mutate its inputs and will not alias
 #' either archive's `DT`, but may alias other fields; `x$merge` will overwrite
-#' `x` with the result of the merge, reseating its fields.
+#' `x` with the result of the merge, reseating its `DT` and several other fields
+#' (making them point to different objects), but avoiding mutation of the
+#' contents of the old `DT` (only relevant if you have another reference to the
+#' old `DT` in another object).
 #'
 #' @param x,y Two `epi_archive` objects to join together.
 #' @param observed_versions_end_conflict Optional; `"stop"`, `"na"`, `"lvcf"`,
@@ -130,8 +133,7 @@ epix_fill_through_version = function(x, fill_versions_end,
 #'
 #' @details In all cases, `additional_metadata` will be an empty list, and
 #' `clobberable_versions_start` will be set to the earliest version that could
-#' be clobbered in either input archive. The result's `DT` will always be
-#' compactified.
+#' be clobbered in either input archive. 
#'
 #' @examples
 #' # create two example epi_archive datasets
diff --git a/man/epi_archive.Rd b/man/epi_archive.Rd
index d67bdbb3..cb1ac264 100644
--- a/man/epi_archive.Rd
+++ b/man/epi_archive.Rd
@@ -258,10 +258,12 @@ version, which doesn't mutate the input archive but might alias its fields.
 \if{html}{\out{}}
 \if{latex}{\out{\hypertarget{method-epi_archive-merge}{}}}
 \subsection{Method \code{merge()}}{
-Merges another \code{epi_archive} with the current one, overwriting
-fields of \code{self}, reseating its fields, and returning invisibly. See
-\code{\link{epix_merge}} for a full description of the non-R6-method version, which
-does not overwrite, and does not alias either archive's \code{DT}.
+Merges another \code{epi_archive} with the current one, mutating the
+current one by reseating its \code{DT} and several other fields, but avoiding
+mutation of the old \code{DT}; returns the current archive
+\link[base:invisible]{invisibly}. See \code{\link{epix_merge}} for a full description
+of the non-R6-method version, which does not overwrite, and does not alias
+either archive's \code{DT}.
 \subsection{Usage}{
 \if{html}{\out{
}}\preformatted{epi_archive$merge( y, diff --git a/man/epix_merge.Rd b/man/epix_merge.Rd index bef4cccf..f24f3d45 100644 --- a/man/epix_merge.Rd +++ b/man/epix_merge.Rd @@ -45,12 +45,14 @@ versions). If the \code{observed_versions_end} values differ, the \details{ This function, \code{\link{epix_merge}}, does not mutate its inputs and will not alias either archive's \code{DT}, but may alias other fields; \code{x$merge} will overwrite -\code{x} with the result of the merge, reseating its fields. +\code{x} with the result of the merge, reseating its \code{DT} and several other fields +(making them point to different objects), but avoiding mutation of the +contents of the old \code{DT} (only relevant if you have another reference to the +old \code{DT} in another object). In all cases, \code{additional_metadata} will be an empty list, and \code{clobberable_versions_start} will be set to the earliest version that could -be clobbered in either input archive. The result's \code{DT} will always be -compactified. +be clobbered in either input archive. } \examples{ # create two example epi_archive datasets From 22aa41718a0a9e0bd530322b7d82dfafcb9442b8 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Thu, 21 Jul 2022 19:58:29 -0700 Subject: [PATCH 74/96] Fix example archive to match old val Use LVCF, not NA, to fill in versions in the less up-to-date archive. This should get closer to matching the old data, but still needs to be checked. The old nafill would have filled in some things we wouldn't want, but this doesn't seem to be one of these scenarios, as git thinks the generated sysdata is the same. 
--- data-raw/archive_cases_dv_subset.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data-raw/archive_cases_dv_subset.R b/data-raw/archive_cases_dv_subset.R index 1af33e1d..becd9c57 100644 --- a/data-raw/archive_cases_dv_subset.R +++ b/data-raw/archive_cases_dv_subset.R @@ -32,7 +32,7 @@ case_rate_subset <- covidcast( as_epi_archive(compactify=FALSE) archive_cases_dv_subset = epix_merge(dv_subset, case_rate_subset, - observed_versions_end_conflict="na", + observed_versions_end_conflict="lvcf", compactify=FALSE) # If we directly store an epi_archive R6 object as data, it will store its class From 350ea447b01601644c67a6ee0a26bc100123262e Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Thu, 21 Jul 2022 20:17:11 -0700 Subject: [PATCH 75/96] Change "LOCF" to "LVCF" where it's referring to last version carried forward --- R/archive.R | 60 +++++++++++++++----------- R/methods-epi_archive.R | 6 +-- README.md | 8 +++- man/as_epi_archive.Rd | 6 +-- man/epi_archive.Rd | 32 +++++++++----- man/epix_merge.Rd | 2 +- tests/testthat/test-compactify.R | 26 ++++++------ vignettes/archive.Rmd | 73 +++++++++++++++++--------------- vignettes/compactify.Rmd | 71 ++++++++++++++++--------------- 9 files changed, 157 insertions(+), 127 deletions(-) diff --git a/R/archive.R b/R/archive.R index fa60ddf6..46569003 100644 --- a/R/archive.R +++ b/R/archive.R @@ -128,12 +128,22 @@ next_after.Date = function(x) x + 1L #' key variables, and thus the key variables are critical for figuring out how #' to generate a snapshot of data from the archive, as of a given version. #' -#' In general, last observation carried forward (LOCF) is used to data in -#' between recorded versions. Currently, deletions must be represented as -#' revising a row to a special state (e.g., making the entries `NA` or -#' including a special column that flags the data as removed and performing -#' some kind of post-processing), and the archive is unaware of what this -#' state is. 
+#' In general, the last version of each observation is carried forward (LVCF) to +#' fill in data between recorded versions, and between the last recorded +#' update and the `observed_versions_end`. One consequence is that the `DT` +#' doesn't have to contain a full snapshot of every version (although this +#' generally works), but can instead contain only the rows that are new or +#' changed from the previous version (see `compactify`, which does this +#' automatically). Currently, deletions must be represented as revising a row +#' to a special state (e.g., making the entries `NA` or including a special +#' column that flags the data as removed and performing some kind of +#' post-processing), and the archive is unaware of what this state is. Note +#' that `NA`s *can* be introduced by `epi_archive` methods for other reasons, +#' e.g., in [`epix_fill_through_version`] and [`epix_merge`], if requested, to +#' represent potential update data that we do not yet have access to; or in +#' [`epix_merge`] to represent the "value" of an observation before the +#' version in which it was first released, or if no version of that +#' observation appears in the archive data at all. #' #' **A word of caution:** R6 objects, unlike most other objects in R, have #' reference semantics. A primary consequence of this is that objects are not @@ -219,9 +229,9 @@ epi_archive = #' fields; named entries from the passed list or will be included as well. #' @param compactify Optional; Boolean or `NULL`: should we remove rows that are #' considered redundant for the purposes of `epi_archive`'s built-in methods -#' such as `as_of`? As these methods use the last (version of an) observation -#' carried forward (LOCF) to interpolate between the version data provided, -#' rows that don't change these LOCF results can potentially be omitted to +#' such as `as_of`? 
As these methods use the last version of each observation +#' carried forward (LVCF) to interpolate between the version data provided, +#' rows that don't change these LVCF results can potentially be omitted to #' save space while maintaining the same behavior (with the help of the #' `clobberable_versions_start` and `observed_versions_end` fields in some #' edge cases). `TRUE` will remove these rows, `FALSE` will not, and missing @@ -356,30 +366,30 @@ epi_archive = DT = as.data.table(x, key = key_vars) if (!identical(key_vars, key(DT))) setkeyv(DT, cols = key_vars) - # Checks to see if a value in a vector is LOCF - is_locf <- function(vec) { + # Checks to see if a value in a vector is LVCF + is_lvcf <- function(vec) { dplyr::if_else(!is.na(vec) & !is.na(dplyr::lag(vec)), vec == dplyr::lag(vec), is.na(vec) & is.na(dplyr::lag(vec))) } - # LOCF is defined by a row where all values except for the version + # LVCF is defined by a row where all values except for the version # differ from their respective lag values - # Checks for LOCF's in a data frame - rm_locf <- function(df) { - dplyr::filter(df,if_any(c(everything(),-version),~ !is_locf(.))) + # Checks for LVCF's in a data frame + rm_lvcf <- function(df) { + dplyr::filter(df,if_any(c(everything(),-version),~ !is_lvcf(.))) } - # Keeps LOCF values, such as to be printed - keep_locf <- function(df) { - dplyr::filter(df,if_all(c(everything(),-version),~ is_locf(.))) + # Keeps LVCF values, such as to be printed + keep_lvcf <- function(df) { + dplyr::filter(df,if_all(c(everything(),-version),~ is_lvcf(.))) } # Runs compactify on data frame if (is.null(compactify) || compactify == TRUE) { - elim = keep_locf(DT) - DT = rm_locf(DT) + elim = keep_lvcf(DT) + DT = rm_lvcf(DT) } else { # Create empty data frame for nrow(elim) to be 0 elim = tibble::tibble() @@ -561,8 +571,8 @@ epi_archive = #' current one by reseating its `DT` and several other fields, but avoiding #' mutation of the old `DT`; returns the current archive #' 
\link{base:invisible}[invisibly]. See [`epix_merge`] for a full description -#' of the non-R6-method version, which does not overwrite, and does not alias -#' either archive's `DT`. +#' of the non-R6-method version, which does not mutate either archive, and +#' does not alias either archive's `DT`. #' @param y as in [`epix_merge`] #' @param observed_versions_end_conflict as in [`epix_merge`] #' @param compactify as in [`epix_merge`] @@ -762,9 +772,9 @@ epi_archive = #' fields; named entries from the passed list or will be included as well. #' @param compactify Optional; Boolean or `NULL`: should we remove rows that are #' considered redundant for the purposes of `epi_archive`'s built-in methods -#' such as `as_of`? As these methods use the last (version of an) observation -#' carried forward (LOCF) to interpolate between the version data provided, -#' rows that don't change these LOCF results can potentially be omitted to +#' such as `as_of`? As these methods use the last version of each observation +#' carried forward (LVCF) to interpolate between the version data provided, +#' rows that don't change these LVCF results can potentially be omitted to #' save space. `TRUE` will remove these rows, `FALSE` will not, and missing or #' `NULL` will remove these rows and issue a warning. 
Generally, this can be #' set to `TRUE`, but if you directly inspect or edit the fields of the diff --git a/R/methods-epi_archive.R b/R/methods-epi_archive.R index f72a3f9a..726ac4cd 100644 --- a/R/methods-epi_archive.R +++ b/R/methods-epi_archive.R @@ -121,7 +121,7 @@ epix_fill_through_version = function(x, fill_versions_end, #' use `max(x$observed_versions_end, y$observed_versions_end)`, but in the #' less up-to-date input archive, imagine there was an update immediately #' after its last observed version which revised all observations to be `NA`; -#' `"locf"`: use `max(x$observed_versions_end, y$observed_versions_end)`, and +#' `"lvcf"`: use `max(x$observed_versions_end, y$observed_versions_end)`, and #' last-version-carried-forward extrapolation to invent update data for the #' less up-to-date input archive; or `"truncate"`: use #' `min(x$observed_versions_end, y$observed_versions_end)` and discard any @@ -319,12 +319,12 @@ epix_merge = function(x, y, time_type = x$time_type, other_keys = setdiff(key(result_DT), c("geo_value","time_value","version")), additional_metadata = result_additional_metadata, - # it'd probably be better to pre-compactify before the merge, and might be + # It'd probably be better to pre-compactify before the merge, and might be # guaranteed not to be necessary to compactify the merge result if the # inputs are already compactified, but at time of writing we don't have # compactify in its own method or field, and it seems like it should be # pretty fast anyway. - compactify=compactify, + compactify = compactify, clobberable_versions_start = result_clobberable_versions_start, observed_versions_end = new_observed_versions_end )) diff --git a/README.md b/README.md index cdace7fc..3241cb0f 100644 --- a/README.md +++ b/README.md @@ -53,9 +53,15 @@ class. 
For example: - `epix_as_of()`, for generating a snapshot in `epi_df` from the data archive, which represents the most up-to-date values of the signal variables, as of the specified version; + +- `epix_fill_through_version()`, for filling in some fake version data following + simple rules, for use when downstream methods expect an archive that is more + up-to-date (e.g., if it is a forecasting deadline date and one of our data + sources cannot be accessed to provide the latest versions of its data) - `epix_merge()`, for merging two data archives with each other, with support - for filling in missing values via last observation carried forward (LOCF); + for various approaches to handling when one of the archives is more up-to-date + version-wise than the other; - `epix_slide()`, for sliding a custom computation to a data archive over local windows in time, much like `epi_slide` for an `epi_df` object, but with one diff --git a/man/as_epi_archive.Rd b/man/as_epi_archive.Rd index b9f30dcc..4b3d97ea 100644 --- a/man/as_epi_archive.Rd +++ b/man/as_epi_archive.Rd @@ -37,9 +37,9 @@ fields; named entries from the passed list or will be included as well.} \item{compactify}{Optional; Boolean or \code{NULL}: should we remove rows that are considered redundant for the purposes of \code{epi_archive}'s built-in methods -such as \code{as_of}? As these methods use the last (version of an) observation -carried forward (LOCF) to interpolate between the version data provided, -rows that don't change these LOCF results can potentially be omitted to +such as \code{as_of}? As these methods use the last version of each observation +carried forward (LVCF) to interpolate between the version data provided, +rows that don't change these LVCF results can potentially be omitted to save space. \code{TRUE} will remove these rows, \code{FALSE} will not, and missing or \code{NULL} will remove these rows and issue a warning. 
Generally, this can be set to \code{TRUE}, but if you directly inspect or edit the fields of the diff --git a/man/epi_archive.Rd b/man/epi_archive.Rd index cb1ac264..e0d8f1d3 100644 --- a/man/epi_archive.Rd +++ b/man/epi_archive.Rd @@ -32,12 +32,22 @@ Note that there can only be a single row per unique combination of key variables, and thus the key variables are critical for figuring out how to generate a snapshot of data from the archive, as of a given version. -In general, last observation carried forward (LOCF) is used to data in -between recorded versions. Currently, deletions must be represented as -revising a row to a special state (e.g., making the entries \code{NA} or -including a special column that flags the data as removed and performing -some kind of post-processing), and the archive is unaware of what this -state is. +In general, the last version of each observation is carried forward (LVCF) to +fill in data between recorded versions, and between the last recorded +update and the \code{observed_versions_end}. One consequence is that the \code{DT} +doesn't have to contain a full snapshot of every version (although this +generally works), but can instead contain only the rows that are new or +changed from the previous version (see \code{compactify}, which does this +automatically). Currently, deletions must be represented as revising a row +to a special state (e.g., making the entries \code{NA} or including a special +column that flags the data as removed and performing some kind of +post-processing), and the archive is unaware of what this state is. 
Note +that \code{NA}s \emph{can} be introduced by \code{epi_archive} methods for other reasons, +e.g., in \code{\link{epix_fill_through_version}} and \code{\link{epix_merge}}, if requested, to +represent potential update data that we do not yet have access to; or in +\code{\link{epix_merge}} to represent the "value" of an observation before the +version in which it was first released, or if no version of that +observation appears in the archive data at all. \strong{A word of caution:} R6 objects, unlike most other objects in R, have reference semantics. A primary consequence of this is that objects are not @@ -155,9 +165,9 @@ fields; named entries from the passed list or will be included as well.} \item{\code{compactify}}{Optional; Boolean or \code{NULL}: should we remove rows that are considered redundant for the purposes of \code{epi_archive}'s built-in methods -such as \code{as_of}? As these methods use the last (version of an) observation -carried forward (LOCF) to interpolate between the version data provided, -rows that don't change these LOCF results can potentially be omitted to +such as \code{as_of}? As these methods use the last version of each observation +carried forward (LVCF) to interpolate between the version data provided, +rows that don't change these LVCF results can potentially be omitted to save space while maintaining the same behavior (with the help of the \code{clobberable_versions_start} and \code{observed_versions_end} fields in some edge cases). \code{TRUE} will remove these rows, \code{FALSE} will not, and missing @@ -262,8 +272,8 @@ Merges another \code{epi_archive} with the current one, mutating the current one by reseating its \code{DT} and several other fields, but avoiding mutation of the old \code{DT}; returns the current archive \link{base:invisible}\link{invisibly}. See \code{\link{epix_merge}} for a full description -of the non-R6-method version, which does not overwrite, and does not alias -either archive's \code{DT}. 
+of the non-R6-method version, which does not mutate either archive, and +does not alias either archive's \code{DT}. \subsection{Usage}{ \if{html}{\out{
}}\preformatted{epi_archive$merge( y, diff --git a/man/epix_merge.Rd b/man/epix_merge.Rd index f24f3d45..866102ce 100644 --- a/man/epix_merge.Rd +++ b/man/epix_merge.Rd @@ -20,7 +20,7 @@ or \code{"truncate"}; in the case that \code{x$observed_versions_end} doesn't ma use \code{max(x$observed_versions_end, y$observed_versions_end)}, but in the less up-to-date input archive, imagine there was an update immediately after its last observed version which revised all observations to be \code{NA}; -\code{"locf"}: use \code{max(x$observed_versions_end, y$observed_versions_end)}, and +\code{"lvcf"}: use \code{max(x$observed_versions_end, y$observed_versions_end)}, and last-version-carried-forward extrapolation to invent update data for the less up-to-date input archive; or \code{"truncate"}: use \code{min(x$observed_versions_end, y$observed_versions_end)} and discard any diff --git a/tests/testthat/test-compactify.R b/tests/testthat/test-compactify.R index 96d709f8..e13455a1 100644 --- a/tests/testthat/test-compactify.R +++ b/tests/testthat/test-compactify.R @@ -23,7 +23,7 @@ row_replace <- function(dt,row,x,y) { } # Rows 1 should not be eliminated even if NA -dt <- row_replace(dt,1,NA,NA) # Not LOCF +dt <- row_replace(dt,1,NA,NA) # Not LVCF # NOTE! We are assuming that there are no NA's in geo_value, time_value, # and version. 
Even though compactify may erroneously remove the first row @@ -31,43 +31,43 @@ dt <- row_replace(dt,1,NA,NA) # Not LOCF # has problems beyond the scope of this test # Rows 11 and 12 correspond to different time_values -dt <- row_replace(dt,12,11,11) # Not LOCF +dt <- row_replace(dt,12,11,11) # Not LVCF # Rows 20 and 21 only differ in version -dt <- row_replace(dt,21,20,20) # LOCF +dt <- row_replace(dt,21,20,20) # LVCF # Rows 21 and 22 only differ in version -dt <- row_replace(dt,22,20,20) # LOCF +dt <- row_replace(dt,22,20,20) # LVCF # Row 39 comprises the first NA's -dt <- row_replace(dt,39,NA,NA) # Not LOCF +dt <- row_replace(dt,39,NA,NA) # Not LVCF # Row 40 has two NA's, just like its lag, row 39 -dt <- row_replace(dt,40,NA,NA) # LOCF +dt <- row_replace(dt,40,NA,NA) # LVCF # Row 62's values already exist in row 15, but row 15 is not a preceding row -dt <- row_replace(dt,62,15,15) # Not LOCF +dt <- row_replace(dt,62,15,15) # Not LVCF # Row 73 only has one value carried over -dt <- row_replace(dt,74,73,74) # Not LOCF +dt <- row_replace(dt,74,73,74) # Not LVCF dt_true <- as_tibble(as_epi_archive(dt,compactify=TRUE)$DT) dt_false <- as_tibble(as_epi_archive(dt,compactify=FALSE)$DT) dt_null <- suppressWarnings(as_tibble(as_epi_archive(dt,compactify=NULL)$DT)) -test_that("Warning for LOCF with compactify as NULL", { +test_that("Warning for LVCF with compactify as NULL", { expect_warning(as_epi_archive(dt,compactify=NULL)) }) -test_that("No warning when there is no LOCF", { +test_that("No warning when there is no LVCF", { expect_warning(as_epi_archive(dt[1:5],compactify=NULL),NA) }) -test_that("LOCF values are ignored with compactify=FALSE", { +test_that("LVCF values are ignored with compactify=FALSE", { expect_identical(nrow(dt),nrow(dt_false)) }) -test_that("LOCF values are taken out with compactify=TRUE", { +test_that("LVCF values are taken out with compactify=TRUE", { dt_test <- as_tibble(as_epi_archive(dt[-c(21,22,40),],compactify=FALSE)$DT) 
expect_identical(dt_true,dt_null) @@ -78,7 +78,7 @@ test_that("as_of produces the same results with compactify=TRUE as with compacti ea_true <- as_epi_archive(dt,compactify=TRUE) ea_false <- as_epi_archive(dt,compactify=FALSE) - # Row 22, an LOCF row corresponding to the latest version, is omitted in + # Row 22, an LVCF row corresponding to the latest version, is omitted in # ea_true latest_version = max(ea_false$DT$version) expect_warning({ diff --git a/vignettes/archive.Rmd b/vignettes/archive.Rmd index 99d3d676..f497bc63 100644 --- a/vignettes/archive.Rmd +++ b/vignettes/archive.Rmd @@ -63,17 +63,17 @@ has (at least) the following columns: `time_value` is January 14, 2022, then this row contains the measurements of the data for January 14, 2022 that were available one day later. -As we can see from the above, the data frame returned by -`delphi.epidata::covidcast()` has the columns required for the `epi_archive` -format, with `issue` playing the role of `version`. We can now use +As we can see from the above, the data frame returned by +`delphi.epidata::covidcast()` has the columns required for the `epi_archive` +format, with `issue` playing the role of `version`. We can now use `as_epi_archive()` to bring it into `epi_archive` format. For removal of -LOCF values using `as_epi_archive` using compactify, please refer to the -compactify vignette. +redundant version updates in `as_epi_archive` using compactify, please refer +to the compactify vignette. ```{r, eval=FALSE} x <- dv %>% select(geo_value, time_value, version = issue, percent_cli = value) %>% - as_epi_archive() + as_epi_archive(compactify=TRUE) class(x) print(x) @@ -82,7 +82,7 @@ print(x) ```{r, echo=FALSE, message=FALSE, warning=FALSE} x <- archive_cases_dv_subset$DT %>% select(geo_value, time_value, version , percent_cli) %>% - as_epi_archive() + as_epi_archive(compactify=TRUE) class(x) print(x) @@ -108,10 +108,10 @@ snapshot of data from the archive, as of a given version (also described below). 
key(x$DT) ``` -In general, last observation carried forward (LOCF) is used to data in between -recorded versions. **A word of caution:** R6 objects, unlike most other objects -in R, have reference semantics. An important consequence of this is that objects -are not copied when modified. +In general, the last version of each observation is carried forward (LVCF) to +fill in data between recorded versions. **A word of caution:** R6 objects, +unlike most other objects in R, have reference semantics. An important +consequence of this is that objects are not copied when modified. ```{r} original_value <- x$DT$percent_cli[1] @@ -173,7 +173,7 @@ date was June 1, 2021. From this we can infer that the doctor's visits signal was 2 days latent on June 1. Also, we can see that the metadata in the `epi_df` object has the version date recorded in the `as_of` field. -Using the maximum of the `version` column in the underlying data table in an +By default, using the maximum of the `version` column in the underlying data table in an `epi_archive` object itself generates a snapshot of the latest values of signal variables in the entire archive. The `epix_as_of()` function issues a warning in this case, since updates to the current version may still come in at a later @@ -226,25 +226,27 @@ quite as dramatically. Modeling the revision process, which is often called ## Merging `epi_archive` objects -Now we demonstrate how to merge the underlying data tables in two `epi_archive` -objects together. The `epi_archive` class provides a method `merge()` precisely -for this purpose. The wrapper function is called `epix_merge()`; as before, this -is just offered as a matter of convenience/familiarity for some users. 
Below we
-merge the working `epi_archive` of versioned percentage CLI from outpatient
-visits to another one of versioned COVID-19 case reporting data, which we fetch
-the from the [COVIDcast
+Now we demonstrate how to merge two `epi_archive` objects together, e.g., so
+that grabbing data from multiple sources as of a particular version can be
+performed with a single `as_of` call. The `epi_archive` class provides a method
+`merge()` precisely for this purpose. The wrapper function is called
+`epix_merge()`; this wrapper avoids mutating its inputs, while `x$merge` will
+mutate `x`. Below we merge the working `epi_archive` of versioned percentage CLI
+from outpatient visits to another one of versioned COVID-19 case reporting data,
+which we fetch from the [COVIDcast
 API](https://cmu-delphi.github.io/delphi-epidata/api/covidcast.html/), on the
 rate scale (counts per 100,000 people in the population).
 
-When merging archives, we typically want to perform a *full join*, otherwise we
-will be throwing out versioned data from one table or the other. This is
-accomplished by setting `all = TRUE` in the call to `epix_merge()`. Furthermore,
-this function provides an option for filling `NA` values via LOCF by setting
-`locf = TRUE`. In general, unless two data tables have the exact same pattern of
-updates, we will get `NA` values in the signals after performing a full join.
-Because the original data archives are stored in LOCF (last observation carried
-forward) format in the first place, it generally makes sense to perform `NA`
-filling after merging using LOCF. Therefore `locf = TRUE` is the default.
+When merging archives, unless the archives have identical data release patterns,
+`NA`s can be introduced in the non-key variables for a few reasons:
+- to represent the "value" of an observation before its initial release (when we
+  need to pair it with additional observations from the other archive that have
+  been released)
+- to represent the "value" of an observation that has no recorded versions at
+  all (in the same sort of situation)
+- if requested via `observed_versions_end_conflict="na"`, to represent potential
+  update data that we do not yet have access to (e.g., due to one of the
+  archives being out of sync).
 
 ```{r, message = FALSE, warning = FALSE,eval=FALSE}
 y <- covidcast(
@@ -252,15 +254,15 @@ y <- covidcast(
   signals = "confirmed_7dav_incidence_prop",
   time_type = "day",
   geo_type = "state",
-  time_value = epirange(20200601, 20211201),
+  time_values = epirange(20200601, 20211201),
   geo_values = "ca,fl,ny,tx",
   issues = epirange(20200601, 20211201)
 ) %>%
   fetch_tbl() %>%
   select(geo_value, time_value, version = issue, case_rate_7d_av = value) %>%
-  as_epi_archive()
+  as_epi_archive(compactify=TRUE)
 
-epix_merge(x, y, all = TRUE)
+x$merge(y)
 print(x)
 head(x$DT)
 ```
@@ -271,9 +273,10 @@ print(x)
 head(x$DT)
 ```
 
-Importantly, as we can see, the way `epix_merge()` works is that it
-**overwrites** the data table in the first `epi_archive` object `x` by the
-merged data table.
+Importantly, see that `x$merge` mutated `x` to hold the result of the merge. We
+could also have used `xy = epix_merge(x,y)` to avoid mutating `x`. See the
+documentation for either for more detailed descriptions of what mutation,
+pointer aliasing, and pointer reseating is possible.
## Sliding version-aware computations @@ -419,7 +422,7 @@ ggplot(fc, aes(x = target_date, group = time_value, fill = as_of)) + theme(legend.position = "none") ``` -Each row displays the forecasts for a different location (CA and FL), and each +Each row displays the forecasts for a different location (CA, FL, NY, and TX), and each column corresponds to whether properly-versioned data is used (`FALSE` means no, and `TRUE` means yes). We can see that the properly-versioned forecaster is, at some points in time, more problematic; for example, it massively overpredicts diff --git a/vignettes/compactify.Rmd b/vignettes/compactify.Rmd index 3cedf332..70aaf632 100644 --- a/vignettes/compactify.Rmd +++ b/vignettes/compactify.Rmd @@ -1,32 +1,31 @@ --- -title: Compactify to remove LOCF values +title: Compactify to remove redundant archive data output: rmarkdown::html_vignette vignette: > - %\VignetteIndexEntry{Compactify to remove LOCF values} + %\VignetteIndexEntry{Compactify to remove redundant archive data} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- -## Removing LOCF data to save space +## Removing redundant update data to save space + +We do not need to store version update rows that look like the last version +carried forward (LVCF) for use with `epiprocess`'s' `epi_archive`-related +functions, as they all apply LVCF to fill in data between explicit updates. By +default, we even detect and remove these LVCF-redundant rows to save space; this +should not impact results as long as you do not directly work with the archive's +`DT` field in a way that expects these rows to remain. -We need not even store rows that look like the last observation carried forward, -as they use up extra space. 
Furthermore, we already apply LOCF in the -epi_archive-related functions, such that the results should be the same with or -without the rows, aas long as use code does not rely on directly modifying -epi_archive's fields in a way that expects all the original records and breaks -if they are trimmed down. - There are three different values that can be assigned to `compactify`: -* No argument: Does not put in LOCF values, and prints the first six LOCF values -that have been omitted but could have been placed. -* `TRUE`: Does not put in LOCF values, but doesn't print anything relating -to which values have been omitted. -* `FALSE`: Includes LOCF values. +* No argument: if there are LVCF-redundant rows, removes them and issues a + warning with some information about what rows were removed +* `TRUE`: removes any LVCF-redundant rows without any warning or other feedback +* `FALSE`: keeps any LVCF-redundant rows without any warning or other feedback -For this example, we have one chart using LOCF values, while another doesn't -use them to illustrate LOCF. Notice how the head of the first dataset differs +For this example, we have one chart using LVCF values, while another doesn't +use them to illustrate LVCF. Notice how the head of the first dataset differs from the second from the third value included. ```{r} @@ -34,35 +33,37 @@ library(dplyr) dt <- archive_cases_dv_subset$DT -locf_omitted <- as_epi_archive(dt) -locf_included <- as_epi_archive(dt,compactify = FALSE) +lvcf_omitted <- as_epi_archive(dt) +lvcf_included <- as_epi_archive(dt,compactify = FALSE) -head(locf_omitted$DT) -head(locf_included$DT) +head(lvcf_omitted$DT) +head(lvcf_included$DT) ``` -LOCF can mar the performance of dataset operations. As the column -`case_rate_7d_av` has many more LOCF values than `percent_cli`, we will omit the +LVCF-redundant values can mar the performance of dataset operations. 
As the column +`case_rate_7d_av` has many more LVCF-redundant values than `percent_cli`, we will omit the `percent_cli` column for comparing performance. ```{r} dt2 <- select(dt,-percent_cli) -locf_included_2 <- as_epi_archive(dt2,compactify=FALSE) -locf_omitted_2 <- as_epi_archive(dt2,compactify=TRUE) +lvcf_included_2 <- as_epi_archive(dt2,compactify=FALSE) +lvcf_omitted_2 <- as_epi_archive(dt2,compactify=TRUE) ``` -We can see how large each dataset is to better understand why LOCF uses up -space that may slow down performance. +In this example, a huge proportion of the original version update data were +LVCF-redundant, and compactifying saves a large amount of space. The proportion +of data that is LVCF-redundant can vary widely between data sets, so we won't +always be this lucky. ```{r} -nrow(locf_included_2$DT) -nrow(locf_omitted_2$DT) +nrow(lvcf_included_2$DT) +nrow(lvcf_omitted_2$DT) ``` -As we can see, performing 200 iterations of `dplyr::filter` is faster when the -LOCF values are omitted. +As we would expect, performing 200 iterations of `dplyr::filter` is faster when +the LVCF values are omitted. 
```{r} # Performance of filtering @@ -77,8 +78,8 @@ elapsed_time <- function(fx) c(system.time(fx))[[3]] speed_test <- function(f,name) { data.frame( operation = name, - locf=elapsed_time(f(locf_included_2)), - no_locf=elapsed_time(f(locf_omitted_2)) + lvcf = elapsed_time(f(lvcf_included_2)), + no_lvcf = elapsed_time(f(lvcf_omitted_2)) ) } @@ -108,10 +109,10 @@ speeds <- rbind(speeds, speed_test(slide_median,"slide_median")) Here is a detailed performance comparison: ```{r} -speeds_tidy <- gather(speeds,key="is_locf",value="time_in_s",locf,no_locf) +speeds_tidy <- tidyr::gather(speeds,key="is_lvcf",value="time_in_s",lvcf,no_lvcf) library(ggplot2) ggplot(speeds_tidy) + - geom_bar(aes(x=is_locf,y=time_in_s,fill=operation),stat = "identity") + geom_bar(aes(x=is_lvcf,y=time_in_s,fill=operation),stat = "identity") ``` From bb8bebeda68e85984009d2881a6ff2fdb67150e3 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Sun, 24 Jul 2022 13:42:59 -0700 Subject: [PATCH 76/96] Fix epix_fill_through_version bug from unique-aliasing edge case --- NAMESPACE | 2 + R/archive.R | 14 +++++- .../testthat/test-epix_fill_through_version.R | 50 ++++++++++++++----- tests/testthat/test-methods-epi_archive.R | 5 +- 4 files changed, 56 insertions(+), 15 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 755f2222..eeffd7b3 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -52,8 +52,10 @@ export(ungroup) export(unnest) importFrom(R6,R6Class) importFrom(data.table,":=") +importFrom(data.table,address) importFrom(data.table,as.data.table) importFrom(data.table,between) +importFrom(data.table,copy) importFrom(data.table,key) importFrom(data.table,set) importFrom(data.table,setkeyv) diff --git a/R/archive.R b/R/archive.R index 46569003..9bd61e07 100644 --- a/R/archive.R +++ b/R/archive.R @@ -520,7 +520,7 @@ epi_archive = #' @param fill_versions_end as in [`epix_fill_through_version`] #' @param how as in [`epix_fill_through_version`] #' -#' @importFrom data.table key setkeyv := +#' @importFrom 
data.table key setkeyv := address copy #' @importFrom rlang arg_match fill_through_version = function(fill_versions_end, how=c("na", "lvcf")) { @@ -546,9 +546,19 @@ epi_archive = "to one > %3$s, implying at least one version in between." ), self$observed_versions_end, next_version_tag, fill_versions_end)) } - next_version_DT = unique(self$DT, by=nonversion_key_cols)[ + nonversion_key_vals_ever_recorded = unique(self$DT, by=nonversion_key_cols) + # In edge cases, the `unique` result can alias the original + # DT; detect and copy if necessary: + if (identical(address(self$DT), address(nonversion_key_vals_ever_recorded))) { + nonversion_key_vals_ever_recorded <- copy(nonversion_key_vals_ever_recorded) + } + next_version_DT = nonversion_key_vals_ever_recorded[ , version := next_version_tag][ + # this makes the class of these columns logical (`NA` is a + # logical NA; we're relying on the rbind below to convert to + # the proper class&typeof) , (nonkey_cols) := NA] + # full result DT: setkeyv(rbind(self$DT, next_version_DT), key(self$DT))[] }, "lvcf" = { diff --git a/tests/testthat/test-epix_fill_through_version.R b/tests/testthat/test-epix_fill_through_version.R index 265ebc08..92c4cbbd 100644 --- a/tests/testthat/test-epix_fill_through_version.R +++ b/tests/testthat/test-epix_fill_through_version.R @@ -46,18 +46,34 @@ test_that("epix_fill_through_version can extend observed versions, gives expecte }) test_that("epix_fill_through_version does not mutate x", { - ea_orig = as_epi_archive(data.table::data.table(geo_value = "g1", time_value = as.Date("2020-01-01"), - version = 1:5, value = 1:5)) - # We want to perform a strict comparison of the contents of `ea_orig` before - # and `ea_orig` after. `clone` + `expect_identical` based on waldo would sort - # of work, but we might want something stricter. 
`as.list` + `identical` seems - # to do the trick - ea_orig_before_as_list = as.list(ea_orig) - some_unobserved_version = 8L - ea_fill_na = epix_fill_through_version(ea_orig, some_unobserved_version, "na") - ea_orig_after_as_list = as.list(ea_orig) - # use identical, not expect_identical; latter isn't as strict - expect_true(identical(ea_orig_before_as_list, ea_orig_after_as_list)) + for (ea_orig in list( + # vanilla case + as_epi_archive(data.table::data.table(geo_value = "g1", time_value = as.Date("2020-01-01"), + version = 1:5, value = 1:5)), + # data.table unique yielding original DT by reference special case (maybe + # having only 1 row is the trigger? having no revisions of initial values + # doesn't seem sufficient to trigger) + as_epi_archive(tibble::tibble(geo_value=1L, time_value=1L, version=1L, value=10L)) + )) { + # We want to perform a strict comparison of the contents of `ea_orig` before + # and `ea_orig` after. `clone` + `expect_identical` based on waldo would + # sort of work, but we might want something stricter. `as.list` + + # `identical` plus a check of the DT seems to do the trick. 
+ ea_orig_before_as_list = as.list(ea_orig) + ea_orig_DT_before_copy = data.table::copy(ea_orig$DT) + some_unobserved_version = 8L + # + ea_fill_na = epix_fill_through_version(ea_orig, some_unobserved_version, "na") + ea_orig_after_as_list = as.list(ea_orig) + # use identical, not expect_identical, for the R6-as-list test; latter isn't as strict + expect_true(identical(ea_orig_before_as_list, ea_orig_after_as_list)) + expect_identical(ea_orig_DT_before_copy, ea_orig$DT) + # + ea_fill_lvcf = epix_fill_through_version(ea_orig, some_unobserved_version, "lvcf") + ea_orig_after_as_list = as.list(ea_orig) + expect_true(identical(ea_orig_before_as_list, ea_orig_after_as_list)) + expect_identical(ea_orig_DT_before_copy, ea_orig$DT) + } }) test_that("x$fill_through_version mutates x (if needed)", { @@ -78,3 +94,13 @@ test_that("{epix_,$}fill_through_version return with expected visibility", { expect_true(withVisible(epix_fill_through_version(ea, 10L, "na"))[["visible"]]) expect_false(withVisible(ea$fill_through_version(15L, "na"))[["visible"]]) }) + +test_that("epix_fill_through_version returns same key & doesn't mutate old DT or its key", { + ea = as_epi_archive(tibble::tibble(geo_value=1L, time_value=1L, version=1L, value=10L)) + old_DT = ea$DT + old_DT_copy = data.table::copy(old_DT) + old_key = data.table::key(ea$DT) + expect_identical(data.table::key(epix_fill_through_version(ea, 5L, "na")$DT), old_key) + expect_identical(data.table::key(epix_fill_through_version(ea, 5L, "lvcf")$DT), old_key) + expect_identical(data.table::key(ea$DT), old_key) +}) diff --git a/tests/testthat/test-methods-epi_archive.R b/tests/testthat/test-methods-epi_archive.R index 91367271..4377035f 100644 --- a/tests/testthat/test-methods-epi_archive.R +++ b/tests/testthat/test-methods-epi_archive.R @@ -22,7 +22,9 @@ test_that("Warning against max_version being same as edf's max version",{ expect_warning(ea$as_of(max_version = min(ea$DT$version)),NA) }) -test_that("as_of properly grabs the data",{ 
+test_that("as_of properly grabs the data and doesn't mutate key",{ + old_key = data.table::key(ea$DT) + df_as_of <- ea %>% epix_as_of(max_version = as.Date("2020-07-01")) %>% na.omit() %>% @@ -35,6 +37,7 @@ test_that("as_of properly grabs the data",{ as.data.frame() expect_equal(df_as_of[1:4],df_filter) + expect_equal(data.table::key(ea$DT), old_key) }) # epix_merge tests From 74cc3cffbf8dc0d6f653ef70b7c26fcc7d573e27 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Sun, 24 Jul 2022 13:59:21 -0700 Subject: [PATCH 77/96] Fill out tests for new epix_merge Remove tests for undocumented and now unsupported merge behavior with data.tables. --- R/methods-epi_archive.R | 10 +- tests/testthat/test-epix_merge.R | 135 +++++++++++++++++++++- tests/testthat/test-methods-epi_archive.R | 29 ----- 3 files changed, 139 insertions(+), 35 deletions(-) diff --git a/R/methods-epi_archive.R b/R/methods-epi_archive.R index 7de41e52..11cd6538 100644 --- a/R/methods-epi_archive.R +++ b/R/methods-epi_archive.R @@ -170,12 +170,12 @@ epix_merge = function(x, y, } if (length(x$additional_metadata) != 0L) { - Warn("x$additional_metadata will be dropped", - class = "epiprocess__epix_merge_drops_additional_metadata") + Warn("x$additional_metadata won't appear in merge result", + class = "epiprocess__epix_merge_ignores_additional_metadata") } if (length(y$additional_metadata) != 0L) { - Warn("y$additional_metadata will be dropped", - class = "epiprocess__epix_merge_drops_additional_metadata") + Warn("y$additional_metadata won't appear in merge result", + class = "epiprocess__epix_merge_ignores_additional_metadata") } result_additional_metadata = list() @@ -197,7 +197,7 @@ epix_merge = function(x, y, "`x$observed_versions_end` was not identical to `y$observed_versions_end`;", "either ensure that `x` and `y` are equally up to date before merging,", "or specify how to deal with this using `observed_versions_end_conflict`" - )) + ), 
class="epiprocess__epix_merge_unresolved_observed_versions_end_conflict") } else { new_observed_versions_end = x$observed_versions_end x_DT = x$DT diff --git a/tests/testthat/test-epix_merge.R b/tests/testthat/test-epix_merge.R index f6e2fc14..527c808d 100644 --- a/tests/testthat/test-epix_merge.R +++ b/tests/testthat/test-epix_merge.R @@ -1,4 +1,9 @@ +test_that("epix_merge requires stops on invalid `y`",{ + ea = archive_cases_dv_subset$clone() + expect_error(epix_merge(ea, data.frame(x=1))) +}) + test_that("epix_merge merges and carries forward updates properly", { x = as_epi_archive( data.table::as.data.table( @@ -53,4 +58,132 @@ test_that("epix_merge merges and carries forward updates properly", { expect_identical(xy, xy_expected) }) -# TODO test other behaviors +test_that('epix_merge stops and warns on metadata and naming issues', { + expect_error( + epix_merge( + as_epi_archive(tibble::tibble(geo_value="tx", time_value=1L, version=1L, x_value=1L)), + as_epi_archive(tibble::tibble(geo_value="us", time_value=1L, version=5L, y_value=2L)) + ), + regexp = "must have the same.*geo_type" + ) + expect_error( + epix_merge( + as_epi_archive(tibble::tibble(geo_value="pa", time_value=1L, version=1L, x_value=1L)), + as_epi_archive(tibble::tibble(geo_value="pa", time_value=as.Date("2020-01-01"), version=5L, y_value=2L)) + ), + regexp = "must have the same.*time_type" + ) + expect_error( + epix_merge( + as_epi_archive(tibble::tibble(geo_value=1L, time_value=1L, version=1L, value=1L)), + as_epi_archive(tibble::tibble(geo_value=1L, time_value=1L, version=1L, value=2L)) + ), + regexp = "overlapping.*names" + ) + expect_warning( + epix_merge( + as_epi_archive(tibble::tibble(geo_value=1L, time_value=1L, version=1L, x_value=1L), + additional_metadata=list("updates_fetched"=lubridate::ymd_hms("2022-05-01 16:00:00", tz="UTC"))), + as_epi_archive(tibble::tibble(geo_value=1L, time_value=1L, version=1L, y_value=2L)) + ), + regexp = "x\\$additional_metadata", + class = 
"epiprocess__epix_merge_ignores_additional_metadata" + ) + expect_warning( + epix_merge( + as_epi_archive(tibble::tibble(geo_value=1L, time_value=1L, version=1L, x_value=1L)), + as_epi_archive(tibble::tibble(geo_value=1L, time_value=1L, version=1L, y_value=2L), + additional_metadata=list("updates_fetched"=lubridate::ymd_hms("2022-05-01 16:00:00", tz="UTC"))) + ), + regexp = "y\\$additional_metadata", + class = "epiprocess__epix_merge_ignores_additional_metadata" + ) +}) + +# use `local` to prevent accidentally using the x, y, xy bindings here +# elsewhere, while allowing reuse across a couple tests +local({ + x = as_epi_archive(tibble::tibble(geo_value=1L, time_value=1L, version=1L, x_value=1L), + clobberable_versions_start=1L, observed_versions_end = 10L) + y = as_epi_archive(tibble::tibble(geo_value=1L, time_value=1L, version=1L, y_value=2L), + clobberable_versions_start=3L, observed_versions_end = 10L) + xy = epix_merge(x,y) + test_that('epix_merge considers partially-clobberable row to be clobberable', { + expect_identical(xy$clobberable_versions_start, 1L) + }) + test_that('epix_merge result uses observed_versions_end metadata not max version val', { + expect_identical(xy$observed_versions_end, 10L) + }) +}) + +local({ + x = as_epi_archive(tibble::tibble(geo_value=1L, time_value=1L, version=1L, x_value=10L)) + y = as_epi_archive(tibble::tibble(geo_value=1L, time_value=1L, version=5L, y_value=20L)) + print(epix_merge(x,y, observed_versions_end_conflict = "na")) + test_that('epix_merge stops on observed_versions_end_conflict default or "stop"', { + expect_error(epix_merge(x,y), + class="epiprocess__epix_merge_unresolved_observed_versions_end_conflict") + expect_error(epix_merge(x,y, observed_versions_end_conflict = "stop"), + class="epiprocess__epix_merge_unresolved_observed_versions_end_conflict") + }) + test_that('epix_merge observed_versions_end_conflict="na" works', { + expect_equal( + epix_merge(x,y, observed_versions_end_conflict = "na"), + 
as_epi_archive(tibble::tribble( + ~geo_value, ~time_value, ~version, ~x_value, ~y_value, + 1L, 1L, 1L, 10L, NA_integer_, # x updated, y not observed yet + 1L, 1L, 2L, NA_integer_, NA_integer_, # NA-ing out x, y not observed yet + 1L, 1L, 5L, NA_integer_, 20L, # x still NA, y updated + ), clobberable_versions_start=1L) + ) + }) + test_that('epix_merge observed_versions_end_conflict="lvcf" works', { + expect_equal( + epix_merge(x,y, observed_versions_end_conflict = "lvcf"), + as_epi_archive(tibble::tribble( + ~geo_value, ~time_value, ~version, ~x_value, ~y_value, + 1L, 1L, 1L, 10L, NA_integer_, # x updated, y not observed yet + 1L, 1L, 5L, 10L, 20L, # x LVCF'd, y updated + ), clobberable_versions_start=1L) + ) + }) + x_no_conflict = as_epi_archive(tibble::tibble(geo_value=1L, time_value=1L, version=1L, x_value=10L)) + y_no_conflict = as_epi_archive(tibble::tibble(geo_value=1L, time_value=1L, version=1L, y_value=20L)) + xy_no_conflict_expected = as_epi_archive(tibble::tribble( + ~geo_value, ~time_value, ~version, ~x_value, ~y_value, + 1L, 1L, 1L, 10L, 20L, # x updated, y not observed yet + )) + test_that('epix_merge observed_versions_end_conflict="stop" on no-conflict works', { + expect_equal( + epix_merge(x_no_conflict, y_no_conflict, observed_versions_end_conflict = "stop"), + xy_no_conflict_expected + ) + }) + test_that('epix_merge observed_versions_end_conflict="na" on no-conflict works', { + # This test is the main reason for these no-conflict tests. We want to make + # sure that we don't add an unnecessary NA-ing-out version beyond a common + # observed_versions_end. 
+ expect_equal( + epix_merge(x_no_conflict, y_no_conflict, observed_versions_end_conflict = "na"), + xy_no_conflict_expected + ) + }) + test_that('epix_merge observed_versions_end_conflict="lvcf" on no-conflict works', { + expect_equal( + epix_merge(x_no_conflict, y_no_conflict, observed_versions_end_conflict = "lvcf"), + xy_no_conflict_expected + ) + }) +}) + + +test_that('epix_merge observed_versions_end_conflict="na" balks if do not know next_after', { + expect_error( + epix_merge( + as_epi_archive(tibble::tibble(geo_value=1L, time_value=1L, version=as.POSIXct(as.Date("2020-01-01")), x_value=10L)), + as_epi_archive(tibble::tibble(geo_value=1L, time_value=1L, version=as.POSIXct(as.Date("2020-01-02")), y_value=20L)), + observed_versions_end_conflict = "na" + ), + regexp = "no applicable method.*next_after" + ) +}) diff --git a/tests/testthat/test-methods-epi_archive.R b/tests/testthat/test-methods-epi_archive.R index 4377035f..4f8f3c42 100644 --- a/tests/testthat/test-methods-epi_archive.R +++ b/tests/testthat/test-methods-epi_archive.R @@ -39,32 +39,3 @@ test_that("as_of properly grabs the data and doesn't mutate key",{ expect_equal(df_as_of[1:4],df_filter) expect_equal(data.table::key(ea$DT), old_key) }) - -# epix_merge tests -test_that("epix_merge requires second argument to be a data.table or - epi_archive",{ - expect_error(epix_merge(ea,data.frame(x=1))) -}) - -test_that("data.table merging is utilized if second argument is a data.table",{ - dt1 <- select(ea$DT , -case_rate_7d_av) - ea1 <- as_epi_archive(dt1) - dt2 <- select(ea$DT , -percent_cli) - - expect_identical( - epix_merge(ea1,dt2), - merge(dt1,dt2) - ) -}) - -test_that("data.table merging works as intended",{ - ea <- archive_cases_dv_subset$clone() - dt1 <- select(ea$DT , -case_rate_7d_av) - ea1 <- as_epi_archive(dt1) - dt2 <- select(ea$DT , -percent_cli) - - expect_identical( - as_epi_archive(ea$DT), - as_epi_archive(merge(dt1,dt2)) - ) -}) From 2691294870562253ba8a1fabfd4292229696fdfd Mon Sep 17 
00:00:00 2001 From: "Logan C. Brooks" Date: Sun, 24 Jul 2022 14:08:08 -0700 Subject: [PATCH 78/96] Fix head -> utils::head --- R/data.R | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/R/data.R b/R/data.R index c15bb96d..f642ac47 100644 --- a/R/data.R +++ b/R/data.R @@ -96,13 +96,13 @@ some_package_is_being_unregistered = function(parent_n = 0L) { # triggering if, in a later version, we decide to loosen the `call_name` # checks below to something that would be `TRUE` for the name of this function # or one of the undesired call ancestors. - calls_to_inspect = head(calls, n = -(parent_n + 1L)) - # Note that `head(sys.calls(), n=-1L)` isn't equivalent, due to lazy argument - # evaluation. Note that copy-pasting the body of this function without this - # `head` operation isn't always equivalent to calling it; e.g., within the - # `value` argument of a package-level `delayedAssign`, `sys.calls()` will - # return `NULL` is some or all cases, including when its evaluation has been - # triggered via `unregister`. + calls_to_inspect = utils::head(calls, n = -(parent_n + 1L)) + # Note that `utils::head(sys.calls(), n=-1L)` isn't equivalent, due to lazy + # argument evaluation. Note that copy-pasting the body of this function + # without this `utils::head` operation isn't always equivalent to calling it; + # e.g., within the `value` argument of a package-level `delayedAssign`, + # `sys.calls()` will return `NULL` in some or all cases, including when its + # evaluation has been triggered via `unregister`. simple_call_names = purrr::map_chr(calls_to_inspect, function(call) { maybe_simple_call_name = rlang::call_name(call) if (is.null(maybe_simple_call_name)) NA_character_ else maybe_simple_call_name From 19322bc41ae71336b1fdadd1f9593a94f89173a7 Mon Sep 17 00:00:00 2001 From: "Logan C.
Brooks" Date: Sun, 24 Jul 2022 14:22:46 -0700 Subject: [PATCH 79/96] Fix some undoc'd archive params, base::invisible links --- DESCRIPTION | 2 +- R/archive.R | 66 +++++++++++++++++++++-------------------- R/methods-epi_archive.R | 2 +- man/as_epi_archive.Rd | 32 ++++++++++++++++++++ man/as_epi_df.Rd | 10 +++---- man/epi_archive.Rd | 34 ++------------------- 6 files changed, 76 insertions(+), 70 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index b0b83ca0..773df354 100755 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -57,7 +57,7 @@ Config/testthat/edition: 3 Encoding: UTF-8 LazyData: true Roxygen: list(markdown = TRUE) -RoxygenNote: 7.2.0 +RoxygenNote: 7.2.1 Depends: R (>= 2.10) URL: https://cmu-delphi.github.io/epiprocess/ diff --git a/R/archive.R b/R/archive.R index 9bd61e07..eb7ff929 100644 --- a/R/archive.R +++ b/R/archive.R @@ -242,37 +242,8 @@ epi_archive = #' here is removing a large proportion of the rows, this may indicate a #' potential for space, time, or bandwidth savings upstream the data pipeline, #' e.g., when fetching, storing, or preparing the input data `x` -#' @param clobberable_versions_start Optional; `length`-1; either a value of the -#' same `class` and `typeof` as `x$version`, or an `NA` of any `class` and -#' `typeof`: specifically, either (a) the earliest version that could be -#' subject to "clobbering" (being overwritten with different update data, but -#' using the same version tag as the old update data), or (b) `NA`, to -#' indicate that no versions are clobberable. There are a variety of reasons -#' why versions could be clobberable, such as upstream hotfixes to the latest -#' version, or delays in data synchronization that were mistaken for versions -#' with no updates; potential causes vary between different data pipelines. 
-#' The default value is `max_version_with_row_in(x)`; this default assumes -#' that (i) if a row in `x` (even one that `compactify` would consider -#' redundant) is present with version `ver`, then all previous versions must -#' be finalized and non-clobberable, although `ver` (and onward) might still -#' be modified, (ii) even if we have "observed" empty updates for some -#' versions beyond `max(x$version)` (as indicated by `observed_versions_end`; -#' see below), we can't assume `max(x$version)` has been finalized, because we -#' might see a nonfinalized version + empty subsequent versions due to -#' upstream database replication delays in combination with the upstream -#' replicas using last-version-carried-forward to extrapolate that there were -#' no updates, (iii) "redundant" update rows that would be removed by -#' `compactify` are not redundant, and actually come from an explicit version -#' release that indicates that preceding versions are finalized. If `nrow(x) -#' == 0`, then this argument is mandatory. -#' @param observed_versions_end Optional; length-1, same `class` and `typeof` as -#' `x$version`: what is the last version we have observed? The default is -#' `max_version_with_row_in(x)`, but values greater than this could also be -#' valid, and would indicate that we observed additional versions of the data -#' beyond `max(x$version)`, but they all contained empty updates. (The default -#' value of `clobberable_versions_start` does not fully trust these empty -#' updates, and assumes that any version `>= max(x$version)` could be -#' clobbered.) If `nrow(x) == 0`, then this argument is mandatory. +#' @param clobberable_versions_start Optional; as in [`as_epi_archive`] +#' @param observed_versions_end Optional; as in [`as_epi_archive`] #' @return An `epi_archive` object.
#' @importFrom data.table as.data.table key setkeyv #' @@ -580,7 +551,7 @@ epi_archive = #' @description Merges another `epi_archive` with the current one, mutating the #' current one by reseating its `DT` and several other fields, but avoiding #' mutation of the old `DT`; returns the current archive -#' \link{base:invisible}[invisibly]. See [`epix_merge`] for a full description +#' [invisibly][base::invisible]. See [`epix_merge`] for a full description #' of the non-R6-method version, which does not mutate either archive, and #' does not alias either archive's `DT`. #' @param y as in [`epix_merge`] @@ -793,6 +764,37 @@ epi_archive = #' here is removing a large proportion of the rows, this may indicate a #' potential for space, time, or bandwidth savings upstream the data pipeline, #' e.g., when fetching, storing, or preparing the input data `x` +#' @param clobberable_versions_start Optional; `length`-1; either a value of the +#' same `class` and `typeof` as `x$version`, or an `NA` of any `class` and +#' `typeof`: specifically, either (a) the earliest version that could be +#' subject to "clobbering" (being overwritten with different update data, but +#' using the same version tag as the old update data), or (b) `NA`, to +#' indicate that no versions are clobberable. There are a variety of reasons +#' why versions could be clobberable, such as upstream hotfixes to the latest +#' version, or delays in data synchronization that were mistaken for versions +#' with no updates; potential causes vary between different data pipelines. 
+#' The default value is `max_version_with_row_in(x)`; this default assumes +#' that (i) if a row in `x` (even one that `compactify` would consider +#' redundant) is present with version `ver`, then all previous versions must +#' be finalized and non-clobberable, although `ver` (and onward) might still +#' be modified, (ii) even if we have "observed" empty updates for some +#' versions beyond `max(x$version)` (as indicated by `observed_versions_end`; +#' see below), we can't assume `max(x$version)` has been finalized, because we +#' might see a nonfinalized version + empty subsequent versions due to +#' upstream database replication delays in combination with the upstream +#' replicas using last-version-carried-forward to extrapolate that there were +#' no updates, (iii) "redundant" update rows that would be removed by +#' `compactify` are not redundant, and actually come from an explicit version +#' release that indicates that preceding versions are finalized. If `nrow(x) +#' == 0`, then this argument is mandatory. +#' @param observed_versions_end Optional; length-1, same `class` and `typeof` as +#' `x$version`: what is the last version we have observed? The default is +#' `max_version_with_row_in(x)`, but values greater than this could also be +#' valid, and would indicate that we observed additional versions of the data +#' beyond `max(x$version)`, but they all contained empty updates. (The default +#' value of `clobberable_versions_start` does not fully trust these empty +#' updates, and assumes that any version `>= max(x$version)` could be +#' clobbered.) If `nrow(x) == 0`, then this argument is mandatory. #' @return An `epi_archive` object. 
#' #' @details This simply a wrapper around the `new()` method of the `epi_archive` diff --git a/R/methods-epi_archive.R b/R/methods-epi_archive.R index 11cd6538..bd61e4f6 100644 --- a/R/methods-epi_archive.R +++ b/R/methods-epi_archive.R @@ -73,7 +73,7 @@ epix_as_of = function(x, max_version, min_time_value = -Inf) { #' `x$DT`). The R6 method variant, `x$fill_through_version`, will mutate `x` to #' give the result, but might reseat its fields (e.g., references to the old #' `x$DT` might not be updated by this function or subsequent operations on -#' `x`), and returns the updated `x` \link[base:invisible]{invisibly}. +#' `x`), and returns the updated `x` [invisibly][base::invisible]. #' #' @param x An `epi_archive` #' @param fill_versions_end Length-1, same class&type as `%s$version`: the diff --git a/man/as_epi_archive.Rd b/man/as_epi_archive.Rd index 4b3d97ea..ad00f531 100644 --- a/man/as_epi_archive.Rd +++ b/man/as_epi_archive.Rd @@ -48,6 +48,38 @@ set to \code{TRUE}, but if you directly inspect or edit the fields of the here is removing a large proportion of the rows, this may indicate a potential for space, time, or bandwidth savings upstream the data pipeline, e.g., when fetching, storing, or preparing the input data \code{x}} + +\item{clobberable_versions_start}{Optional; \code{length}-1; either a value of the +same \code{class} and \code{typeof} as \code{x$version}, or an \code{NA} of any \code{class} and +\code{typeof}: specifically, either (a) the earliest version that could be +subject to "clobbering" (being overwritten with different update data, but +using the same version tag as the old update data), or (b) \code{NA}, to +indicate that no versions are clobberable. There are a variety of reasons +why versions could be clobberable, such as upstream hotfixes to the latest +version, or delays in data synchronization that were mistaken for versions +with no updates; potential causes vary between different data pipelines. 
+The default value is \code{max_version_with_row_in(x)}; this default assumes +that (i) if a row in \code{x} (even one that \code{compactify} would consider +redundant) is present with version \code{ver}, then all previous versions must +be finalized and non-clobberable, although \code{ver} (and onward) might still +be modified, (ii) even if we have "observed" empty updates for some +versions beyond \code{max(x$version)} (as indicated by \code{observed_versions_end}; +see below), we can't assume \code{max(x$version)} has been finalized, because we +might see a nonfinalized version + empty subsequent versions due to +upstream database replication delays in combination with the upstream +replicas using last-version-carried-forward to extrapolate that there were +no updates, (iii) "redundant" update rows that would be removed by +\code{compactify} are not redundant, and actually come from an explicit version +release that indicates that preceding versions are finalized. If \code{nrow(x) == 0}, then this argument is mandatory.} + +\item{observed_versions_end}{Optional; length-1, same \code{class} and \code{typeof} as +\code{x$version}: what is the last version we have observed? The default is +\code{max_version_with_row_in(x)}, but values greater than this could also be +valid, and would indicate that we observed additional versions of the data +beyond \code{max(x$version)}, but they all contained empty updates. (The default +value of \code{clobberable_versions_start} does not fully trust these empty +updates, and assumes that any version \verb{>= max(x$version)} could be +clobbered.) If \code{nrow(x) == 0}, then this argument is mandatory.} } \value{ An \code{epi_archive} object. diff --git a/man/as_epi_df.Rd b/man/as_epi_df.Rd index 5d1b1335..b5df1302 100644 --- a/man/as_epi_df.Rd +++ b/man/as_epi_df.Rd @@ -51,9 +51,9 @@ examples. } \section{Methods (by class)}{ \itemize{ -\item \code{epi_df}: Simply returns the \code{epi_df} object unchanged. 
+\item \code{as_epi_df(epi_df)}: Simply returns the \code{epi_df} object unchanged. -\item \code{tbl_df}: The input tibble \code{x} must contain the columns +\item \code{as_epi_df(tbl_df)}: The input tibble \code{x} must contain the columns \code{geo_value} and \code{time_value}. All other columns will be preserved as is, and treated as measured variables. If \code{as_of} is missing, then the function will try to guess it from an \code{as_of}, \code{issue}, or \code{version} column of \code{x} @@ -61,14 +61,14 @@ will try to guess it from an \code{as_of}, \code{issue}, or \code{version} colum (stored in its attributes); if this fails, then the current day-time will be used. -\item \code{data.frame}: Works analogously to \code{as_epi_df.tbl_df()}. +\item \code{as_epi_df(data.frame)}: Works analogously to \code{as_epi_df.tbl_df()}. -\item \code{tbl_ts}: Works analogously to \code{as_epi_df.tbl_df()}, except that +\item \code{as_epi_df(tbl_ts)}: Works analogously to \code{as_epi_df.tbl_df()}, except that the \code{tbl_ts} class is dropped, and any key variables (other than "geo_value") are added to the metadata of the returned object, under the \code{other_keys} field. 
-}} +}} \examples{ # Convert a `tsibble` that has county code as an extra key # Notice that county code should be a character string to preserve any leading zeroes diff --git a/man/epi_archive.Rd b/man/epi_archive.Rd index e0d8f1d3..c2e8981d 100644 --- a/man/epi_archive.Rd +++ b/man/epi_archive.Rd @@ -179,37 +179,9 @@ here is removing a large proportion of the rows, this may indicate a potential for space, time, or bandwidth savings upstream the data pipeline, e.g., when fetching, storing, or preparing the input data \code{x}} -\item{\code{clobberable_versions_start}}{Optional; \code{length}-1; either a value of the -same \code{class} and \code{typeof} as \code{x$version}, or an \code{NA} of any \code{class} and -\code{typeof}: specifically, either (a) the earliest version that could be -subject to "clobbering" (being overwritten with different update data, but -using the same version tag as the old update data), or (b) \code{NA}, to -indicate that no versions are clobberable. There are a variety of reasons -why versions could be clobberable, such as upstream hotfixes to the latest -version, or delays in data synchronization that were mistaken for versions -with no updates; potential causes vary between different data pipelines. 
-The default value is \code{max_version_with_row_in(x)}; this default assumes -that (i) if a row in \code{x} (even one that \code{compactify} would consider -redundant) is present with version \code{ver}, then all previous versions must -be finalized and non-clobberable, although \code{ver} (and onward) might still -be modified, (ii) even if we have "observed" empty updates for some -versions beyond \code{max(x$version)} (as indicated by \code{observed_versions_end}; -see below), we can't assume \code{max(x$version)} has been finalized, because we -might see a nonfinalized version + empty subsequent versions due to -upstream database replication delays in combination with the upstream -replicas using last-version-carried-forward to extrapolate that there were -no updates, (iii) "redundant" update rows that would be removed by -\code{compactify} are not redundant, and actually come from an explicit version -release that indicates that preceding versions are finalized. If \code{nrow(x) == 0}, then this argument is mandatory.} +\item{\code{clobberable_versions_start}}{Optional; as in \code{\link{as_epi_archive}}} -\item{\code{observed_versions_end}}{Optional; length-1, same \code{class} and \code{typeof} as -\code{x$version}: what is the last version we have observed? The default is -\code{max_version_with_row_in(x)}, but values greater than this could also be -valid, and would indicate that we observed additional versions of the data -beyond \code{max(x$version)}, but they all contained empty updates. (The default -value of \code{clobberable_versions_start} does not fully trust these empty -updates, and assumes that any version \verb{>= max(x$version)} could be -clobbered.) If \code{nrow(x) == 0}, then this argument is mandatory.} +\item{\code{observed_versions_end}}{Optional; as in \code{\link{as_epi_archive}}} } \if{html}{\out{
}} } @@ -271,7 +243,7 @@ version, which doesn't mutate the input archive but might alias its fields. Merges another \code{epi_archive} with the current one, mutating the current one by reseating its \code{DT} and several other fields, but avoiding mutation of the old \code{DT}; returns the current archive -\link{base:invisible}\link{invisibly}. See \code{\link{epix_merge}} for a full description +\link[base:invisible]{invisibly}. See \code{\link{epix_merge}} for a full description of the non-R6-method version, which does not mutate either archive, and does not alias either archive's \code{DT}. \subsection{Usage}{ From dc8fc1025c47f5acb58af541014cca51a779297f Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Sun, 24 Jul 2022 14:58:17 -0700 Subject: [PATCH 80/96] Merge in epix_slide quosure-passing fix + test --- NAMESPACE | 1 + R/archive.R | 19 ++-- R/methods-epi_archive.R | 2 +- tests/testthat/test-methods-epi_archive.R | 104 ++++++++++++++++++++++ 4 files changed, 114 insertions(+), 12 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index eeffd7b3..a290ab27 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -80,6 +80,7 @@ importFrom(rlang,arg_match) importFrom(rlang,enquo) importFrom(rlang,enquos) importFrom(rlang,is_quosure) +importFrom(rlang,quo_is_missing) importFrom(rlang,sym) importFrom(rlang,syms) importFrom(stats,cor) diff --git a/R/archive.R b/R/archive.R index eb7ff929..a5d548ca 100644 --- a/R/archive.R +++ b/R/archive.R @@ -579,7 +579,7 @@ epi_archive = #' object. See the documentation for the wrapper function [`epix_slide()`] for #' details. #' @importFrom data.table key -#' @importFrom rlang !! !!! enquo enquos is_quosure sym syms +#' @importFrom rlang !! !!! 
enquo quo_is_missing enquos is_quosure sym syms slide = function(f, ..., n = 7, group_by, ref_time_values, time_step, new_col_name = "slide_value", as_list_col = FALSE, names_sep = "_", @@ -598,19 +598,16 @@ epi_archive = before_num = n-1 if (!missing(time_step)) before_num = time_step(n-1) - # What to group by? If missing, set according to internal keys - if (missing(group_by)) { - group_by = setdiff(key(self$DT), c("time_value", "version")) + # What to group by? If missing, set according to internal keys; + # otherwise, tidyselect. + if (quo_is_missing(enquo(group_by))) { + group_by <- syms(setdiff(key(self$DT), c("time_value", "version"))) + } else { + group_by <- syms(names(eval_select(enquo(group_by), self$DT))) } - # Symbolize column name, defuse grouping variables. We have to do - # the middle step here which is a bit complicated (unfortunately) - # since the function epix_slide() could have called the current one, - # and in doing so, it may have already needed to defuse the grouping - # variables + # Symbolize column name new_col = sym(new_col_name) - if (!is_quosure(group_by)) group_by = enquo(group_by) - group_by = syms(names(eval_select(group_by, self$DT))) # Key variable names, apart from time value and version key_vars = setdiff(key(self$DT), c("time_value", "version")) diff --git a/R/methods-epi_archive.R b/R/methods-epi_archive.R index bd61e4f6..fe344182 100644 --- a/R/methods-epi_archive.R +++ b/R/methods-epi_archive.R @@ -447,7 +447,7 @@ epix_slide = function(x, f, ..., n = 7, group_by, ref_time_values, as_list_col = FALSE, names_sep = "_", all_rows = FALSE) { if (!inherits(x, "epi_archive")) Abort("`x` must be of class `epi_archive`.") return(x$slide(f, ..., n = n, - group_by = enquo(group_by), + group_by = {{group_by}}, ref_time_values = ref_time_values, time_step = time_step, new_col_name = new_col_name, diff --git a/tests/testthat/test-methods-epi_archive.R b/tests/testthat/test-methods-epi_archive.R index 4f8f3c42..df107750 100644 --- 
a/tests/testthat/test-methods-epi_archive.R +++ b/tests/testthat/test-methods-epi_archive.R @@ -39,3 +39,107 @@ test_that("as_of properly grabs the data and doesn't mutate key",{ expect_equal(df_as_of[1:4],df_filter) expect_equal(data.table::key(ea$DT), old_key) }) + +test_that("quosure passing issue in epix_slide is resolved + other potential issues", { + # (First part adapted from @examples) + time_values <- seq(as.Date("2020-06-01"), + as.Date("2020-06-02"), + by = "1 day") + reference = epix_slide(x = archive_cases_dv_subset, + f = ~ mean(.x$case_rate_7d_av), + n = 3, + group_by = geo_value, + ref_time_values = time_values, + new_col_name = 'case_rate_3d_av') + # test the passing-something-that-must-be-enquosed behavior: + expect_identical( + archive_cases_dv_subset$slide( + f = ~ mean(.x$case_rate_7d_av), + n = 3, + group_by = geo_value, + ref_time_values = time_values, + new_col_name = 'case_rate_3d_av' + ), + reference + ) + # test the passing-string-literal behavior: + expect_identical( + epix_slide(x = archive_cases_dv_subset, + f = ~ mean(.x$case_rate_7d_av), + n = 3, + group_by = "geo_value", + ref_time_values = time_values, + new_col_name = 'case_rate_3d_av'), + reference + ) + expect_identical( + archive_cases_dv_subset$slide( + f = ~ mean(.x$case_rate_7d_av), + n = 3, + group_by = "geo_value", + ref_time_values = time_values, + new_col_name = 'case_rate_3d_av' + ), + reference + ) + # test the passing-string-var behavior: + my_group_by = "geo_value" + expect_identical( + epix_slide(x = archive_cases_dv_subset, + f = ~ mean(.x$case_rate_7d_av), + n = 3, + group_by = my_group_by, + ref_time_values = time_values, + new_col_name = 'case_rate_3d_av'), + reference + ) + expect_identical( + archive_cases_dv_subset$slide( + f = ~ mean(.x$case_rate_7d_av), + n = 3, + group_by = my_group_by, + ref_time_values = time_values, + new_col_name = 'case_rate_3d_av' + ), + reference + ) + # test the passing-splatted-string-var behavior: + my_group_by = "geo_value" + 
expect_identical( + epix_slide(x = archive_cases_dv_subset, + f = ~ mean(.x$case_rate_7d_av), + n = 3, + group_by = !!!my_group_by, + ref_time_values = time_values, + new_col_name = 'case_rate_3d_av'), + reference + ) + expect_identical( + archive_cases_dv_subset$slide( + f = ~ mean(.x$case_rate_7d_av), + n = 3, + group_by = !!!my_group_by, + ref_time_values = time_values, + new_col_name = 'case_rate_3d_av' + ), + reference + ) + # test the default behavior (default in this case should just be "geo_value"): + expect_identical( + epix_slide(x = archive_cases_dv_subset, + f = ~ mean(.x$case_rate_7d_av), + n = 3, + ref_time_values = time_values, + new_col_name = 'case_rate_3d_av'), + reference + ) + expect_identical( + archive_cases_dv_subset$slide( + f = ~ mean(.x$case_rate_7d_av), + n = 3, + ref_time_values = time_values, + new_col_name = 'case_rate_3d_av' + ), + reference + ) +}) From 1738e0050e3af29d2d4fb0899483c2c08f1a254b Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Sun, 24 Jul 2022 15:30:28 -0700 Subject: [PATCH 81/96] Fix tail -> utils::tail in epix_merge --- R/methods-epi_archive.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/methods-epi_archive.R b/R/methods-epi_archive.R index fe344182..2478375b 100644 --- a/R/methods-epi_archive.R +++ b/R/methods-epi_archive.R @@ -244,7 +244,7 @@ epix_merge = function(x, y, "geo_value", "time_value", and "version".', class="epiprocess__epi_archive_must_have_required_key_cols") } - if (length(by) < 1L || tail(by, 1L) != "version") { + if (length(by) < 1L || utils::tail(by, 1L) != "version") { Abort('Invalid `by`; `by` is currently set to the common `key` of the two archives, and is expected to have a "version" as the last key col.', From a7bf0b6f33faaa8254c2bdd04b73fdb150feae97 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Sun, 24 Jul 2022 15:32:55 -0700 Subject: [PATCH 82/96] Update examples, vignettes to use current merge (but keeping dead? 
code) Change examples, vignettes built expecting old mutating `epix_merge` to use nonmutating new `epix_merge` or mutating `$merge` instead. Remove `all=TRUE` that is now required. Also add `withr` under `Suggests:` since it appears in tests, vignettes, and maybe examples. --- DESCRIPTION | 3 ++- R/methods-epi_archive.R | 6 ++++-- man/as_epi_df.Rd | 10 +++++----- man/epix_merge.Rd | 6 ++++-- vignettes/advanced.Rmd | 6 +++--- 5 files changed, 18 insertions(+), 13 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 773df354..6f5c3c81 100755 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -47,7 +47,8 @@ Suggests: outbreaks, rmarkdown, testthat (>= 3.0.0), - waldo (>= 0.3.1) + waldo (>= 0.3.1), + withr VignetteBuilder: knitr Remotes: diff --git a/R/methods-epi_archive.R b/R/methods-epi_archive.R index 2478375b..89a99d36 100644 --- a/R/methods-epi_archive.R +++ b/R/methods-epi_archive.R @@ -143,8 +143,10 @@ epix_fill_through_version = function(x, fill_versions_end, #' y <- archive_cases_dv_subset$DT %>% #' dplyr::select(geo_value,time_value,version,percent_cli) %>% #' as_epi_archive(compactify=TRUE) -#' # a full join stored in x -#' epix_merge(x, y) +#' # merge results stored in a third object: +#' xy = epix_merge(x, y) +#' # vs. mutating x to hold the merge result: +#' x$merge(y) #' #' @importFrom data.table key set #' @export diff --git a/man/as_epi_df.Rd b/man/as_epi_df.Rd index b5df1302..5d1b1335 100644 --- a/man/as_epi_df.Rd +++ b/man/as_epi_df.Rd @@ -51,9 +51,9 @@ examples. } \section{Methods (by class)}{ \itemize{ -\item \code{as_epi_df(epi_df)}: Simply returns the \code{epi_df} object unchanged. +\item \code{epi_df}: Simply returns the \code{epi_df} object unchanged. -\item \code{as_epi_df(tbl_df)}: The input tibble \code{x} must contain the columns +\item \code{tbl_df}: The input tibble \code{x} must contain the columns \code{geo_value} and \code{time_value}. All other columns will be preserved as is, and treated as measured variables. 
If \code{as_of} is missing, then the function will try to guess it from an \code{as_of}, \code{issue}, or \code{version} column of \code{x} @@ -61,14 +61,14 @@ will try to guess it from an \code{as_of}, \code{issue}, or \code{version} colum (stored in its attributes); if this fails, then the current day-time will be used. -\item \code{as_epi_df(data.frame)}: Works analogously to \code{as_epi_df.tbl_df()}. +\item \code{data.frame}: Works analogously to \code{as_epi_df.tbl_df()}. -\item \code{as_epi_df(tbl_ts)}: Works analogously to \code{as_epi_df.tbl_df()}, except that +\item \code{tbl_ts}: Works analogously to \code{as_epi_df.tbl_df()}, except that the \code{tbl_ts} class is dropped, and any key variables (other than "geo_value") are added to the metadata of the returned object, under the \code{other_keys} field. - }} + \examples{ # Convert a `tsibble` that has county code as an extra key # Notice that county code should be a character string to preserve any leading zeroes diff --git a/man/epix_merge.Rd b/man/epix_merge.Rd index 866102ce..44f7559d 100644 --- a/man/epix_merge.Rd +++ b/man/epix_merge.Rd @@ -62,7 +62,9 @@ x <- archive_cases_dv_subset$DT \%>\% y <- archive_cases_dv_subset$DT \%>\% dplyr::select(geo_value,time_value,version,percent_cli) \%>\% as_epi_archive(compactify=TRUE) -# a full join stored in x -epix_merge(x, y) +# merge results stored in a third object: +xy = epix_merge(x, y) +# vs. mutating x to hold the merge result: +x$merge(y) } diff --git a/vignettes/advanced.Rmd b/vignettes/advanced.Rmd index a76c5225..04c73af3 100644 --- a/vignettes/advanced.Rmd +++ b/vignettes/advanced.Rmd @@ -224,13 +224,13 @@ x <- y1 %>% ) %>% as_epi_archive() -epix_merge(x, y2 %>% +# mutating merge operation: +x$merge(y2 %>% select(geo_value, time_value, version = issue, case_rate_7d_av = value ) %>% - as_epi_archive(), -all = TRUE + as_epi_archive() ) ``` From 22c055905b489b2ea805c7ae86ef166d57d87f8a Mon Sep 17 00:00:00 2001 From: "Logan C. 
Brooks" Date: Sun, 24 Jul 2022 15:52:13 -0700 Subject: [PATCH 83/96] Fix compactify vignette rebuilding in `check()` Add missing `library(epiprocess)` --- vignettes/compactify.Rmd | 1 + 1 file changed, 1 insertion(+) diff --git a/vignettes/compactify.Rmd b/vignettes/compactify.Rmd index 70aaf632..871cc3bc 100644 --- a/vignettes/compactify.Rmd +++ b/vignettes/compactify.Rmd @@ -29,6 +29,7 @@ use them to illustrate LVCF. Notice how the head of the first dataset differs from the second from the third value included. ```{r} +library(epiprocess) library(dplyr) dt <- archive_cases_dv_subset$DT From e26f6ddfbac7c21a9914ed11dde58d492782f00b Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Sun, 24 Jul 2022 15:56:41 -0700 Subject: [PATCH 84/96] Avoid some check() NOTEs from data.table dotdot prefix Rewrite things to use `with=FALSE` and avoid `..`. --- R/methods-epi_archive.R | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/R/methods-epi_archive.R b/R/methods-epi_archive.R index 89a99d36..1c256e42 100644 --- a/R/methods-epi_archive.R +++ b/R/methods-epi_archive.R @@ -211,8 +211,8 @@ epix_merge = function(x, y, y_DT = epix_fill_through_version(y, new_observed_versions_end, observed_versions_end_conflict)$DT } else if (observed_versions_end_conflict == "truncate") { new_observed_versions_end = min(x$observed_versions_end, y$observed_versions_end) - x_DT = x$DT[version <= ..new_observed_versions_end] - y_DT = y$DT[version <= ..new_observed_versions_end] + x_DT = x$DT[x[["DT"]][["version"]] <= new_observed_versions_end, with=FALSE] + y_DT = y$DT[y[["DT"]][["version"]] <= new_observed_versions_end, with=FALSE] } else Abort("unimplemented") if (!identical(key(x$DT), key(x_DT)) || !identical(key(y$DT), key(y_DT))) { @@ -262,7 +262,7 @@ epix_merge = function(x, y, incorporated into the key, and other columns should be renamed. 
", class="epiprocess__epix_merge_x_y_must_not_have_overlapping_nonby_colnames") } - x_by_vals = x_DT[, ..by] + x_by_vals = x_DT[, by, with=FALSE] if (anyDuplicated(x_by_vals) != 0L) { Abort(" The `by` columns must uniquely determine rows of `x$DT`; @@ -271,7 +271,7 @@ epix_merge = function(x, y, to `x`'s key (to get a unique key). ", class="epiprocess__epix_merge_by_cols_must_act_as_unique_key") } - y_by_vals = y_DT[, ..by] + y_by_vals = y_DT[, by, with=FALSE] if (anyDuplicated(y_by_vals) != 0L) { Abort(" The `by` columns must uniquely determine rows of `y$DT`; @@ -291,7 +291,7 @@ epix_merge = function(x, y, # Disable superfluous check: allow.cartesian=TRUE) set(result_DT,, x_nonby_colnames, - x_DT[result_DT[, ..by], ..x_nonby_colnames, + x_DT[result_DT[, by, with=FALSE], x_nonby_colnames, with=FALSE, # It's good practice to specify `on`, and we must # explicitly specify `on` if there's a potential key vs. # by order mismatch (not possible currently for x @@ -306,7 +306,7 @@ epix_merge = function(x, y, # similar story here. allow.cartesian=TRUE]) set(result_DT,, y_nonby_colnames, - y_DT[result_DT[, ..by], ..y_nonby_colnames, + y_DT[result_DT[, by, with=FALSE], y_nonby_colnames, with=FALSE, on = by, roll=TRUE, nomatch=NA, From adc367f3f2ecfcf3f41330f42fa846e4c35a71ea Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Mon, 25 Jul 2022 00:18:34 -0700 Subject: [PATCH 85/96] Remove improper and faulty tidyselect usage from group_by tests - Directly "passing" a reference to a character vector object is ambiguous; we might want to test that we trigger the message about this, but this might require mucking about in the internals of tidyselect (it seems to have its own frequency mechanism separate from rlang conditions). - `tidyselect::eval_select` doesn't support `!!!`, but testthat expectations somehow makes this empty out the `group_by` arg, which _coincidentally_ passes the tests, because the default for the test archive is to group by `geo_value`. 
Plus, `!!!` is intended to splice/splat a list of language objects, not a character vector. --- tests/testthat/test-methods-epi_archive.R | 31 +++++------------------ 1 file changed, 7 insertions(+), 24 deletions(-) diff --git a/tests/testthat/test-methods-epi_archive.R b/tests/testthat/test-methods-epi_archive.R index df107750..5a0522c9 100644 --- a/tests/testthat/test-methods-epi_archive.R +++ b/tests/testthat/test-methods-epi_archive.R @@ -82,13 +82,17 @@ test_that("quosure passing issue in epix_slide is resolved + other potential iss ), reference ) - # test the passing-string-var behavior: + # Might also want to test the passing-string-var-without-all_of behavior, but + # make sure to set, trigger, then reset (or restore to old value) the + # tidyselect once-per-session message about the ambiguity + # + # test the passing-all-of-string-var behavior: my_group_by = "geo_value" expect_identical( epix_slide(x = archive_cases_dv_subset, f = ~ mean(.x$case_rate_7d_av), n = 3, - group_by = my_group_by, + group_by = tidyselect::all_of(my_group_by), ref_time_values = time_values, new_col_name = 'case_rate_3d_av'), reference @@ -97,28 +101,7 @@ test_that("quosure passing issue in epix_slide is resolved + other potential iss archive_cases_dv_subset$slide( f = ~ mean(.x$case_rate_7d_av), n = 3, - group_by = my_group_by, - ref_time_values = time_values, - new_col_name = 'case_rate_3d_av' - ), - reference - ) - # test the passing-splatted-string-var behavior: - my_group_by = "geo_value" - expect_identical( - epix_slide(x = archive_cases_dv_subset, - f = ~ mean(.x$case_rate_7d_av), - n = 3, - group_by = !!!my_group_by, - ref_time_values = time_values, - new_col_name = 'case_rate_3d_av'), - reference - ) - expect_identical( - archive_cases_dv_subset$slide( - f = ~ mean(.x$case_rate_7d_av), - n = 3, - group_by = !!!my_group_by, + group_by = tidyselect::all_of(my_group_by), ref_time_values = time_values, new_col_name = 'case_rate_3d_av' ), From 
b2bde0ee111b924190c75eff553acb717780670c Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Mon, 25 Jul 2022 01:29:18 -0700 Subject: [PATCH 86/96] Make epix_slide tidyselect tests tougher (not just default grping) Existing tests of the resolution of some rlang&tidyselect issues in epix_slide were a bit loose, testing only different ways of explicitly specifying a grouping that all were equal to the default grouping. This came into play with a test that contained faulty usage of `!!!` which appeared to be somehow transformed by testthat expectations into something zapping the `group_by` arg from the related call. This tougher line of testing is intended to catch any similar types of issues with the other test cases. --- tests/testthat/test-methods-epi_archive.R | 61 ++++++++++++++--------- 1 file changed, 37 insertions(+), 24 deletions(-) diff --git a/tests/testthat/test-methods-epi_archive.R b/tests/testthat/test-methods-epi_archive.R index 5a0522c9..a5662638 100644 --- a/tests/testthat/test-methods-epi_archive.R +++ b/tests/testthat/test-methods-epi_archive.R @@ -45,84 +45,97 @@ test_that("quosure passing issue in epix_slide is resolved + other potential iss time_values <- seq(as.Date("2020-06-01"), as.Date("2020-06-02"), by = "1 day") - reference = epix_slide(x = archive_cases_dv_subset, - f = ~ mean(.x$case_rate_7d_av), - n = 3, - group_by = geo_value, - ref_time_values = time_values, - new_col_name = 'case_rate_3d_av') + # We only have one non-version, non-time key in the example archive. Add + # another so that we don't accidentally pass tests due to accidentally + # matching the default grouping. 
+ ea = as_epi_archive(archive_cases_dv_subset$DT %>% + dplyr::mutate(modulus = seq_len(nrow(.)) %% 5L), + other_keys = "modulus", + compactify = TRUE) + reference_by_modulus = epix_slide(x = ea, + f = ~ mean(.x$case_rate_7d_av), + n = 3, + group_by = modulus, + ref_time_values = time_values, + new_col_name = 'case_rate_3d_av') + reference_by_both = epix_slide(x = ea, + f = ~ mean(.x$case_rate_7d_av), + n = 3, + group_by = c(geo_value, modulus), + ref_time_values = time_values, + new_col_name = 'case_rate_3d_av') # test the passing-something-that-must-be-enquosed behavior: expect_identical( - archive_cases_dv_subset$slide( + ea$slide( f = ~ mean(.x$case_rate_7d_av), n = 3, - group_by = geo_value, + group_by = modulus, ref_time_values = time_values, new_col_name = 'case_rate_3d_av' ), - reference + reference_by_modulus ) # test the passing-string-literal behavior: expect_identical( - epix_slide(x = archive_cases_dv_subset, + epix_slide(x = ea, f = ~ mean(.x$case_rate_7d_av), n = 3, - group_by = "geo_value", + group_by = "modulus", ref_time_values = time_values, new_col_name = 'case_rate_3d_av'), - reference + reference_by_modulus ) expect_identical( - archive_cases_dv_subset$slide( + ea$slide( f = ~ mean(.x$case_rate_7d_av), n = 3, - group_by = "geo_value", + group_by = "modulus", ref_time_values = time_values, new_col_name = 'case_rate_3d_av' ), - reference + reference_by_modulus ) # Might also want to test the passing-string-var-without-all_of behavior, but # make sure to set, trigger, then reset (or restore to old value) the # tidyselect once-per-session message about the ambiguity # # test the passing-all-of-string-var behavior: - my_group_by = "geo_value" + my_group_by = "modulus" expect_identical( - epix_slide(x = archive_cases_dv_subset, + epix_slide(x = ea, f = ~ mean(.x$case_rate_7d_av), n = 3, group_by = tidyselect::all_of(my_group_by), ref_time_values = time_values, new_col_name = 'case_rate_3d_av'), - reference + reference_by_modulus ) expect_identical( - 
archive_cases_dv_subset$slide( + ea$slide( f = ~ mean(.x$case_rate_7d_av), n = 3, group_by = tidyselect::all_of(my_group_by), ref_time_values = time_values, new_col_name = 'case_rate_3d_av' ), - reference + reference_by_modulus ) # test the default behavior (default in this case should just be "geo_value"): expect_identical( - epix_slide(x = archive_cases_dv_subset, + epix_slide(x = ea, f = ~ mean(.x$case_rate_7d_av), n = 3, ref_time_values = time_values, new_col_name = 'case_rate_3d_av'), - reference + reference_by_both ) expect_identical( - archive_cases_dv_subset$slide( + ea$slide( f = ~ mean(.x$case_rate_7d_av), n = 3, ref_time_values = time_values, new_col_name = 'case_rate_3d_av' ), - reference + reference_by_both ) }) From 8271a978fcff5f9fb3b3422736b24e058c948ab0 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Wed, 27 Jul 2022 04:12:24 -0700 Subject: [PATCH 87/96] Change "LVCF" back to "LOCF", but with consistent intro text Always introduce version-wise LOCF as "last observation of each observation carried forward (LOCF)" or something quite similar. Clarify time-wise LOCF in one place. 
--- R/archive.R | 42 ++++++------- R/methods-epi_archive.R | 40 ++++++------- data-raw/archive_cases_dv_subset.R | 2 +- man/as_epi_archive.Rd | 4 +- man/as_epi_df.Rd | 10 ++-- man/epi_archive.Rd | 10 ++-- man/epix_as_of.Rd | 14 ++--- man/epix_fill_through_version.Rd | 10 ++-- man/epix_merge.Rd | 14 ++--- .../testthat/test-epix_fill_through_version.R | 16 ++--- tests/testthat/test-epix_merge.R | 16 +++-- vignettes/aggregation.Rmd | 4 +- vignettes/archive.Rmd | 2 +- vignettes/compactify.Rmd | 60 +++++++++---------- 14 files changed, 124 insertions(+), 120 deletions(-) diff --git a/R/archive.R b/R/archive.R index a5d548ca..783d3bb1 100644 --- a/R/archive.R +++ b/R/archive.R @@ -128,7 +128,7 @@ next_after.Date = function(x) x + 1L #' key variables, and thus the key variables are critical for figuring out how #' to generate a snapshot of data from the archive, as of a given version. #' -#' In general, the last version of each observation is carried forward (LVCF) to +#' In general, the last version of each observation is carried forward (LOCF) to #' fill in data between recorded versions, and between the last recorded #' update and the `observed_versions_end`. One consequence is that the `DT` #' doesn't have to contain a full snapshot of every version (although this @@ -230,8 +230,8 @@ epi_archive = #' @param compactify Optional; Boolean or `NULL`: should we remove rows that are #' considered redundant for the purposes of `epi_archive`'s built-in methods #' such as `as_of`? 
As these methods use the last version of each observation -#' carried forward (LVCF) to interpolate between the version data provided, -#' rows that don't change these LVCF results can potentially be omitted to +#' carried forward (LOCF) to interpolate between the version data provided, +#' rows that don't change these LOCF results can potentially be omitted to #' save space while maintaining the same behavior (with the help of the #' `clobberable_versions_start` and `observed_versions_end` fields in some #' edge cases). `TRUE` will remove these rows, `FALSE` will not, and missing @@ -337,30 +337,30 @@ epi_archive = DT = as.data.table(x, key = key_vars) if (!identical(key_vars, key(DT))) setkeyv(DT, cols = key_vars) - # Checks to see if a value in a vector is LVCF - is_lvcf <- function(vec) { + # Checks to see if a value in a vector is LOCF + is_locf <- function(vec) { dplyr::if_else(!is.na(vec) & !is.na(dplyr::lag(vec)), vec == dplyr::lag(vec), is.na(vec) & is.na(dplyr::lag(vec))) } - # LVCF is defined by a row where all values except for the version + # LOCF is defined by a row where all values except for the version # differ from their respective lag values - # Checks for LVCF's in a data frame - rm_lvcf <- function(df) { - dplyr::filter(df,if_any(c(everything(),-version),~ !is_lvcf(.))) + # Checks for LOCF's in a data frame + rm_locf <- function(df) { + dplyr::filter(df,if_any(c(everything(),-version),~ !is_locf(.))) } - # Keeps LVCF values, such as to be printed - keep_lvcf <- function(df) { - dplyr::filter(df,if_all(c(everything(),-version),~ is_lvcf(.))) + # Keeps LOCF values, such as to be printed + keep_locf <- function(df) { + dplyr::filter(df,if_all(c(everything(),-version),~ is_locf(.))) } # Runs compactify on data frame if (is.null(compactify) || compactify == TRUE) { - elim = keep_lvcf(DT) - DT = rm_lvcf(DT) + elim = keep_locf(DT) + DT = rm_locf(DT) } else { # Create empty data frame for nrow(elim) to be 0 elim = tibble::tibble() @@ -370,7 +370,7 @@ 
epi_archive = if (is.null(compactify) && nrow(elim) > 0) { warning_intro <- break_str(paste( 'Found rows that appear redundant based on', - 'last (version of an) observation carried forward;', + 'last (version of each) observation carried forward;', 'these rows have been removed to "compactify" and save space:' )) @@ -494,7 +494,7 @@ epi_archive = #' @importFrom data.table key setkeyv := address copy #' @importFrom rlang arg_match fill_through_version = function(fill_versions_end, - how=c("na", "lvcf")) { + how=c("na", "locf")) { validate_version_bound(fill_versions_end, self$DT, na_ok=FALSE) how <- arg_match(how) if (self$observed_versions_end < fill_versions_end) { @@ -532,8 +532,8 @@ epi_archive = # full result DT: setkeyv(rbind(self$DT, next_version_DT), key(self$DT))[] }, - "lvcf" = { - # just the old DT; LVCF is built into other methods: + "locf" = { + # just the old DT; LOCF is built into other methods: self$DT } ) @@ -557,7 +557,7 @@ epi_archive = #' @param y as in [`epix_merge`] #' @param observed_versions_end_conflict as in [`epix_merge`] #' @param compactify as in [`epix_merge`] - merge = function(y, observed_versions_end_conflict = c("stop","na","lvcf","truncate"), compactify=TRUE) { + merge = function(y, observed_versions_end_conflict = c("stop","na","locf","truncate"), compactify=TRUE) { result = epix_merge(self, y, observed_versions_end_conflict = observed_versions_end_conflict, compactify = compactify) @@ -751,8 +751,8 @@ epi_archive = #' @param compactify Optional; Boolean or `NULL`: should we remove rows that are #' considered redundant for the purposes of `epi_archive`'s built-in methods #' such as `as_of`? 
As these methods use the last version of each observation -#' carried forward (LVCF) to interpolate between the version data provided, -#' rows that don't change these LVCF results can potentially be omitted to +#' carried forward (LOCF) to interpolate between the version data provided, +#' rows that don't change these LOCF results can potentially be omitted to #' save space. `TRUE` will remove these rows, `FALSE` will not, and missing or #' `NULL` will remove these rows and issue a warning. Generally, this can be #' set to `TRUE`, but if you directly inspect or edit the fields of the diff --git a/R/methods-epi_archive.R b/R/methods-epi_archive.R index 1c256e42..659a98dc 100644 --- a/R/methods-epi_archive.R +++ b/R/methods-epi_archive.R @@ -41,13 +41,13 @@ #' max_version = as.Date("2020-06-12")) #' #' # When fetching a snapshot as of the latest version with update data in the -#' # archive, a warning is issued as this update data might not yet be finalized -#' # (for example, if data versions are labeled with dates, these versions might be -#' # overwritten throughout the day if the data can be updated multiple times per -#' # day; when we build an archive based on special update-data queries all made at -#' # the same time, the latest available update might still be subject to change, -#' # but previous versions should be finalized). We can muffle such warnings with -#' # the following pattern: +#' # archive, a warning is issued by default, as this update data might not yet +#' # be finalized (for example, if data versions are labeled with dates, these +#' # versions might be overwritten throughout the corresponding days with +#' # additional data or "hotfixes" of erroneous data; when we build an archive +#' # based on database queries, the latest available update might still be +#' # subject to change, but previous versions should be finalized). 
We can +#' # muffle such warnings with the following pattern: #' withCallingHandlers({ #' epix_as_of(x = archive_cases_dv_subset, #' max_version = max(archive_cases_dv_subset$DT$version)) @@ -80,17 +80,17 @@ epix_as_of = function(x, max_version, min_time_value = -Inf) { #' version through which to fill in missing version history; this will be the #' result's `$observed_versions_end` unless it already had a later #' `$observed_versions_end`. -#' @param how Optional; `"na"` or `"lvcf"`: `"na"` will fill in any missing +#' @param how Optional; `"na"` or `"locf"`: `"na"` will fill in any missing #' required version history with `NA`s, by inserting (if necessary) an update #' immediately after the current `$observed_versions_end` that revises all #' existing measurements to be `NA` (this is only supported for `version` -#' classes with a `next_after` implementation); `"lvcf"` will fill in missing +#' classes with a `next_after` implementation); `"locf"` will fill in missing #' version history with the last version of each observation carried forward -#' (LVCF), by leaving the update `$DT` alone (other `epi_archive` methods are -#' based on LVCF). Default is `"na"`. +#' (LOCF), by leaving the update `$DT` alone (other `epi_archive` methods are +#' based on LOCF). Default is `"na"`. #' @return An `epi_archive` epix_fill_through_version = function(x, fill_versions_end, - how=c("na", "lvcf")) { + how=c("na", "locf")) { if (!inherits(x, "epi_archive")) Abort("`x` must be of class `epi_archive`.") # Enclosing parentheses drop the invisibility flag. See description above of # potential mutation and aliasing behavior. @@ -115,17 +115,17 @@ epix_fill_through_version = function(x, fill_versions_end, #' old `DT` in another object). #' #' @param x,y Two `epi_archive` objects to join together. 
-#' @param observed_versions_end_conflict Optional; `"stop"`, `"na"`, `"lvcf"`, +#' @param observed_versions_end_conflict Optional; `"stop"`, `"na"`, `"locf"`, #' or `"truncate"`; in the case that `x$observed_versions_end` doesn't match #' `y$observed_versions_end`, what do we do?: `"stop"`: emit an error; "na": #' use `max(x$observed_versions_end, y$observed_versions_end)`, but in the #' less up-to-date input archive, imagine there was an update immediately #' after its last observed version which revised all observations to be `NA`; -#' `"lvcf"`: use `max(x$observed_versions_end, y$observed_versions_end)`, and -#' last-version-carried-forward extrapolation to invent update data for the -#' less up-to-date input archive; or `"truncate"`: use -#' `min(x$observed_versions_end, y$observed_versions_end)` and discard any -#' rows containing update rows for later versions. +#' `"locf"`: use `max(x$observed_versions_end, y$observed_versions_end)`, +#' allowing the last version of each observation to be carried forward to +#' extrapolate unavailable versions for the less up-to-date input archive; or +#' `"truncate"`: use `min(x$observed_versions_end, y$observed_versions_end)` +#' and discard any rows containing update rows for later versions. #' @param compactify Optional; `TRUE`, `FALSE`, or `NULL`; should the result be #' compactified? See [`as_epi_archive`] for an explanation of what this means. #' Default here is `TRUE`. 
@@ -151,7 +151,7 @@ epix_fill_through_version = function(x, fill_versions_end, #' @importFrom data.table key set #' @export epix_merge = function(x, y, - observed_versions_end_conflict = c("stop","na","lvcf","truncate"), + observed_versions_end_conflict = c("stop","na","locf","truncate"), compactify = TRUE) { if (!inherits(x, "epi_archive")) { Abort("`x` must be of class `epi_archive`.") @@ -205,7 +205,7 @@ epix_merge = function(x, y, x_DT = x$DT y_DT = y$DT } - } else if (observed_versions_end_conflict %in% c("na", "lvcf")) { + } else if (observed_versions_end_conflict %in% c("na", "locf")) { new_observed_versions_end = max(x$observed_versions_end, y$observed_versions_end) x_DT = epix_fill_through_version(x, new_observed_versions_end, observed_versions_end_conflict)$DT y_DT = epix_fill_through_version(y, new_observed_versions_end, observed_versions_end_conflict)$DT diff --git a/data-raw/archive_cases_dv_subset.R b/data-raw/archive_cases_dv_subset.R index becd9c57..7e4cc9dd 100644 --- a/data-raw/archive_cases_dv_subset.R +++ b/data-raw/archive_cases_dv_subset.R @@ -32,7 +32,7 @@ case_rate_subset <- covidcast( as_epi_archive(compactify=FALSE) archive_cases_dv_subset = epix_merge(dv_subset, case_rate_subset, - observed_versions_end_conflict="lvcf", + observed_versions_end_conflict="locf", compactify=FALSE) # If we directly store an epi_archive R6 object as data, it will store its class diff --git a/man/as_epi_archive.Rd b/man/as_epi_archive.Rd index ad00f531..c933a9f6 100644 --- a/man/as_epi_archive.Rd +++ b/man/as_epi_archive.Rd @@ -38,8 +38,8 @@ fields; named entries from the passed list or will be included as well.} \item{compactify}{Optional; Boolean or \code{NULL}: should we remove rows that are considered redundant for the purposes of \code{epi_archive}'s built-in methods such as \code{as_of}? 
As these methods use the last version of each observation -carried forward (LVCF) to interpolate between the version data provided, -rows that don't change these LVCF results can potentially be omitted to +carried forward (LOCF) to interpolate between the version data provided, +rows that don't change these LOCF results can potentially be omitted to save space. \code{TRUE} will remove these rows, \code{FALSE} will not, and missing or \code{NULL} will remove these rows and issue a warning. Generally, this can be set to \code{TRUE}, but if you directly inspect or edit the fields of the diff --git a/man/as_epi_df.Rd b/man/as_epi_df.Rd index 5d1b1335..b5df1302 100644 --- a/man/as_epi_df.Rd +++ b/man/as_epi_df.Rd @@ -51,9 +51,9 @@ examples. } \section{Methods (by class)}{ \itemize{ -\item \code{epi_df}: Simply returns the \code{epi_df} object unchanged. +\item \code{as_epi_df(epi_df)}: Simply returns the \code{epi_df} object unchanged. -\item \code{tbl_df}: The input tibble \code{x} must contain the columns +\item \code{as_epi_df(tbl_df)}: The input tibble \code{x} must contain the columns \code{geo_value} and \code{time_value}. All other columns will be preserved as is, and treated as measured variables. If \code{as_of} is missing, then the function will try to guess it from an \code{as_of}, \code{issue}, or \code{version} column of \code{x} @@ -61,14 +61,14 @@ will try to guess it from an \code{as_of}, \code{issue}, or \code{version} colum (stored in its attributes); if this fails, then the current day-time will be used. -\item \code{data.frame}: Works analogously to \code{as_epi_df.tbl_df()}. +\item \code{as_epi_df(data.frame)}: Works analogously to \code{as_epi_df.tbl_df()}. 
-\item \code{tbl_ts}: Works analogously to \code{as_epi_df.tbl_df()}, except that +\item \code{as_epi_df(tbl_ts)}: Works analogously to \code{as_epi_df.tbl_df()}, except that the \code{tbl_ts} class is dropped, and any key variables (other than "geo_value") are added to the metadata of the returned object, under the \code{other_keys} field. -}} +}} \examples{ # Convert a `tsibble` that has county code as an extra key # Notice that county code should be a character string to preserve any leading zeroes diff --git a/man/epi_archive.Rd b/man/epi_archive.Rd index c2e8981d..81f1444f 100644 --- a/man/epi_archive.Rd +++ b/man/epi_archive.Rd @@ -32,7 +32,7 @@ Note that there can only be a single row per unique combination of key variables, and thus the key variables are critical for figuring out how to generate a snapshot of data from the archive, as of a given version. -In general, the last version of each observation is carried forward (LVCF) to +In general, the last version of each observation is carried forward (LOCF) to fill in data between recorded versions, and between the last recorded update and the \code{observed_versions_end}. One consequence is that the \code{DT} doesn't have to contain a full snapshot of every version (although this @@ -166,8 +166,8 @@ fields; named entries from the passed list or will be included as well.} \item{\code{compactify}}{Optional; Boolean or \code{NULL}: should we remove rows that are considered redundant for the purposes of \code{epi_archive}'s built-in methods such as \code{as_of}? 
As these methods use the last version of each observation -carried forward (LVCF) to interpolate between the version data provided, -rows that don't change these LVCF results can potentially be omitted to +carried forward (LOCF) to interpolate between the version data provided, +rows that don't change these LOCF results can potentially be omitted to save space while maintaining the same behavior (with the help of the \code{clobberable_versions_start} and \code{observed_versions_end} fields in some edge cases). \code{TRUE} will remove these rows, \code{FALSE} will not, and missing @@ -223,7 +223,7 @@ Fill in unobserved history using requested scheme by mutating \code{\link{epix_fill_through_version}} for a full description of the non-R6-method version, which doesn't mutate the input archive but might alias its fields. \subsection{Usage}{ -\if{html}{\out{
}}\preformatted{epi_archive$fill_through_version(fill_versions_end, how = c("na", "lvcf"))}\if{html}{\out{
}} +\if{html}{\out{
}}\preformatted{epi_archive$fill_through_version(fill_versions_end, how = c("na", "locf"))}\if{html}{\out{
}} } \subsection{Arguments}{ @@ -249,7 +249,7 @@ does not alias either archive's \code{DT}. \subsection{Usage}{ \if{html}{\out{
}}\preformatted{epi_archive$merge( y, - observed_versions_end_conflict = c("stop", "na", "lvcf", "truncate"), + observed_versions_end_conflict = c("stop", "na", "locf", "truncate"), compactify = TRUE )}\if{html}{\out{
}} } +\if{html}{\out{
}}\preformatted{epi_archive$merge( y, - observed_versions_end_conflict = c("stop", "na", "lvcf", "truncate"), + observed_versions_end_conflict = c("stop", "na", "locf", "truncate"), compactify = TRUE )}\if{html}{\out{
We can +# muffle such warnings with the following pattern: withCallingHandlers({ epix_as_of(x = archive_cases_dv_subset, max_version = max(archive_cases_dv_subset$DT$version)) diff --git a/man/epix_fill_through_version.Rd b/man/epix_fill_through_version.Rd index 929dfda5..aa8109fb 100644 --- a/man/epix_fill_through_version.Rd +++ b/man/epix_fill_through_version.Rd @@ -4,7 +4,7 @@ \alias{epix_fill_through_version} \title{\code{epi_archive} with unobserved history filled in (won't mutate, might alias)} \usage{ -epix_fill_through_version(x, fill_versions_end, how = c("na", "lvcf")) +epix_fill_through_version(x, fill_versions_end, how = c("na", "locf")) } \arguments{ \item{x}{An \code{epi_archive}} @@ -14,14 +14,14 @@ version through which to fill in missing version history; this will be the result's \verb{$observed_versions_end} unless it already had a later \verb{$observed_versions_end}.} -\item{how}{Optional; \code{"na"} or \code{"lvcf"}: \code{"na"} will fill in any missing +\item{how}{Optional; \code{"na"} or \code{"locf"}: \code{"na"} will fill in any missing required version history with \code{NA}s, by inserting (if necessary) an update immediately after the current \verb{$observed_versions_end} that revises all existing measurements to be \code{NA} (this is only supported for \code{version} -classes with a \code{next_after} implementation); \code{"lvcf"} will fill in missing +classes with a \code{next_after} implementation); \code{"locf"} will fill in missing version history with the last version of each observation carried forward -(LVCF), by leaving the update \verb{$DT} alone (other \code{epi_archive} methods are -based on LVCF). Default is \code{"na"}.} +(LOCF), by leaving the update \verb{$DT} alone (other \code{epi_archive} methods are +based on LOCF). 
Default is \code{"na"}.} } \value{ An \code{epi_archive} diff --git a/man/epix_merge.Rd b/man/epix_merge.Rd index 44f7559d..9bd6a75e 100644 --- a/man/epix_merge.Rd +++ b/man/epix_merge.Rd @@ -7,24 +7,24 @@ epix_merge( x, y, - observed_versions_end_conflict = c("stop", "na", "lvcf", "truncate"), + observed_versions_end_conflict = c("stop", "na", "locf", "truncate"), compactify = TRUE ) } \arguments{ \item{x, y}{Two \code{epi_archive} objects to join together.} -\item{observed_versions_end_conflict}{Optional; \code{"stop"}, \code{"na"}, \code{"lvcf"}, +\item{observed_versions_end_conflict}{Optional; \code{"stop"}, \code{"na"}, \code{"locf"}, or \code{"truncate"}; in the case that \code{x$observed_versions_end} doesn't match \code{y$observed_versions_end}, what do we do?: \code{"stop"}: emit an error; "na": use \code{max(x$observed_versions_end, y$observed_versions_end)}, but in the less up-to-date input archive, imagine there was an update immediately after its last observed version which revised all observations to be \code{NA}; -\code{"lvcf"}: use \code{max(x$observed_versions_end, y$observed_versions_end)}, and -last-version-carried-forward extrapolation to invent update data for the -less up-to-date input archive; or \code{"truncate"}: use -\code{min(x$observed_versions_end, y$observed_versions_end)} and discard any -rows containing update rows for later versions.} +\code{"locf"}: use \code{max(x$observed_versions_end, y$observed_versions_end)}, +allowing the last version of each observation to be carried forward to +extrapolate unavailable versions for the less up-to-date input archive; or +\code{"truncate"}: use \code{min(x$observed_versions_end, y$observed_versions_end)} +and discard any rows containing update rows for later versions.} \item{compactify}{Optional; \code{TRUE}, \code{FALSE}, or \code{NULL}; should the result be compactified? See \code{\link{as_epi_archive}} for an explanation of what this means. 
diff --git a/tests/testthat/test-epix_fill_through_version.R b/tests/testthat/test-epix_fill_through_version.R index 92c4cbbd..1777607c 100644 --- a/tests/testthat/test-epix_fill_through_version.R +++ b/tests/testthat/test-epix_fill_through_version.R @@ -5,7 +5,7 @@ test_that("epix_fill_through_version mirrors input when it is sufficiently up to some_earlier_observed_version = 2L ea_trivial_fill_na1 = epix_fill_through_version(ea_orig, some_earlier_observed_version, "na") ea_trivial_fill_na2 = epix_fill_through_version(ea_orig, ea_orig$observed_versions_end, "na") - ea_trivial_fill_lvcf = epix_fill_through_version(ea_orig, some_earlier_observed_version, "lvcf") + ea_trivial_fill_locf = epix_fill_through_version(ea_orig, some_earlier_observed_version, "locf") # Below, we want R6 objects to be compared based on contents rather than # addresses. We appear to get this with `expect_identical` in `testthat` # edition 3, which is based on `waldo::compare` rather than `base::identical`; @@ -16,7 +16,7 @@ test_that("epix_fill_through_version mirrors input when it is sufficiently up to local_edition(3) expect_identical(ea_orig, ea_trivial_fill_na1) expect_identical(ea_orig, ea_trivial_fill_na2) - expect_identical(ea_orig, ea_trivial_fill_lvcf) + expect_identical(ea_orig, ea_trivial_fill_locf) }) test_that("epix_fill_through_version can extend observed versions, gives expected `as_of`s", { @@ -28,7 +28,7 @@ test_that("epix_fill_through_version can extend observed versions, gives expecte first_unobserved_version = 6L later_unobserved_version = 10L ea_fill_na = epix_fill_through_version(ea_orig, later_unobserved_version, "na") - ea_fill_lvcf = epix_fill_through_version(ea_orig, later_unobserved_version, "lvcf") + ea_fill_locf = epix_fill_through_version(ea_orig, later_unobserved_version, "locf") # We use edition 3 features here, passing `ignore_attr` to `waldo::compare`. 
# Ensure we are using edition 3: @@ -38,9 +38,9 @@ test_that("epix_fill_through_version can extend observed versions, gives expecte expect_identical(tibble::as_tibble(ea_fill_na$as_of(first_unobserved_version)), tibble::tibble(geo_value="g1", time_value=as.Date("2020-01-01")+0:1, value=rep(NA_integer_, 2L)), ignore_attr = TRUE) - expect_identical(ea_fill_lvcf$observed_versions_end, later_unobserved_version) - expect_identical(ea_fill_lvcf$as_of(first_unobserved_version), - ea_fill_lvcf$as_of(ea_orig$observed_versions_end) %>% + expect_identical(ea_fill_locf$observed_versions_end, later_unobserved_version) + expect_identical(ea_fill_locf$as_of(first_unobserved_version), + ea_fill_locf$as_of(ea_orig$observed_versions_end) %>% {attr(., "metadata")$as_of <- first_unobserved_version; .}) }, epiprocess__snapshot_as_of_clobberable_version = function(wrn) invokeRestart("muffleWarning")) }) @@ -69,7 +69,7 @@ test_that("epix_fill_through_version does not mutate x", { expect_true(identical(ea_orig_before_as_list, ea_orig_after_as_list)) expect_identical(ea_orig_DT_before_copy, ea_orig$DT) # - ea_fill_lvcf = epix_fill_through_version(ea_orig, some_unobserved_version, "lvcf") + ea_fill_locf = epix_fill_through_version(ea_orig, some_unobserved_version, "locf") ea_orig_after_as_list = as.list(ea_orig) expect_true(identical(ea_orig_before_as_list, ea_orig_after_as_list)) expect_identical(ea_orig_DT_before_copy, ea_orig$DT) @@ -101,6 +101,6 @@ test_that("epix_fill_through_version returns same key & doesn't mutate old DT or old_DT_copy = data.table::copy(old_DT) old_key = data.table::key(ea$DT) expect_identical(data.table::key(epix_fill_through_version(ea, 5L, "na")$DT), old_key) - expect_identical(data.table::key(epix_fill_through_version(ea, 5L, "lvcf")$DT), old_key) + expect_identical(data.table::key(epix_fill_through_version(ea, 5L, "locf")$DT), old_key) expect_identical(data.table::key(ea$DT), old_key) }) diff --git a/tests/testthat/test-epix_merge.R 
b/tests/testthat/test-epix_merge.R index 527c808d..281234f3 100644 --- a/tests/testthat/test-epix_merge.R +++ b/tests/testthat/test-epix_merge.R @@ -17,7 +17,11 @@ test_that("epix_merge merges and carries forward updates properly", { "g1", 3L, 2L, paste0("XC", 2L), # x has 1 version, y has 0 "g1", 4L, 1L, paste0("XD", 1L), - # values that should be LVCF'd + NAs that should be LVCF's as NA + # non-NA values that should be carried forward + # (version-wise LOCF) in other versions, plus NAs that + # should (similarly) be carried forward as NA (latter + # wouldn't work with an ordinary merge + post-processing + # with `data.table::nafill`) "g1", 6L, c(1L,3L,5L), paste0("XE", c(1L, NA, 5L)) ) %>% tidyr::unchop(c(version, x_value)) %>% @@ -137,13 +141,13 @@ local({ ), clobberable_versions_start=1L) ) }) - test_that('epix_merge observed_versions_end_conflict="lvcf" works', { + test_that('epix_merge observed_versions_end_conflict="locf" works', { expect_equal( - epix_merge(x,y, observed_versions_end_conflict = "lvcf"), + epix_merge(x,y, observed_versions_end_conflict = "locf"), as_epi_archive(tibble::tribble( ~geo_value, ~time_value, ~version, ~x_value, ~y_value, 1L, 1L, 1L, 10L, NA_integer_, # x updated, y not observed yet - 1L, 1L, 5L, 10L, 20L, # x LVCF'd, y updated + 1L, 1L, 5L, 10L, 20L, # x LOCF'd, y updated ), clobberable_versions_start=1L) ) }) @@ -168,9 +172,9 @@ local({ xy_no_conflict_expected ) }) - test_that('epix_merge observed_versions_end_conflict="lvcf" on no-conflict works', { + test_that('epix_merge observed_versions_end_conflict="locf" on no-conflict works', { expect_equal( - epix_merge(x_no_conflict, y_no_conflict, observed_versions_end_conflict = "lvcf"), + epix_merge(x_no_conflict, y_no_conflict, observed_versions_end_conflict = "locf"), xy_no_conflict_expected ) }) diff --git a/vignettes/aggregation.Rmd b/vignettes/aggregation.Rmd index f1d32616..617f0983 100644 --- a/vignettes/aggregation.Rmd +++ b/vignettes/aggregation.Rmd @@ -153,8 +153,8 @@ 
explicit value. The default is `NA`, but in the current case, where missingness is not at random but rather represents a small value that was censored (only a hypothetical with COVID-19 reports, but certainly a real phenomenon that occurs in other signals), it is better to replace it by zero, which is what we do here. -(Other approaches, such as LOCF: last-observation-carried-forward, could be -accomplished by first filling with `NA` values and then following up with a +(Other approaches, such as LOCF: last observation carried forward in time, could +be accomplished by first filling with `NA` values and then following up with a second call to `tidyr::fill()`.) ```{r} diff --git a/vignettes/archive.Rmd b/vignettes/archive.Rmd index f497bc63..0730b85a 100644 --- a/vignettes/archive.Rmd +++ b/vignettes/archive.Rmd @@ -108,7 +108,7 @@ snapshot of data from the archive, as of a given version (also described below). key(x$DT) ``` -In general, the last version of each observation is carried forward (LVCF) to +In general, the last version of each observation is carried forward (LOCF) to fill in data between recorded versions. **A word of caution:** R6 objects, unlike most other objects in R, have reference semantics. An important consequence of this is that objects are not copied when modified. diff --git a/vignettes/compactify.Rmd b/vignettes/compactify.Rmd index 871cc3bc..746262a2 100644 --- a/vignettes/compactify.Rmd +++ b/vignettes/compactify.Rmd @@ -9,23 +9,23 @@ vignette: > ## Removing redundant update data to save space -We do not need to store version update rows that look like the last version -carried forward (LVCF) for use with `epiprocess`'s' `epi_archive`-related -functions, as they all apply LVCF to fill in data between explicit updates. By -default, we even detect and remove these LVCF-redundant rows to save space; this -should not impact results as long as you do not directly work with the archive's -`DT` field in a way that expects these rows to remain. 
+We do not need to store version update rows that look like the last version of +the corresponding observations carried forward (LOCF) for use with +`epiprocess`'s' `epi_archive`-related functions, as they all apply LOCF to fill +in data between explicit updates. By default, we even detect and remove these +LOCF-redundant rows to save space; this should not impact results as long as you +do not directly work with the archive's `DT` field in a way that expects these +rows to remain. -There are three -different values that can be assigned to `compactify`: +There are three different values that can be assigned to `compactify`: -* No argument: if there are LVCF-redundant rows, removes them and issues a +* No argument: if there are LOCF-redundant rows, removes them and issues a warning with some information about what rows were removed -* `TRUE`: removes any LVCF-redundant rows without any warning or other feedback -* `FALSE`: keeps any LVCF-redundant rows without any warning or other feedback +* `TRUE`: removes any LOCF-redundant rows without any warning or other feedback +* `FALSE`: keeps any LOCF-redundant rows without any warning or other feedback -For this example, we have one chart using LVCF values, while another doesn't -use them to illustrate LVCF. Notice how the head of the first dataset differs +For this example, we have one chart using LOCF values, while another doesn't +use them to illustrate LOCF. Notice how the head of the first dataset differs from the second from the third value included. ```{r} @@ -34,37 +34,37 @@ library(dplyr) dt <- archive_cases_dv_subset$DT -lvcf_omitted <- as_epi_archive(dt) -lvcf_included <- as_epi_archive(dt,compactify = FALSE) +locf_omitted <- as_epi_archive(dt) +locf_included <- as_epi_archive(dt,compactify = FALSE) -head(lvcf_omitted$DT) -head(lvcf_included$DT) +head(locf_omitted$DT) +head(locf_included$DT) ``` -LVCF-redundant values can mar the performance of dataset operations. 
As the column -`case_rate_7d_av` has many more LVCF-redundant values than `percent_cli`, we will omit the +LOCF-redundant values can mar the performance of dataset operations. As the column +`case_rate_7d_av` has many more LOCF-redundant values than `percent_cli`, we will omit the `percent_cli` column for comparing performance. ```{r} dt2 <- select(dt,-percent_cli) -lvcf_included_2 <- as_epi_archive(dt2,compactify=FALSE) -lvcf_omitted_2 <- as_epi_archive(dt2,compactify=TRUE) +locf_included_2 <- as_epi_archive(dt2,compactify=FALSE) +locf_omitted_2 <- as_epi_archive(dt2,compactify=TRUE) ``` In this example, a huge proportion of the original version update data were -LVCF-redundant, and compactifying saves a large amount of space. The proportion -of data that is LVCF-redundant can vary widely between data sets, so we won't +LOCF-redundant, and compactifying saves a large amount of space. The proportion +of data that is LOCF-redundant can vary widely between data sets, so we won't always be this lucky. ```{r} -nrow(lvcf_included_2$DT) -nrow(lvcf_omitted_2$DT) +nrow(locf_included_2$DT) +nrow(locf_omitted_2$DT) ``` As we would expect, performing 200 iterations of `dplyr::filter` is faster when -the LVCF values are omitted. +the LOCF values are omitted. 
```{r} # Performance of filtering @@ -79,8 +79,8 @@ elapsed_time <- function(fx) c(system.time(fx))[[3]] speed_test <- function(f,name) { data.frame( operation = name, - lvcf = elapsed_time(f(lvcf_included_2)), - no_lvcf = elapsed_time(f(lvcf_omitted_2)) + locf = elapsed_time(f(locf_included_2)), + no_locf = elapsed_time(f(locf_omitted_2)) ) } @@ -110,10 +110,10 @@ speeds <- rbind(speeds, speed_test(slide_median,"slide_median")) Here is a detailed performance comparison: ```{r} -speeds_tidy <- tidyr::gather(speeds,key="is_lvcf",value="time_in_s",lvcf,no_lvcf) +speeds_tidy <- tidyr::gather(speeds,key="is_locf",value="time_in_s",locf,no_locf) library(ggplot2) ggplot(speeds_tidy) + - geom_bar(aes(x=is_lvcf,y=time_in_s,fill=operation),stat = "identity") + geom_bar(aes(x=is_locf,y=time_in_s,fill=operation),stat = "identity") ``` From 155ea7344638d17f09e379e92c30eb5380b9a817 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Wed, 27 Jul 2022 04:45:33 -0700 Subject: [PATCH 88/96] Rename `observed_versions_end[_conflict]` -> `versions_end[_conflict]` --- R/archive.R | 72 +++++++++---------- R/methods-epi_archive.R | 58 +++++++-------- data-raw/archive_cases_dv_subset.R | 2 +- man/as_epi_archive.Rd | 6 +- man/epi_archive.Rd | 12 ++-- man/epix_fill_through_version.Rd | 6 +- man/epix_merge.Rd | 20 +++--- tests/testthat/test-archive-version-bounds.R | 16 ++--- tests/testthat/test-compactify.R | 2 +- .../testthat/test-epix_fill_through_version.R | 8 +-- tests/testthat/test-epix_merge.R | 44 ++++++------ vignettes/archive.Rmd | 2 +- 12 files changed, 124 insertions(+), 124 deletions(-) diff --git a/R/archive.R b/R/archive.R index 783d3bb1..129c1e21 100644 --- a/R/archive.R +++ b/R/archive.R @@ -8,7 +8,7 @@ #' Validate a version bound arg #' -#' Expected to be used on `clobberable_versions_start`, `observed_versions_end`, +#' Expected to be used on `clobberable_versions_start`, `versions_end`, #' and similar arguments. Some additional context-specific checks may be needed. 
#' #' @param version_bound the version bound to validate @@ -73,7 +73,7 @@ validate_version_bound = function(version_bound, x, na_ok, #' @export max_version_with_row_in = function(x) { if (nrow(x) == 0L) { - Abort(sprintf("`nrow(x)==0L`, representing a data set history with no row up through the latest observed version, but we don't have a sensible guess at what version that is, or whether any of the empty versions might be clobbered in the future; if we use `x` to form an `epi_archive`, then `clobberable_versions_start` and `observed_versions_end` must be manually specified."), + Abort(sprintf("`nrow(x)==0L`, representing a data set history with no row up through the latest observed version, but we don't have a sensible guess at what version that is, or whether any of the empty versions might be clobbered in the future; if we use `x` to form an `epi_archive`, then `clobberable_versions_start` and `versions_end` must be manually specified."), class="epiprocess__max_version_cannot_be_used") } else { version_col = purrr::pluck(x, "version") # error not NULL if doesn't exist @@ -130,7 +130,7 @@ next_after.Date = function(x) x + 1L #' #' In general, the last version of each observation is carried forward (LOCF) to #' fill in data between recorded versions, and between the last recorded -#' update and the `observed_versions_end`. One consequence is that the `DT` +#' update and the `versions_end`. One consequence is that the `DT` #' doesn't have to contain a full snapshot of every version (although this #' generally works), but can instead contain only the rows that are new or #' changed from the previous version (see `compactify`, which does this @@ -211,7 +211,7 @@ epi_archive = time_type = NULL, additional_metadata = NULL, clobberable_versions_start = NULL, - observed_versions_end = NULL, + versions_end = NULL, #' @description Creates a new `epi_archive` object. 
#' @param x A data frame, data table, or tibble, with columns `geo_value`, #' `time_value`, `version`, and then any additional number of columns. @@ -233,7 +233,7 @@ epi_archive = #' carried forward (LOCF) to interpolate between the version data provided, #' rows that don't change these LOCF results can potentially be omitted to #' save space while maintaining the same behavior (with the help of the -#' `clobberable_versions_start` and `observed_versions_end` fields in some +#' `clobberable_versions_start` and `versions_end` fields in some #' edge cases). `TRUE` will remove these rows, `FALSE` will not, and missing #' or `NULL` will remove these rows and issue a warning. Generally, this can #' be set to `TRUE`, but if you directly inspect or edit the fields of the @@ -243,7 +243,7 @@ epi_archive = #' potential for space, time, or bandwidth savings upstream the data pipeline, #' e.g., when fetching, storing, or preparing the input data `x` #' @param clobberable_versions_start Optional; as in [`as_epi_archive`] -#' @param observed_versions_end Optiona; as in [`as_epi_archive`] +#' @param versions_end Optional; as in [`as_epi_archive`] #' @return An `epi_archive` object. #' @importFrom data.table as.data.table key setkeyv #' @@ -252,7 +252,7 @@ epi_archive = #' and examples of parameter names. 
initialize = function(x, geo_type, time_type, other_keys, additional_metadata, compactify, - clobberable_versions_start, observed_versions_end) { + clobberable_versions_start, versions_end) { # Check that we have a data frame if (!is.data.frame(x)) { Abort("`x` must be a data frame.") @@ -306,26 +306,26 @@ epi_archive = } # Apply defaults and conduct checks and apply defaults for - # `clobberable_versions_start`, `observed_versions_end`: + # `clobberable_versions_start`, `versions_end`: if (missing(clobberable_versions_start)) { clobberable_versions_start <- max_version_with_row_in(x) } - if (missing(observed_versions_end)) { - observed_versions_end <- max_version_with_row_in(x) + if (missing(versions_end)) { + versions_end <- max_version_with_row_in(x) } validate_version_bound(clobberable_versions_start, x, na_ok=TRUE) - validate_version_bound(observed_versions_end, x, na_ok=FALSE) - if (nrow(x) > 0L && observed_versions_end < max(x[["version"]])) { - Abort(sprintf("`observed_versions_end` was %s, but `x` contained + validate_version_bound(versions_end, x, na_ok=FALSE) + if (nrow(x) > 0L && versions_end < max(x[["version"]])) { + Abort(sprintf("`versions_end` was %s, but `x` contained updates for a later version or versions, up through %s", - observed_versions_end, max(x[["version"]])), - class="epiprocess__observed_versions_end_earlier_than_updates") + versions_end, max(x[["version"]])), + class="epiprocess__versions_end_earlier_than_updates") } - if (!is.na(clobberable_versions_start) && clobberable_versions_start > observed_versions_end) { - Abort(sprintf("`observed_versions_end` was %s, but a `clobberable_versions_start` + if (!is.na(clobberable_versions_start) && clobberable_versions_start > versions_end) { + Abort(sprintf("`versions_end` was %s, but a `clobberable_versions_start` of %s indicated that there were later observed versions", - observed_versions_end, clobberable_versions_start), - 
class="epiprocess__observed_versions_end_earlier_than_clobberable_versions_start") + versions_end, clobberable_versions_start), + class="epiprocess__versions_end_earlier_than_clobberable_versions_start") } # --- End of validation and replacing missing args with defaults --- @@ -395,7 +395,7 @@ epi_archive = self$time_type = time_type self$additional_metadata = additional_metadata self$clobberable_versions_start = clobberable_versions_start - self$observed_versions_end = observed_versions_end + self$versions_end = versions_end }, print = function() { cat("An `epi_archive` object, with metadata:\n") @@ -421,8 +421,8 @@ epi_archive = cat(sprintf("* %-14s = %s\n", "clobberable versions start", self$clobberable_versions_start)) } - cat(sprintf("* %-14s = %s\n", "observed versions end", - self$observed_versions_end)) + cat(sprintf("* %-14s = %s\n", "versions end", + self$versions_end)) cat("----------\n") cat(sprintf("Data archive (stored in DT field): %i x %i\n", nrow(self$DT), ncol(self$DT))) @@ -458,8 +458,8 @@ epi_archive = if (is.na(max_version)) { Abort("`max_version` must not be NA.") } - if (max_version > self$observed_versions_end) { - Abort("`max_version` must be at most `self$observed_versions_end`.") + if (max_version > self$versions_end) { + Abort("`max_version` must be at most `self$versions_end`.") } if (!is.na(self$clobberable_versions_start) && max_version >= self$clobberable_versions_start) { Warn('Getting data as of some "clobberable" version that might be hotfixed, synced, or otherwise replaced later with different data using the same version tag. Thus, the snapshot that we produce here might not be reproducible later. 
See `?epi_archive` for more info and `?epix_as_of` on how to muffle.', @@ -497,7 +497,7 @@ epi_archive = how=c("na", "locf")) { validate_version_bound(fill_versions_end, self$DT, na_ok=FALSE) how <- arg_match(how) - if (self$observed_versions_end < fill_versions_end) { + if (self$versions_end < fill_versions_end) { new_DT = switch( how, "na" = { @@ -507,7 +507,7 @@ epi_archive = # added if `self` is outdated. nonversion_key_cols = setdiff(key(self$DT), "version") nonkey_cols = setdiff(names(self$DT), key(self$DT)) - next_version_tag = next_after(self$observed_versions_end) + next_version_tag = next_after(self$versions_end) if (next_version_tag > fill_versions_end) { Abort(sprintf(paste( "Apparent problem with `next_after` implementation:", @@ -515,7 +515,7 @@ epi_archive = "and the next possible version was supposed to be %s,", "but this appeared to jump from a version < %3$s", "to one > %3$s, implying at least one version in between." - ), self$observed_versions_end, next_version_tag, fill_versions_end)) + ), self$versions_end, next_version_tag, fill_versions_end)) } nonversion_key_vals_ever_recorded = unique(self$DT, by=nonversion_key_cols) # In edge cases, the `unique` result can alias the original @@ -537,11 +537,11 @@ epi_archive = self$DT } ) - new_observed_versions_end = fill_versions_end + new_versions_end = fill_versions_end # Update `self` all at once with simple, error-free operations + # return below: self$DT <- new_DT - self$observed_versions_end <- new_observed_versions_end + self$versions_end <- new_versions_end } else { # Already sufficiently up to date; nothing to do. } @@ -555,11 +555,11 @@ epi_archive = #' of the non-R6-method version, which does not mutate either archive, and #' does not alias either archive's `DT`. 
#' @param y as in [`epix_merge`] -#' @param observed_versions_end_conflict as in [`epix_merge`] +#' @param versions_end_conflict as in [`epix_merge`] #' @param compactify as in [`epix_merge`] - merge = function(y, observed_versions_end_conflict = c("stop","na","locf","truncate"), compactify=TRUE) { + merge = function(y, versions_end_conflict = c("stop","na","locf","truncate"), compactify=TRUE) { result = epix_merge(self, y, - observed_versions_end_conflict = observed_versions_end_conflict, + versions_end_conflict = versions_end_conflict, compactify = compactify) if (length(epi_archive$private_fields) != 0L) { @@ -775,7 +775,7 @@ epi_archive = #' redundant) is present with version `ver`, then all previous versions must #' be finalized and non-clobberable, although `ver` (and onward) might still #' be modified, (ii) even if we have "observed" empty updates for some -#' versions beyond `max(x$version)` (as indicated by `observed_versions_end`; +#' versions beyond `max(x$version)` (as indicated by `versions_end`; #' see below), we can't assume `max(x$version)` has been finalized, because we #' might see a nonfinalized version + empty subsequent versions due to #' upstream database replication delays in combination with the upstream @@ -784,7 +784,7 @@ epi_archive = #' `compactify` are not redundant, and actually come from an explicit version #' release that indicates that preceding versions are finalized. If `nrow(x) #' == 0`, then this argument is mandatory. -#' @param observed_versions_end Optional; length-1, same `class` and `typeof` as +#' @param versions_end Optional; length-1, same `class` and `typeof` as #' `x$version`: what is the last version we have observed? 
The default is #' `max_version_with_row_in(x)`, but values greater than this could also be #' valid, and would indicate that we observed additional versions of the data @@ -841,9 +841,9 @@ as_epi_archive = function(x, geo_type, time_type, other_keys, additional_metadata = list(), compactify = NULL, clobberable_versions_start = max_version_with_row_in(x), - observed_versions_end = max_version_with_row_in(x)) { + versions_end = max_version_with_row_in(x)) { epi_archive$new(x, geo_type, time_type, other_keys, additional_metadata, - compactify, clobberable_versions_start, observed_versions_end) + compactify, clobberable_versions_start, versions_end) } #' Test for `epi_archive` format diff --git a/R/methods-epi_archive.R b/R/methods-epi_archive.R index 659a98dc..a22406a5 100644 --- a/R/methods-epi_archive.R +++ b/R/methods-epi_archive.R @@ -78,11 +78,11 @@ epix_as_of = function(x, max_version, min_time_value = -Inf) { #' @param x An `epi_archive` #' @param fill_versions_end Length-1, same class&type as `%s$version`: the #' version through which to fill in missing version history; this will be the -#' result's `$observed_versions_end` unless it already had a later -#' `$observed_versions_end`. +#' result's `$versions_end` unless it already had a later +#' `$versions_end`. 
#' @param how Optional; `"na"` or `"locf"`: `"na"` will fill in any missing #' required version history with `NA`s, by inserting (if necessary) an update -#' immediately after the current `$observed_versions_end` that revises all +#' immediately after the current `$versions_end` that revises all #' existing measurements to be `NA` (this is only supported for `version` #' classes with a `next_after` implementation); `"locf"` will fill in missing #' version history with the last version of each observation carried forward @@ -100,12 +100,12 @@ epix_fill_through_version = function(x, fill_versions_end, #' Merge two `epi_archive` objects #' #' Merges two `epi_archive`s that share a common `geo_value`, `time_value`, and -#' set of key columns. When they also share a common `observed_versions_end`, +#' set of key columns. When they also share a common `versions_end`, #' using `$as_of` on the result should be the same as using `$as_of` on `x` and #' `y` individually, then performing a full join of the `DT`s on the non-version #' key columns (potentially consolidating multiple warnings about clobberable -#' versions). If the `observed_versions_end` values differ, the -#' `observed_versions_end_conflict` parameter controls what is done. +#' versions). If the `versions_end` values differ, the +#' `versions_end_conflict` parameter controls what is done. #' #' This function, [`epix_merge`], does not mutate its inputs and will not alias #' either archive's `DT`, but may alias other fields; `x$merge` will overwrite @@ -115,16 +115,16 @@ epix_fill_through_version = function(x, fill_versions_end, #' old `DT` in another object). #' #' @param x,y Two `epi_archive` objects to join together. 
-#' @param observed_versions_end_conflict Optional; `"stop"`, `"na"`, `"locf"`, -#' or `"truncate"`; in the case that `x$observed_versions_end` doesn't match -#' `y$observed_versions_end`, what do we do?: `"stop"`: emit an error; "na": -#' use `max(x$observed_versions_end, y$observed_versions_end)`, but in the +#' @param versions_end_conflict Optional; `"stop"`, `"na"`, `"locf"`, +#' or `"truncate"`; in the case that `x$versions_end` doesn't match +#' `y$versions_end`, what do we do?: `"stop"`: emit an error; "na": +#' use `max(x$versions_end, y$versions_end)`, but in the #' less up-to-date input archive, imagine there was an update immediately #' after its last observed version which revised all observations to be `NA`; -#' `"locf"`: use `max(x$observed_versions_end, y$observed_versions_end)`, +#' `"locf"`: use `max(x$versions_end, y$versions_end)`, #' allowing the last version of each observation to be carried forward to #' extrapolate unavailable versions for the less up-to-date input archive; or -#' `"truncate"`: use `min(x$observed_versions_end, y$observed_versions_end)` +#' `"truncate"`: use `min(x$versions_end, y$versions_end)` #' and discard any rows containing update rows for later versions. #' @param compactify Optional; `TRUE`, `FALSE`, or `NULL`; should the result be #' compactified? See [`as_epi_archive`] for an explanation of what this means. 
@@ -151,7 +151,7 @@ epix_fill_through_version = function(x, fill_versions_end, #' @importFrom data.table key set #' @export epix_merge = function(x, y, - observed_versions_end_conflict = c("stop","na","locf","truncate"), + versions_end_conflict = c("stop","na","locf","truncate"), compactify = TRUE) { if (!inherits(x, "epi_archive")) { Abort("`x` must be of class `epi_archive`.") @@ -161,7 +161,7 @@ epix_merge = function(x, y, Abort("`y` must be of class `epi_archive`.") } - observed_versions_end_conflict <- rlang::arg_match(observed_versions_end_conflict) + versions_end_conflict <- rlang::arg_match(versions_end_conflict) if (!identical(x$geo_type, y$geo_type)) { Abort("`x` and `y` must have the same `$geo_type`") @@ -192,27 +192,27 @@ epix_merge = function(x, y, # preprocessing using non-mutating (but potentially aliasing) functions. This # approach potentially uses more memory, but won't leave behind a # partially-mutated `x` on failure. - if (observed_versions_end_conflict == "stop") { - if (!identical(x$observed_versions_end, y$observed_versions_end)) { + if (versions_end_conflict == "stop") { + if (!identical(x$versions_end, y$versions_end)) { Abort(paste( "`x` and `y` were not equally up to date version-wise:", - "`x$observed_versions_end` was not identical to `y$observed_versions_end`;", + "`x$versions_end` was not identical to `y$versions_end`;", "either ensure that `x` and `y` are equally up to date before merging,", - "or specify how to deal with this using `observed_versions_end_conflict`" - ), class="epiprocess__epix_merge_unresolved_observed_versions_end_conflict") + "or specify how to deal with this using `versions_end_conflict`" + ), class="epiprocess__epix_merge_unresolved_versions_end_conflict") } else { - new_observed_versions_end = x$observed_versions_end + new_versions_end = x$versions_end x_DT = x$DT y_DT = y$DT } - } else if (observed_versions_end_conflict %in% c("na", "locf")) { - new_observed_versions_end = max(x$observed_versions_end, 
y$observed_versions_end) - x_DT = epix_fill_through_version(x, new_observed_versions_end, observed_versions_end_conflict)$DT - y_DT = epix_fill_through_version(y, new_observed_versions_end, observed_versions_end_conflict)$DT - } else if (observed_versions_end_conflict == "truncate") { - new_observed_versions_end = min(x$observed_versions_end, y$observed_versions_end) - x_DT = x$DT[x[["DT"]][["version"]] <= new_observed_versions_end, with=FALSE] - y_DT = y$DT[y[["DT"]][["version"]] <= new_observed_versions_end, with=FALSE] + } else if (versions_end_conflict %in% c("na", "locf")) { + new_versions_end = max(x$versions_end, y$versions_end) + x_DT = epix_fill_through_version(x, new_versions_end, versions_end_conflict)$DT + y_DT = epix_fill_through_version(y, new_versions_end, versions_end_conflict)$DT + } else if (versions_end_conflict == "truncate") { + new_versions_end = min(x$versions_end, y$versions_end) + x_DT = x$DT[x[["DT"]][["version"]] <= new_versions_end, with=FALSE] + y_DT = y$DT[y[["DT"]][["version"]] <= new_versions_end, with=FALSE] } else Abort("unimplemented") if (!identical(key(x$DT), key(x_DT)) || !identical(key(y$DT), key(y_DT))) { @@ -328,7 +328,7 @@ epix_merge = function(x, y, # pretty fast anyway. 
compactify = compactify, clobberable_versions_start = result_clobberable_versions_start, - observed_versions_end = new_observed_versions_end + versions_end = new_versions_end )) } diff --git a/data-raw/archive_cases_dv_subset.R b/data-raw/archive_cases_dv_subset.R index 7e4cc9dd..b6a7c481 100644 --- a/data-raw/archive_cases_dv_subset.R +++ b/data-raw/archive_cases_dv_subset.R @@ -32,7 +32,7 @@ case_rate_subset <- covidcast( as_epi_archive(compactify=FALSE) archive_cases_dv_subset = epix_merge(dv_subset, case_rate_subset, - observed_versions_end_conflict="locf", + versions_end_conflict="locf", compactify=FALSE) # If we directly store an epi_archive R6 object as data, it will store its class diff --git a/man/as_epi_archive.Rd b/man/as_epi_archive.Rd index c933a9f6..a98798cc 100644 --- a/man/as_epi_archive.Rd +++ b/man/as_epi_archive.Rd @@ -12,7 +12,7 @@ as_epi_archive( additional_metadata = list(), compactify = NULL, clobberable_versions_start = max_version_with_row_in(x), - observed_versions_end = max_version_with_row_in(x) + versions_end = max_version_with_row_in(x) ) } \arguments{ @@ -63,7 +63,7 @@ that (i) if a row in \code{x} (even one that \code{compactify} would consider redundant) is present with version \code{ver}, then all previous versions must be finalized and non-clobberable, although \code{ver} (and onward) might still be modified, (ii) even if we have "observed" empty updates for some -versions beyond \code{max(x$version)} (as indicated by \code{observed_versions_end}; +versions beyond \code{max(x$version)} (as indicated by \code{versions_end}; see below), we can't assume \code{max(x$version)} has been finalized, because we might see a nonfinalized version + empty subsequent versions due to upstream database replication delays in combination with the upstream @@ -72,7 +72,7 @@ no updates, (iii) "redundant" update rows that would be removed by \code{compactify} are not redundant, and actually come from an explicit version release that indicates that 
preceding versions are finalized. If \code{nrow(x) == 0}, then this argument is mandatory.} -\item{observed_versions_end}{Optional; length-1, same \code{class} and \code{typeof} as +\item{versions_end}{Optional; length-1, same \code{class} and \code{typeof} as \code{x$version}: what is the last version we have observed? The default is \code{max_version_with_row_in(x)}, but values greater than this could also be valid, and would indicate that we observed additional versions of the data diff --git a/man/epi_archive.Rd b/man/epi_archive.Rd index 81f1444f..8de89cca 100644 --- a/man/epi_archive.Rd +++ b/man/epi_archive.Rd @@ -34,7 +34,7 @@ to generate a snapshot of data from the archive, as of a given version. In general, the last version of each observation is carried forward (LOCF) to fill in data between recorded versions, and between the last recorded -update and the \code{observed_versions_end}. One consequence is that the \code{DT} +update and the \code{versions_end}. One consequence is that the \code{DT} doesn't have to contain a full snapshot of every version (although this generally works), but can instead contain only the rows that are new or changed from the previous version (see \code{compactify}, which does this @@ -137,7 +137,7 @@ Creates a new \code{epi_archive} object. additional_metadata, compactify, clobberable_versions_start, - observed_versions_end + versions_end )}\if{html}{\out{
}} } @@ -169,7 +169,7 @@ such as \code{as_of}? As these methods use the last version of each observation carried forward (LOCF) to interpolate between the version data provided, rows that don't change these LOCF results can potentially be omitted to save space while maintaining the same behavior (with the help of the -\code{clobberable_versions_start} and \code{observed_versions_end} fields in some +\code{clobberable_versions_start} and \code{versions_end} fields in some edge cases). \code{TRUE} will remove these rows, \code{FALSE} will not, and missing or \code{NULL} will remove these rows and issue a warning. Generally, this can be set to \code{TRUE}, but if you directly inspect or edit the fields of the @@ -181,7 +181,7 @@ e.g., when fetching, storing, or preparing the input data \code{x}} \item{\code{clobberable_versions_start}}{Optional; as in \code{\link{as_epi_archive}}} -\item{\code{observed_versions_end}}{Optiona; as in \code{\link{as_epi_archive}}} +\item{\code{versions_end}}{Optional; as in \code{\link{as_epi_archive}}} } \if{html}{\out{
}} } @@ -249,7 +249,7 @@ does not alias either archive's \code{DT}. \subsection{Usage}{ \if{html}{\out{
}}\preformatted{epi_archive$merge( y, - observed_versions_end_conflict = c("stop", "na", "locf", "truncate"), + versions_end_conflict = c("stop", "na", "locf", "truncate"), compactify = TRUE )}\if{html}{\out{
}} } @@ -259,7 +259,7 @@ does not alias either archive's \code{DT}. \describe{ \item{\code{y}}{as in \code{\link{epix_merge}}} -\item{\code{observed_versions_end_conflict}}{as in \code{\link{epix_merge}}} +\item{\code{versions_end_conflict}}{as in \code{\link{epix_merge}}} \item{\code{compactify}}{as in \code{\link{epix_merge}}} } diff --git a/man/epix_fill_through_version.Rd b/man/epix_fill_through_version.Rd index aa8109fb..a1f27592 100644 --- a/man/epix_fill_through_version.Rd +++ b/man/epix_fill_through_version.Rd @@ -11,12 +11,12 @@ epix_fill_through_version(x, fill_versions_end, how = c("na", "locf")) \item{fill_versions_end}{Length-1, same class&type as \verb{\%s$version}: the version through which to fill in missing version history; this will be the -result's \verb{$observed_versions_end} unless it already had a later -\verb{$observed_versions_end}.} +result's \verb{$versions_end} unless it already had a later +\verb{$versions_end}.} \item{how}{Optional; \code{"na"} or \code{"locf"}: \code{"na"} will fill in any missing required version history with \code{NA}s, by inserting (if necessary) an update -immediately after the current \verb{$observed_versions_end} that revises all +immediately after the current \verb{$versions_end} that revises all existing measurements to be \code{NA} (this is only supported for \code{version} classes with a \code{next_after} implementation); \code{"locf"} will fill in missing version history with the last version of each observation carried forward diff --git a/man/epix_merge.Rd b/man/epix_merge.Rd index 9bd6a75e..552b51a0 100644 --- a/man/epix_merge.Rd +++ b/man/epix_merge.Rd @@ -7,23 +7,23 @@ epix_merge( x, y, - observed_versions_end_conflict = c("stop", "na", "locf", "truncate"), + versions_end_conflict = c("stop", "na", "locf", "truncate"), compactify = TRUE ) } \arguments{ \item{x, y}{Two \code{epi_archive} objects to join together.} -\item{observed_versions_end_conflict}{Optional; \code{"stop"}, \code{"na"}, 
\code{"locf"}, -or \code{"truncate"}; in the case that \code{x$observed_versions_end} doesn't match -\code{y$observed_versions_end}, what do we do?: \code{"stop"}: emit an error; "na": -use \code{max(x$observed_versions_end, y$observed_versions_end)}, but in the +\item{versions_end_conflict}{Optional; \code{"stop"}, \code{"na"}, \code{"locf"}, +or \code{"truncate"}; in the case that \code{x$versions_end} doesn't match +\code{y$versions_end}, what do we do?: \code{"stop"}: emit an error; "na": +use \code{max(x$versions_end, y$versions_end)}, but in the less up-to-date input archive, imagine there was an update immediately after its last observed version which revised all observations to be \code{NA}; -\code{"locf"}: use \code{max(x$observed_versions_end, y$observed_versions_end)}, +\code{"locf"}: use \code{max(x$versions_end, y$versions_end)}, allowing the last version of each observation to be carried forward to extrapolate unavailable versions for the less up-to-date input archive; or -\code{"truncate"}: use \code{min(x$observed_versions_end, y$observed_versions_end)} +\code{"truncate"}: use \code{min(x$versions_end, y$versions_end)} and discard any rows containing update rows for later versions.} \item{compactify}{Optional; \code{TRUE}, \code{FALSE}, or \code{NULL}; should the result be @@ -35,12 +35,12 @@ the resulting \code{epi_archive} } \description{ Merges two \code{epi_archive}s that share a common \code{geo_value}, \code{time_value}, and -set of key columns. When they also share a common \code{observed_versions_end}, +set of key columns. When they also share a common \code{versions_end}, using \verb{$as_of} on the result should be the same as using \verb{$as_of} on \code{x} and \code{y} individually, then performing a full join of the \code{DT}s on the non-version key columns (potentially consolidating multiple warnings about clobberable -versions). 
If the \code{observed_versions_end} values differ, the -\code{observed_versions_end_conflict} parameter controls what is done. +versions). If the \code{versions_end} values differ, the +\code{versions_end_conflict} parameter controls what is done. } \details{ This function, \code{\link{epix_merge}}, does not mutate its inputs and will not alias diff --git a/tests/testthat/test-archive-version-bounds.R b/tests/testthat/test-archive-version-bounds.R index 8223d64f..d4c94e09 100644 --- a/tests/testthat/test-archive-version-bounds.R +++ b/tests/testthat/test-archive-version-bounds.R @@ -86,24 +86,24 @@ test_that("archive version bounds args work as intended", { ) expect_error(as_epi_archive(update_tbl, clobberable_versions_start = 1241, - observed_versions_end = measurement_date), + versions_end = measurement_date), class="epiprocess__clobberable_versions_start_has_invalid_class_or_typeof") expect_error(as_epi_archive(update_tbl[integer(0L),]), class="epiprocess__max_version_cannot_be_used") expect_error(as_epi_archive(update_tbl, clobberable_versions_start = NA, - observed_versions_end = measurement_date), - class="epiprocess__observed_versions_end_earlier_than_updates") + versions_end = measurement_date), + class="epiprocess__versions_end_earlier_than_updates") expect_error(as_epi_archive(update_tbl, clobberable_versions_start=measurement_date+6L, - observed_versions_end = measurement_date+5L), - class="epiprocess__observed_versions_end_earlier_than_clobberable_versions_start") - expect_error(as_epi_archive(update_tbl, observed_versions_end = NA), - regexp="observed_versions_end.*must not satisfy.*is.na") + versions_end = measurement_date+5L), + class="epiprocess__versions_end_earlier_than_clobberable_versions_start") + expect_error(as_epi_archive(update_tbl, versions_end = NA), + regexp="versions_end.*must not satisfy.*is.na") ea_default = as_epi_archive(update_tbl) ea_default$as_of(measurement_date+4L) expect_warning(ea_default$as_of(measurement_date+5L), class = 
"epiprocess__snapshot_as_of_clobberable_version") expect_error(ea_default$as_of(measurement_date+6L), - regexp = "max_version.*at most.*observed_versions_end") + regexp = "max_version.*at most.*versions_end") }) diff --git a/tests/testthat/test-compactify.R b/tests/testthat/test-compactify.R index e13455a1..f02e261d 100644 --- a/tests/testthat/test-compactify.R +++ b/tests/testthat/test-compactify.R @@ -107,5 +107,5 @@ test_that("compactify does not alter the default clobberable and observed versio expect_true(max(ea_true$DT$version) != max(ea_false$DT$version)) # The actual test: expect_identical(ea_true$clobberable_versions_start, ea_false$clobberable_versions_start) - expect_identical(ea_true$observed_versions_end, ea_false$observed_versions_end) + expect_identical(ea_true$versions_end, ea_false$versions_end) }) diff --git a/tests/testthat/test-epix_fill_through_version.R b/tests/testthat/test-epix_fill_through_version.R index 1777607c..03e9c504 100644 --- a/tests/testthat/test-epix_fill_through_version.R +++ b/tests/testthat/test-epix_fill_through_version.R @@ -4,7 +4,7 @@ test_that("epix_fill_through_version mirrors input when it is sufficiently up to version = 1:5, value = 1:5)) some_earlier_observed_version = 2L ea_trivial_fill_na1 = epix_fill_through_version(ea_orig, some_earlier_observed_version, "na") - ea_trivial_fill_na2 = epix_fill_through_version(ea_orig, ea_orig$observed_versions_end, "na") + ea_trivial_fill_na2 = epix_fill_through_version(ea_orig, ea_orig$versions_end, "na") ea_trivial_fill_locf = epix_fill_through_version(ea_orig, some_earlier_observed_version, "locf") # Below, we want R6 objects to be compared based on contents rather than # addresses. 
We appear to get this with `expect_identical` in `testthat` @@ -34,13 +34,13 @@ test_that("epix_fill_through_version can extend observed versions, gives expecte # Ensure we are using edition 3: local_edition(3) withCallingHandlers({ - expect_identical(ea_fill_na$observed_versions_end, later_unobserved_version) + expect_identical(ea_fill_na$versions_end, later_unobserved_version) expect_identical(tibble::as_tibble(ea_fill_na$as_of(first_unobserved_version)), tibble::tibble(geo_value="g1", time_value=as.Date("2020-01-01")+0:1, value=rep(NA_integer_, 2L)), ignore_attr = TRUE) - expect_identical(ea_fill_locf$observed_versions_end, later_unobserved_version) + expect_identical(ea_fill_locf$versions_end, later_unobserved_version) expect_identical(ea_fill_locf$as_of(first_unobserved_version), - ea_fill_locf$as_of(ea_orig$observed_versions_end) %>% + ea_fill_locf$as_of(ea_orig$versions_end) %>% {attr(., "metadata")$as_of <- first_unobserved_version; .}) }, epiprocess__snapshot_as_of_clobberable_version = function(wrn) invokeRestart("muffleWarning")) }) diff --git a/tests/testthat/test-epix_merge.R b/tests/testthat/test-epix_merge.R index 281234f3..df76f184 100644 --- a/tests/testthat/test-epix_merge.R +++ b/tests/testthat/test-epix_merge.R @@ -108,31 +108,31 @@ test_that('epix_merge stops and warns on metadata and naming issues', { # elsewhere, while allowing reuse across a couple tests local({ x = as_epi_archive(tibble::tibble(geo_value=1L, time_value=1L, version=1L, x_value=1L), - clobberable_versions_start=1L, observed_versions_end = 10L) + clobberable_versions_start=1L, versions_end = 10L) y = as_epi_archive(tibble::tibble(geo_value=1L, time_value=1L, version=1L, y_value=2L), - clobberable_versions_start=3L, observed_versions_end = 10L) + clobberable_versions_start=3L, versions_end = 10L) xy = epix_merge(x,y) test_that('epix_merge considers partially-clobberable row to be clobberable', { expect_identical(xy$clobberable_versions_start, 1L) }) - test_that('epix_merge 
result uses observed_versions_end metadata not max version val', { - expect_identical(xy$observed_versions_end, 10L) + test_that('epix_merge result uses versions_end metadata not max version val', { + expect_identical(xy$versions_end, 10L) }) }) local({ x = as_epi_archive(tibble::tibble(geo_value=1L, time_value=1L, version=1L, x_value=10L)) y = as_epi_archive(tibble::tibble(geo_value=1L, time_value=1L, version=5L, y_value=20L)) - print(epix_merge(x,y, observed_versions_end_conflict = "na")) - test_that('epix_merge stops on observed_versions_end_conflict default or "stop"', { + print(epix_merge(x,y, versions_end_conflict = "na")) + test_that('epix_merge stops on versions_end_conflict default or "stop"', { expect_error(epix_merge(x,y), - class="epiprocess__epix_merge_unresolved_observed_versions_end_conflict") - expect_error(epix_merge(x,y, observed_versions_end_conflict = "stop"), - class="epiprocess__epix_merge_unresolved_observed_versions_end_conflict") + class="epiprocess__epix_merge_unresolved_versions_end_conflict") + expect_error(epix_merge(x,y, versions_end_conflict = "stop"), + class="epiprocess__epix_merge_unresolved_versions_end_conflict") }) - test_that('epix_merge observed_versions_end_conflict="na" works', { + test_that('epix_merge versions_end_conflict="na" works', { expect_equal( - epix_merge(x,y, observed_versions_end_conflict = "na"), + epix_merge(x,y, versions_end_conflict = "na"), as_epi_archive(tibble::tribble( ~geo_value, ~time_value, ~version, ~x_value, ~y_value, 1L, 1L, 1L, 10L, NA_integer_, # x updated, y not observed yet @@ -141,9 +141,9 @@ local({ ), clobberable_versions_start=1L) ) }) - test_that('epix_merge observed_versions_end_conflict="locf" works', { + test_that('epix_merge versions_end_conflict="locf" works', { expect_equal( - epix_merge(x,y, observed_versions_end_conflict = "locf"), + epix_merge(x,y, versions_end_conflict = "locf"), as_epi_archive(tibble::tribble( ~geo_value, ~time_value, ~version, ~x_value, ~y_value, 1L, 1L, 1L, 
10L, NA_integer_, # x updated, y not observed yet @@ -157,36 +157,36 @@ local({ ~geo_value, ~time_value, ~version, ~x_value, ~y_value, 1L, 1L, 1L, 10L, 20L, # x updated, y not observed yet )) - test_that('epix_merge observed_versions_end_conflict="stop" on no-conflict works', { + test_that('epix_merge versions_end_conflict="stop" on no-conflict works', { expect_equal( - epix_merge(x_no_conflict, y_no_conflict, observed_versions_end_conflict = "stop"), + epix_merge(x_no_conflict, y_no_conflict, versions_end_conflict = "stop"), xy_no_conflict_expected ) }) - test_that('epix_merge observed_versions_end_conflict="na" on no-conflict works', { + test_that('epix_merge versions_end_conflict="na" on no-conflict works', { # This test is the main reason for these no-conflict tests. We want to make # sure that we don't add an unnecessary NA-ing-out version beyond a common - # observed_versions_end. + # versions_end. expect_equal( - epix_merge(x_no_conflict, y_no_conflict, observed_versions_end_conflict = "na"), + epix_merge(x_no_conflict, y_no_conflict, versions_end_conflict = "na"), xy_no_conflict_expected ) }) - test_that('epix_merge observed_versions_end_conflict="locf" on no-conflict works', { + test_that('epix_merge versions_end_conflict="locf" on no-conflict works', { expect_equal( - epix_merge(x_no_conflict, y_no_conflict, observed_versions_end_conflict = "locf"), + epix_merge(x_no_conflict, y_no_conflict, versions_end_conflict = "locf"), xy_no_conflict_expected ) }) }) -test_that('epix_merge observed_versions_end_conflict="na" balks if do not know next_after', { +test_that('epix_merge versions_end_conflict="na" balks if do not know next_after', { expect_error( epix_merge( as_epi_archive(tibble::tibble(geo_value=1L, time_value=1L, version=as.POSIXct(as.Date("2020-01-01")), x_value=10L)), as_epi_archive(tibble::tibble(geo_value=1L, time_value=1L, version=as.POSIXct(as.Date("2020-01-02")), y_value=20L)), - observed_versions_end_conflict = "na" + versions_end_conflict = 
"na" ), regexp = "no applicable method.*next_after" ) diff --git a/vignettes/archive.Rmd b/vignettes/archive.Rmd index 0730b85a..59a6fc25 100644 --- a/vignettes/archive.Rmd +++ b/vignettes/archive.Rmd @@ -244,7 +244,7 @@ When merging archives, unless the archives have identical data release patterns, been released) - to represent the "value" of an observation that has no recorded versions at all (in the same sort of situation) -- if requested via `observed_versions_end_conflict="na"`, to represent potential +- if requested via `versions_end_conflict="na"`, to represent potential update data that we do not yet have access to (e.g., due to one of the archives being out of sync). From a71c3fe43eba6cc46282a4fb3b1bae0d364a0005 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Wed, 27 Jul 2022 05:14:47 -0700 Subject: [PATCH 89/96] Rename versions_end_conflict -> sync, "stop" -> "forbid" --- R/archive.R | 6 ++-- R/methods-epi_archive.R | 44 ++++++++++++++++-------------- data-raw/archive_cases_dv_subset.R | 2 +- man/epi_archive.Rd | 4 +-- man/epix_merge.Rd | 27 ++++++++++-------- tests/testthat/test-epix_merge.R | 38 +++++++++++++------------- vignettes/archive.Rmd | 7 +++-- 7 files changed, 68 insertions(+), 60 deletions(-) diff --git a/R/archive.R b/R/archive.R index 129c1e21..7a2bd175 100644 --- a/R/archive.R +++ b/R/archive.R @@ -555,11 +555,11 @@ epi_archive = #' of the non-R6-method version, which does not mutate either archive, and #' does not alias either archive's `DT`. 
#' @param y as in [`epix_merge`] -#' @param versions_end_conflict as in [`epix_merge`] +#' @param sync as in [`epix_merge`] #' @param compactify as in [`epix_merge`] - merge = function(y, versions_end_conflict = c("stop","na","locf","truncate"), compactify=TRUE) { + merge = function(y, sync = c("forbid","na","locf","truncate"), compactify = TRUE) { result = epix_merge(self, y, - versions_end_conflict = versions_end_conflict, + sync = sync, compactify = compactify) if (length(epi_archive$private_fields) != 0L) { diff --git a/R/methods-epi_archive.R b/R/methods-epi_archive.R index a22406a5..83c46d83 100644 --- a/R/methods-epi_archive.R +++ b/R/methods-epi_archive.R @@ -105,7 +105,7 @@ epix_fill_through_version = function(x, fill_versions_end, #' `y` individually, then performing a full join of the `DT`s on the non-version #' key columns (potentially consolidating multiple warnings about clobberable #' versions). If the `versions_end` values differ, the -#' `versions_end_conflict` parameter controls what is done. +#' `sync` parameter controls what is done. #' #' This function, [`epix_merge`], does not mutate its inputs and will not alias #' either archive's `DT`, but may alias other fields; `x$merge` will overwrite @@ -115,16 +115,20 @@ epix_fill_through_version = function(x, fill_versions_end, #' old `DT` in another object). #' #' @param x,y Two `epi_archive` objects to join together. 
-#' @param versions_end_conflict Optional; `"stop"`, `"na"`, `"locf"`, -#' or `"truncate"`; in the case that `x$versions_end` doesn't match -#' `y$versions_end`, what do we do?: `"stop"`: emit an error; "na": -#' use `max(x$versions_end, y$versions_end)`, but in the -#' less up-to-date input archive, imagine there was an update immediately -#' after its last observed version which revised all observations to be `NA`; -#' `"locf"`: use `max(x$versions_end, y$versions_end)`, -#' allowing the last version of each observation to be carried forward to -#' extrapolate unavailable versions for the less up-to-date input archive; or -#' `"truncate"`: use `min(x$versions_end, y$versions_end)` +#' @param sync Optional; `"forbid"`, `"na"`, `"locf"`, or `"truncate"`; in the +#' case that `x$versions_end` doesn't match `y$versions_end`, what do we do?: +#' `"forbid"`: emit an error; "na": use `max(x$versions_end, y$versions_end)` +#' as the result's `versions_end`, but ensure that, if we request a snapshot +#' as of a version after `min(x$versions_end, y$versions_end)`, the +#' observation columns from the less up-to-date archive will be all NAs (i.e., +#' imagine there was an update immediately after its `versions_end` which +#' revised all observations to be `NA`); `"locf"`: use `max(x$versions_end, +#' y$versions_end)` as the result's `versions_end`, allowing the last version +#' of each observation to be carried forward to extrapolate unavailable +#' versions for the less up-to-date input archive (i.e., imagining that in the +#' less up-to-date archive's data set remained unchanged between its actual +#' `versions_end` and the other archive's `versions_end`); or `"truncate"`: +#' use `min(x$versions_end, y$versions_end)` as the result's `versions_end`, #' and discard any rows containing update rows for later versions. #' @param compactify Optional; `TRUE`, `FALSE`, or `NULL`; should the result be #' compactified? See [`as_epi_archive`] for an explanation of what this means. 
@@ -151,7 +155,7 @@ epix_fill_through_version = function(x, fill_versions_end, #' @importFrom data.table key set #' @export epix_merge = function(x, y, - versions_end_conflict = c("stop","na","locf","truncate"), + sync = c("forbid","na","locf","truncate"), compactify = TRUE) { if (!inherits(x, "epi_archive")) { Abort("`x` must be of class `epi_archive`.") @@ -161,7 +165,7 @@ epix_merge = function(x, y, Abort("`y` must be of class `epi_archive`.") } - versions_end_conflict <- rlang::arg_match(versions_end_conflict) + sync <- rlang::arg_match(sync) if (!identical(x$geo_type, y$geo_type)) { Abort("`x` and `y` must have the same `$geo_type`") @@ -192,24 +196,24 @@ epix_merge = function(x, y, # preprocessing using non-mutating (but potentially aliasing) functions. This # approach potentially uses more memory, but won't leave behind a # partially-mutated `x` on failure. - if (versions_end_conflict == "stop") { + if (sync == "forbid") { if (!identical(x$versions_end, y$versions_end)) { Abort(paste( "`x` and `y` were not equally up to date version-wise:", "`x$versions_end` was not identical to `y$versions_end`;", "either ensure that `x` and `y` are equally up to date before merging,", - "or specify how to deal with this using `versions_end_conflict`" - ), class="epiprocess__epix_merge_unresolved_versions_end_conflict") + "or specify how to deal with this using `sync`" + ), class="epiprocess__epix_merge_unresolved_sync") } else { new_versions_end = x$versions_end x_DT = x$DT y_DT = y$DT } - } else if (versions_end_conflict %in% c("na", "locf")) { + } else if (sync %in% c("na", "locf")) { new_versions_end = max(x$versions_end, y$versions_end) - x_DT = epix_fill_through_version(x, new_versions_end, versions_end_conflict)$DT - y_DT = epix_fill_through_version(y, new_versions_end, versions_end_conflict)$DT - } else if (versions_end_conflict == "truncate") { + x_DT = epix_fill_through_version(x, new_versions_end, sync)$DT + y_DT = epix_fill_through_version(y, new_versions_end, 
sync)$DT + } else if (sync == "truncate") { new_versions_end = min(x$versions_end, y$versions_end) x_DT = x$DT[x[["DT"]][["version"]] <= new_versions_end, with=FALSE] y_DT = y$DT[y[["DT"]][["version"]] <= new_versions_end, with=FALSE] diff --git a/data-raw/archive_cases_dv_subset.R b/data-raw/archive_cases_dv_subset.R index b6a7c481..36750b02 100644 --- a/data-raw/archive_cases_dv_subset.R +++ b/data-raw/archive_cases_dv_subset.R @@ -32,7 +32,7 @@ case_rate_subset <- covidcast( as_epi_archive(compactify=FALSE) archive_cases_dv_subset = epix_merge(dv_subset, case_rate_subset, - versions_end_conflict="locf", + sync="locf", compactify=FALSE) # If we directly store an epi_archive R6 object as data, it will store its class diff --git a/man/epi_archive.Rd b/man/epi_archive.Rd index 8de89cca..6d22ee44 100644 --- a/man/epi_archive.Rd +++ b/man/epi_archive.Rd @@ -249,7 +249,7 @@ does not alias either archive's \code{DT}. \subsection{Usage}{ \if{html}{\out{
}}\preformatted{epi_archive$merge( y, - versions_end_conflict = c("stop", "na", "locf", "truncate"), + sync = c("forbid", "na", "locf", "truncate"), compactify = TRUE )}\if{html}{\out{
}} } @@ -259,7 +259,7 @@ does not alias either archive's \code{DT}. \describe{ \item{\code{y}}{as in \code{\link{epix_merge}}} -\item{\code{versions_end_conflict}}{as in \code{\link{epix_merge}}} +\item{\code{sync}}{as in \code{\link{epix_merge}}} \item{\code{compactify}}{as in \code{\link{epix_merge}}} } diff --git a/man/epix_merge.Rd b/man/epix_merge.Rd index 552b51a0..09f67fa2 100644 --- a/man/epix_merge.Rd +++ b/man/epix_merge.Rd @@ -7,23 +7,26 @@ epix_merge( x, y, - versions_end_conflict = c("stop", "na", "locf", "truncate"), + sync = c("forbid", "na", "locf", "truncate"), compactify = TRUE ) } \arguments{ \item{x, y}{Two \code{epi_archive} objects to join together.} -\item{versions_end_conflict}{Optional; \code{"stop"}, \code{"na"}, \code{"locf"}, -or \code{"truncate"}; in the case that \code{x$versions_end} doesn't match -\code{y$versions_end}, what do we do?: \code{"stop"}: emit an error; "na": -use \code{max(x$versions_end, y$versions_end)}, but in the -less up-to-date input archive, imagine there was an update immediately -after its last observed version which revised all observations to be \code{NA}; -\code{"locf"}: use \code{max(x$versions_end, y$versions_end)}, -allowing the last version of each observation to be carried forward to -extrapolate unavailable versions for the less up-to-date input archive; or -\code{"truncate"}: use \code{min(x$versions_end, y$versions_end)} +\item{sync}{Optional; \code{"forbid"}, \code{"na"}, \code{"locf"}, or \code{"truncate"}; in the +case that \code{x$versions_end} doesn't match \code{y$versions_end}, what do we do?: +\code{"forbid"}: emit an error; "na": use \code{max(x$versions_end, y$versions_end)} +as the result's \code{versions_end}, but ensure that, if we request a snapshot +as of a version after \code{min(x$versions_end, y$versions_end)}, the +observation columns from the less up-to-date archive will be all NAs (i.e., +imagine there was an update immediately after its \code{versions_end} which +revised all 
observations to be \code{NA}); \code{"locf"}: use \code{max(x$versions_end, y$versions_end)} as the result's \code{versions_end}, allowing the last version
+of each observation to be carried forward to extrapolate unavailable
+versions for the less up-to-date input archive (i.e., imagining that the
+less up-to-date archive's data set remained unchanged between its actual
+\code{versions_end} and the other archive's \code{versions_end}); or \code{"truncate"}:
+use \code{min(x$versions_end, y$versions_end)} as the result's \code{versions_end}, and discard any rows containing update rows for later versions.}

\item{compactify}{Optional; \code{TRUE}, \code{FALSE}, or \code{NULL}; should the result be @@ -40,7 +43,7 @@ using \verb{$as_of} on the result should be the same as using \verb{$as_of} on \ \code{y} individually, then performing a full join of the \code{DT}s on the non-version key columns (potentially consolidating multiple warnings about clobberable versions). If the \code{versions_end} values differ, the -\code{versions_end_conflict} parameter controls what is done. +\code{sync} parameter controls what is done. 
} \details{ This function, \code{\link{epix_merge}}, does not mutate its inputs and will not alias diff --git a/tests/testthat/test-epix_merge.R b/tests/testthat/test-epix_merge.R index df76f184..51f2c3c6 100644 --- a/tests/testthat/test-epix_merge.R +++ b/tests/testthat/test-epix_merge.R @@ -1,5 +1,5 @@ -test_that("epix_merge requires stops on invalid `y`",{ +test_that("epix_merge requires forbids on invalid `y`",{ ea = archive_cases_dv_subset$clone() expect_error(epix_merge(ea, data.frame(x=1))) }) @@ -62,7 +62,7 @@ test_that("epix_merge merges and carries forward updates properly", { expect_identical(xy, xy_expected) }) -test_that('epix_merge stops and warns on metadata and naming issues', { +test_that('epix_merge forbids and warns on metadata and naming issues', { expect_error( epix_merge( as_epi_archive(tibble::tibble(geo_value="tx", time_value=1L, version=1L, x_value=1L)), @@ -123,16 +123,16 @@ local({ local({ x = as_epi_archive(tibble::tibble(geo_value=1L, time_value=1L, version=1L, x_value=10L)) y = as_epi_archive(tibble::tibble(geo_value=1L, time_value=1L, version=5L, y_value=20L)) - print(epix_merge(x,y, versions_end_conflict = "na")) - test_that('epix_merge stops on versions_end_conflict default or "stop"', { + print(epix_merge(x,y, sync = "na")) + test_that('epix_merge forbids on sync default or "forbid"', { expect_error(epix_merge(x,y), - class="epiprocess__epix_merge_unresolved_versions_end_conflict") - expect_error(epix_merge(x,y, versions_end_conflict = "stop"), - class="epiprocess__epix_merge_unresolved_versions_end_conflict") + class="epiprocess__epix_merge_unresolved_sync") + expect_error(epix_merge(x,y, sync = "forbid"), + class="epiprocess__epix_merge_unresolved_sync") }) - test_that('epix_merge versions_end_conflict="na" works', { + test_that('epix_merge sync="na" works', { expect_equal( - epix_merge(x,y, versions_end_conflict = "na"), + epix_merge(x,y, sync = "na"), as_epi_archive(tibble::tribble( ~geo_value, ~time_value, ~version, ~x_value, 
~y_value, 1L, 1L, 1L, 10L, NA_integer_, # x updated, y not observed yet @@ -141,9 +141,9 @@ local({ ), clobberable_versions_start=1L) ) }) - test_that('epix_merge versions_end_conflict="locf" works', { + test_that('epix_merge sync="locf" works', { expect_equal( - epix_merge(x,y, versions_end_conflict = "locf"), + epix_merge(x,y, sync = "locf"), as_epi_archive(tibble::tribble( ~geo_value, ~time_value, ~version, ~x_value, ~y_value, 1L, 1L, 1L, 10L, NA_integer_, # x updated, y not observed yet @@ -157,36 +157,36 @@ local({ ~geo_value, ~time_value, ~version, ~x_value, ~y_value, 1L, 1L, 1L, 10L, 20L, # x updated, y not observed yet )) - test_that('epix_merge versions_end_conflict="stop" on no-conflict works', { + test_that('epix_merge sync="forbid" on no-conflict works', { expect_equal( - epix_merge(x_no_conflict, y_no_conflict, versions_end_conflict = "stop"), + epix_merge(x_no_conflict, y_no_conflict, sync = "forbid"), xy_no_conflict_expected ) }) - test_that('epix_merge versions_end_conflict="na" on no-conflict works', { + test_that('epix_merge sync="na" on no-conflict works', { # This test is the main reason for these no-conflict tests. We want to make # sure that we don't add an unnecessary NA-ing-out version beyond a common # versions_end. 
expect_equal( - epix_merge(x_no_conflict, y_no_conflict, versions_end_conflict = "na"), + epix_merge(x_no_conflict, y_no_conflict, sync = "na"), xy_no_conflict_expected ) }) - test_that('epix_merge versions_end_conflict="locf" on no-conflict works', { + test_that('epix_merge sync="locf" on no-conflict works', { expect_equal( - epix_merge(x_no_conflict, y_no_conflict, versions_end_conflict = "locf"), + epix_merge(x_no_conflict, y_no_conflict, sync = "locf"), xy_no_conflict_expected ) }) }) -test_that('epix_merge versions_end_conflict="na" balks if do not know next_after', { +test_that('epix_merge sync="na" balks if do not know next_after', { expect_error( epix_merge( as_epi_archive(tibble::tibble(geo_value=1L, time_value=1L, version=as.POSIXct(as.Date("2020-01-01")), x_value=10L)), as_epi_archive(tibble::tibble(geo_value=1L, time_value=1L, version=as.POSIXct(as.Date("2020-01-02")), y_value=20L)), - versions_end_conflict = "na" + sync = "na" ), regexp = "no applicable method.*next_after" ) diff --git a/vignettes/archive.Rmd b/vignettes/archive.Rmd index 59a6fc25..1b8d0622 100644 --- a/vignettes/archive.Rmd +++ b/vignettes/archive.Rmd @@ -244,9 +244,10 @@ When merging archives, unless the archives have identical data release patterns, been released) - to represent the "value" of an observation that has no recorded versions at all (in the same sort of situation) -- if requested via `versions_end_conflict="na"`, to represent potential - update data that we do not yet have access to (e.g., due to one of the - archives being out of sync). +- if requested via `sync="na"`, to represent potential update data that we do + not yet have access to (e.g., due to encountering issues while attempting to + download the currently available version data for one of the archives, but not + the other). ```{r, message = FALSE, warning = FALSE,eval=FALSE} y <- covidcast( From ef45cf1a943b0e7dd1ca9940236ff8f009d860a6 Mon Sep 17 00:00:00 2001 From: "Logan C. 
Brooks" Date: Thu, 28 Jul 2022 10:07:01 -0700 Subject: [PATCH 90/96] Fixup merge - max(self$DT$time_value) -> max_time - merge updates to as_of test: both - test that doesn't mutate key - use custom test archive, not archive_cases_dv_subset --- R/archive.R | 2 +- tests/testthat/test-methods-epi_archive.R | 45 ++++++++++++++++------- 2 files changed, 32 insertions(+), 15 deletions(-) diff --git a/R/archive.R b/R/archive.R index e7173dda..9d36b6f5 100644 --- a/R/archive.R +++ b/R/archive.R @@ -416,7 +416,7 @@ epi_archive = cat(sprintf("* %-14s = %s\n", "min time value", min_time)) cat(sprintf("* %-14s = %s\n", "max time value", - max(self$DT$time_value))) + max_time)) cat(sprintf("* %-14s = %s\n", "first version with update", min(self$DT$version))) cat(sprintf("* %-14s = %s\n", "last version with update", diff --git a/tests/testthat/test-methods-epi_archive.R b/tests/testthat/test-methods-epi_archive.R index a5662638..9ebe1e94 100644 --- a/tests/testthat/test-methods-epi_archive.R +++ b/tests/testthat/test-methods-epi_archive.R @@ -23,21 +23,38 @@ test_that("Warning against max_version being same as edf's max version",{ }) test_that("as_of properly grabs the data and doesn't mutate key",{ - old_key = data.table::key(ea$DT) - df_as_of <- ea %>% - epix_as_of(max_version = as.Date("2020-07-01")) %>% - na.omit() %>% - as.data.frame() - - df_filter <- ea$DT %>% - filter(version == as.Date("2020-07-01")) %>% - na.omit() %>% - select(-version) %>% - as.data.frame() - - expect_equal(df_as_of[1:4],df_filter) - expect_equal(data.table::key(ea$DT), old_key) + d <- as.Date("2020-06-01") + + ea2 = tibble::tribble( + ~geo_value, ~time_value, ~version, ~cases, + "ca", "2020-06-01", "2020-06-01", 1, + "ca", "2020-06-01", "2020-06-02", 2, + # + "ca", "2020-06-02", "2020-06-02", 0, + "ca", "2020-06-02", "2020-06-03", 1, + "ca", "2020-06-02", "2020-06-04", 2, + # + "ca", "2020-06-03", "2020-06-03", 1, + # + "ca", "2020-06-04", "2020-06-04", 4, + ) %>% + 
dplyr::mutate(dplyr::across(c(time_value, version), as.Date)) %>% + as_epi_archive() + + old_key = data.table::key(ea2$DT) + + edf_as_of <- ea2 %>% + epix_as_of(max_version = as.Date("2020-06-03")) + + edf_expected <- as_epi_df(tibble( + geo_value = "ca", + time_value = d + 0:2, + cases = c(2,1,1) + ), as_of = as.Date("2020-06-03")) + + expect_equal(edf_as_of, edf_expected, ignore_attr=c(".internal.selfref", "sorted")) + expect_equal(data.table::key(ea2$DT), old_key) }) test_that("quosure passing issue in epix_slide is resolved + other potential issues", { From 8bb37c4dc33e6b538b04ae67882532e4abc9be38 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Thu, 28 Jul 2022 10:19:41 -0700 Subject: [PATCH 91/96] Fixup formatting on merge fixup --- R/archive.R | 6 ++---- tests/testthat/test-methods-epi_archive.R | 14 +++++++------- 2 files changed, 9 insertions(+), 11 deletions(-) diff --git a/R/archive.R b/R/archive.R index 9d36b6f5..db3b0f7a 100644 --- a/R/archive.R +++ b/R/archive.R @@ -413,10 +413,8 @@ epi_archive = min_time = Min(self$DT$time_value) max_time = Max(self$DT$time_value) } - cat(sprintf("* %-14s = %s\n", "min time value", - min_time)) - cat(sprintf("* %-14s = %s\n", "max time value", - max_time)) + cat(sprintf("* %-14s = %s\n", "min time value", min_time)) + cat(sprintf("* %-14s = %s\n", "max time value", max_time)) cat(sprintf("* %-14s = %s\n", "first version with update", min(self$DT$version))) cat(sprintf("* %-14s = %s\n", "last version with update", diff --git a/tests/testthat/test-methods-epi_archive.R b/tests/testthat/test-methods-epi_archive.R index 9ebe1e94..d0434f59 100644 --- a/tests/testthat/test-methods-epi_archive.R +++ b/tests/testthat/test-methods-epi_archive.R @@ -28,16 +28,16 @@ test_that("as_of properly grabs the data and doesn't mutate key",{ ea2 = tibble::tribble( ~geo_value, ~time_value, ~version, ~cases, - "ca", "2020-06-01", "2020-06-01", 1, - "ca", "2020-06-01", "2020-06-02", 2, + "ca", "2020-06-01", "2020-06-01", 1, + "ca", 
"2020-06-01", "2020-06-02", 2, # - "ca", "2020-06-02", "2020-06-02", 0, - "ca", "2020-06-02", "2020-06-03", 1, - "ca", "2020-06-02", "2020-06-04", 2, + "ca", "2020-06-02", "2020-06-02", 0, + "ca", "2020-06-02", "2020-06-03", 1, + "ca", "2020-06-02", "2020-06-04", 2, # - "ca", "2020-06-03", "2020-06-03", 1, + "ca", "2020-06-03", "2020-06-03", 1, # - "ca", "2020-06-04", "2020-06-04", 4, + "ca", "2020-06-04", "2020-06-04", 4, ) %>% dplyr::mutate(dplyr::across(c(time_value, version), as.Date)) %>% as_epi_archive() From 174fecf15437e4bc958f58d72046cf4c279c27f1 Mon Sep 17 00:00:00 2001 From: kenmawer Date: Thu, 28 Jul 2022 15:57:48 -0700 Subject: [PATCH 92/96] Changed `n`'s default to be consistent with the rest of the branch. --- R/archive.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/archive.R b/R/archive.R index db3b0f7a..bf5a38a9 100644 --- a/R/archive.R +++ b/R/archive.R @@ -584,7 +584,7 @@ epi_archive = #' details. #' @importFrom data.table key #' @importFrom rlang !! !!! enquo quo_is_missing enquos is_quosure sym syms - slide = function(f, ..., n = 7, group_by, ref_time_values, + slide = function(f, ..., n, group_by, ref_time_values, time_step, new_col_name = "slide_value", as_list_col = FALSE, names_sep = "_", all_rows = FALSE) { From 4eb7f110e325aac38d9693805c5a2d22415a96d6 Mon Sep 17 00:00:00 2001 From: kenmawer Date: Thu, 28 Jul 2022 16:15:45 -0700 Subject: [PATCH 93/96] Updated missing n. 
--- vignettes/compactify.Rmd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vignettes/compactify.Rmd b/vignettes/compactify.Rmd index 746262a2..143dbd6b 100644 --- a/vignettes/compactify.Rmd +++ b/vignettes/compactify.Rmd @@ -102,7 +102,7 @@ speeds <- rbind(speeds, speed_test(iterate_as_of,"as_of_1000x")) # Performance of slide slide_median <- function(my_ea) { - my_ea$slide(median = median(case_rate_7d_av)) + my_ea$slide(median = median, n = 7) } speeds <- rbind(speeds, speed_test(slide_median,"slide_median")) From c1c61e4408484eda696399595144c9a0ef77046e Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Thu, 28 Jul 2022 21:22:53 -0700 Subject: [PATCH 94/96] Repair compactify vignette median slide --- vignettes/compactify.Rmd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vignettes/compactify.Rmd b/vignettes/compactify.Rmd index 143dbd6b..39511e10 100644 --- a/vignettes/compactify.Rmd +++ b/vignettes/compactify.Rmd @@ -102,7 +102,7 @@ speeds <- rbind(speeds, speed_test(iterate_as_of,"as_of_1000x")) # Performance of slide slide_median <- function(my_ea) { - my_ea$slide(median = median, n = 7) + my_ea$slide(median = median(case_rate_7d_av), n = 7) } speeds <- rbind(speeds, speed_test(slide_median,"slide_median")) From bbd1c17b6f7db63eacea6e2522ca4c90fc125d3b Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Thu, 28 Jul 2022 21:25:14 -0700 Subject: [PATCH 95/96] Sync compactify.Rmd text with code: 1000 iterations of filter --- vignettes/compactify.Rmd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vignettes/compactify.Rmd b/vignettes/compactify.Rmd index 39511e10..034235b3 100644 --- a/vignettes/compactify.Rmd +++ b/vignettes/compactify.Rmd @@ -63,7 +63,7 @@ nrow(locf_omitted_2$DT) ``` -As we would expect, performing 200 iterations of `dplyr::filter` is faster when +As we would expect, performing 1000 iterations of `dplyr::filter` is faster when the LOCF values are omitted. 
```{r} From 2a79c0823f0eb7508ec9047d6e48cf83d9d919be Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Thu, 28 Jul 2022 21:33:57 -0700 Subject: [PATCH 96/96] Fix incomplete change from "LVCF" back to "LOCF" --- tests/testthat/test-compactify.R | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/tests/testthat/test-compactify.R b/tests/testthat/test-compactify.R index f02e261d..962747d9 100644 --- a/tests/testthat/test-compactify.R +++ b/tests/testthat/test-compactify.R @@ -22,8 +22,11 @@ row_replace <- function(dt,row,x,y) { dt } +# Note that compactify is working on version-wise LOCF (last version of each +# observation carried forward) + # Rows 1 should not be eliminated even if NA -dt <- row_replace(dt,1,NA,NA) # Not LVCF +dt <- row_replace(dt,1,NA,NA) # Not LOCF # NOTE! We are assuming that there are no NA's in geo_value, time_value, # and version. Even though compactify may erroneously remove the first row @@ -31,43 +34,43 @@ dt <- row_replace(dt,1,NA,NA) # Not LVCF # has problems beyond the scope of this test # Rows 11 and 12 correspond to different time_values -dt <- row_replace(dt,12,11,11) # Not LVCF +dt <- row_replace(dt,12,11,11) # Not LOCF # Rows 20 and 21 only differ in version -dt <- row_replace(dt,21,20,20) # LVCF +dt <- row_replace(dt,21,20,20) # LOCF # Rows 21 and 22 only differ in version -dt <- row_replace(dt,22,20,20) # LVCF +dt <- row_replace(dt,22,20,20) # LOCF # Row 39 comprises the first NA's -dt <- row_replace(dt,39,NA,NA) # Not LVCF +dt <- row_replace(dt,39,NA,NA) # Not LOCF # Row 40 has two NA's, just like its lag, row 39 -dt <- row_replace(dt,40,NA,NA) # LVCF +dt <- row_replace(dt,40,NA,NA) # LOCF # Row 62's values already exist in row 15, but row 15 is not a preceding row -dt <- row_replace(dt,62,15,15) # Not LVCF +dt <- row_replace(dt,62,15,15) # Not LOCF # Row 73 only has one value carried over -dt <- row_replace(dt,74,73,74) # Not LVCF +dt <- row_replace(dt,74,73,74) # Not LOCF 
dt_true <- as_tibble(as_epi_archive(dt,compactify=TRUE)$DT) dt_false <- as_tibble(as_epi_archive(dt,compactify=FALSE)$DT) dt_null <- suppressWarnings(as_tibble(as_epi_archive(dt,compactify=NULL)$DT)) -test_that("Warning for LVCF with compactify as NULL", { +test_that("Warning for LOCF with compactify as NULL", { expect_warning(as_epi_archive(dt,compactify=NULL)) }) -test_that("No warning when there is no LVCF", { +test_that("No warning when there is no LOCF", { expect_warning(as_epi_archive(dt[1:5],compactify=NULL),NA) }) -test_that("LVCF values are ignored with compactify=FALSE", { +test_that("LOCF values are ignored with compactify=FALSE", { expect_identical(nrow(dt),nrow(dt_false)) }) -test_that("LVCF values are taken out with compactify=TRUE", { +test_that("LOCF values are taken out with compactify=TRUE", { dt_test <- as_tibble(as_epi_archive(dt[-c(21,22,40),],compactify=FALSE)$DT) expect_identical(dt_true,dt_null) @@ -78,7 +81,7 @@ test_that("as_of produces the same results with compactify=TRUE as with compacti ea_true <- as_epi_archive(dt,compactify=TRUE) ea_false <- as_epi_archive(dt,compactify=FALSE) - # Row 22, an LVCF row corresponding to the latest version, is omitted in + # Row 22, an LOCF row corresponding to the latest version, is omitted in # ea_true latest_version = max(ea_false$DT$version) expect_warning({