diff --git a/R/archive.R b/R/archive.R index 5afabb13..543cb451 100644 --- a/R/archive.R +++ b/R/archive.R @@ -438,7 +438,8 @@ removed_by_compactify <- function(updates_df, ukey_names, abs_tol) { updates_df[update_is_locf(updates_df, ukey_names, abs_tol), ] } -#' Internal helper; lgl; which updates are LOCF +#' Internal helper; lgl; which updates are LOCF and should thus be dropped when +#' compactifying #' #' (Not validated:) Must be called inside certain dplyr data masking verbs (e.g., #' `filter` or `mutate`) being run on an `epi_archive`'s `DT` or a data frame @@ -470,12 +471,18 @@ update_is_locf <- function(arranged_updates_df, ukey_names, abs_tol) { } else { ekts_tbl <- new_tibble(updates_col_refs[ekt_names]) vals_tbl <- new_tibble(updates_col_refs[val_names]) + # grab the data and a shifted version of the data, and compute the + # entry-wise difference to see if the value has changed # n_updates >= 2L so we can use `:` naturally (this is the reason for # separating out n_updates == 1L from this case): inds1 <- 2L:n_updates inds2 <- 1L:(n_updates - 1L) c( FALSE, # first observation is not LOCF + # for the rest, check that both the keys are exactly the same, and the + # values are within abs_tol + # the key comparison effectively implements a group_by, so that when the + # key changes we're guaranteed the value is correct vec_approx_equal0(ekts_tbl, inds1 = inds1, ekts_tbl, inds2 = inds2, # check ekt (key) cols with 0 tolerance: @@ -493,7 +500,8 @@ update_is_locf <- function(arranged_updates_df, ukey_names, abs_tol) { #' `epi_archive` object. #' #' @param x A data.frame, data.table, or tibble, with columns `geo_value`, -#' `time_value`, `version`, and then any additional number of columns. +#' `time_value`, `version`, and then any additional number of columns, either +#' keys or values. #' @param ... used for specifying column names, as in [`dplyr::rename`]. For #' example `version = release_date` #' @param .versions_end location based versions_end, used to avoid prefix diff --git a/R/epi_slide_opt_archive.R b/R/epi_slide_opt_archive.R index 48d43cce..d503483d 100644 --- a/R/epi_slide_opt_archive.R +++ b/R/epi_slide_opt_archive.R @@ -27,12 +27,12 @@ #' #' library(dplyr) #' grp_updates <- bind_rows( -#' tibble(version = 10, time_value = 1:20, value = 1:20), -#' tibble(version = 12, time_value = 4:5, value = 5:4), -#' tibble(version = 13, time_value = 8, value = 9), -#' tibble(version = 14, time_value = 11, value = NA), -#' tibble(version = 15, time_value = -10, value = -10), -#' tibble(version = 16, time_value = 50, value = 50) +#' tibble(version = 30, time_value = 1:20, value = 1:20), +#' tibble(version = 32, time_value = 4:5, value = 5:4), +#' tibble(version = 33, time_value = 8, value = 9), +#' tibble(version = 34, time_value = 11, value = NA), +#' tibble(version = 35, time_value = -10, value = -10), +#' tibble(version = 56, time_value = 50, value = 50) #' ) %>% #' mutate(across(c(version, time_value), ~ as.Date("2020-01-01") - 1 + .x)) #' @@ -108,7 +108,7 @@ epi_slide_opt_archive_one_epikey <- function( slide[[out_colnames[[col_i]]]] <- f_dots_baked(slide[[in_colnames[[col_i]]]], before = before, after = after) } } else { - cli_abort("epiprocess internal error: `f_from_package` was {format_chr_deparse(f_from_package)}", + cli_abort("epiprocess internal error: `f_from_package` was {format_chr_deparse(f_from_package)}, which is unsupported", class = "epiprocess__epi_slide_opt_archive__f_from_package_invalid" ) } diff --git a/R/epi_slide_opt_edf.R b/R/epi_slide_opt_edf.R index cb80aac0..1e2401e2 100644 --- a/R/epi_slide_opt_edf.R +++ b/R/epi_slide_opt_edf.R @@ -73,7 +73,8 @@ upstream_slide_f_info <- function(.f, ...) { ) } -#' Calculate input and output column names for an `{epiprocess}` [`dplyr::across`]-like operations +#' Calculate input and output column names for an `{epiprocess}` +#' [`dplyr::across`]-like operations #' #' @param .x data.frame to perform input column tidyselection on #' @param time_type as in [`new_epi_df`] diff --git a/R/patch.R b/R/patch.R index 929075de..26291463 100644 --- a/R/patch.R +++ b/R/patch.R @@ -121,17 +121,22 @@ vec_approx_equal0 <- function(vec1, vec2, na_equal, abs_tol, inds1 = NULL, inds2 na_or_nan1 <- is.na(vec1) na_or_nan2 <- is.na(vec2) # Since above are bare logical vectors, we can use `fifelse` - res <- fifelse( - !na_or_nan1 & !na_or_nan2, - abs(vec1 - vec2) <= abs_tol, - if (na_equal) { + if (na_equal) { + res <- fifelse( + !na_or_nan1 & !na_or_nan2, + abs(vec1 - vec2) <= abs_tol, na_or_nan1 & na_or_nan2 & (is.nan(vec1) == is.nan(vec2)) - } else { - # Like `==` and `vec_equal`, we consider NaN == {NA, NaN, anything else} - # to be NA. + ) + } else { + # Like `==` and `vec_equal`, we consider NaN == {NA, NaN, anything else} + # to be NA. + res <- fifelse( + !na_or_nan1 & !na_or_nan2, + abs(vec1 - vec2) <= abs_tol, NA - } - ) + ) + } + if (!is.null(dim(vec1))) { dim(res) <- dim(vec1) res <- rowSums(res) == ncol(res) @@ -278,9 +283,9 @@ tbl_diff2 <- function(earlier_snapshot, later_tbl, # ukey+val duplicates (cases 2. and 3.).) # Row indices of first occurrence of each ukey; will be the same as - # seq_len(combined_n) except for when that ukey has been re-reported in - # `later_tbl`, in which case (3. or 4.) it will point back to the row index of - # the same ukey in `earlier_snapshot`: + # seq_len(combined_n) for each ukey's first appearance (cases 1., 2., or 5.); + # for re-reported ukeys in `later_tbl` (cases 3. or 4.), it will point back to + # the row index of the same ukey in `earlier_snapshot`: combined_ukey_firsts <- vec_duplicate_id(combined_ukeys) # Which rows from combined are cases 3. or 4.? @@ -368,6 +373,11 @@ tbl_patch <- function(snapshot, update, ukey_names) { result_tbl <- vec_rbind(update, snapshot) dup_ids <- vec_duplicate_id(result_tbl[ukey_names]) + # Find the "first" appearance of each ukey; since `update` is ordered before `snapshot`, + # this means favoring the rows from `update` over those in `snapshot`. + # This is like `!duplicated()` but faster, and like `vec_unique_loc()` but guaranteeing + # that we get the first appearance since `vec_duplicate_id()` guarantees that + # it points to the first appearance. not_overwritten <- dup_ids == vec_seq_along(result_tbl) result_tbl <- result_tbl[not_overwritten, ] diff --git a/man/across_ish_names_info.Rd b/man/across_ish_names_info.Rd index c993c2bf..36b9ed04 100644 --- a/man/across_ish_names_info.Rd +++ b/man/across_ish_names_info.Rd @@ -2,7 +2,8 @@ % Please edit documentation in R/epi_slide_opt_edf.R \name{across_ish_names_info} \alias{across_ish_names_info} -\title{Calculate input and output column names for an \code{{epiprocess}} \code{\link[dplyr:across]{dplyr::across}}-like operations} +\title{Calculate input and output column names for an \code{{epiprocess}} +\code{\link[dplyr:across]{dplyr::across}}-like operations} \usage{ across_ish_names_info( .x, @@ -41,6 +42,7 @@ named list with two elements: \code{input_col_names}, chr, subset of \code{names(.x)}; and \code{output_colnames}, chr, same length as \code{input_col_names} } \description{ -Calculate input and output column names for an \code{{epiprocess}} \code{\link[dplyr:across]{dplyr::across}}-like operations +Calculate input and output column names for an \code{{epiprocess}} +\code{\link[dplyr:across]{dplyr::across}}-like operations } \keyword{internal} diff --git a/man/epi_slide_opt_archive_one_epikey.Rd b/man/epi_slide_opt_archive_one_epikey.Rd index 79d635c8..073e684c 100644 --- a/man/epi_slide_opt_archive_one_epikey.Rd +++ b/man/epi_slide_opt_archive_one_epikey.Rd @@ -55,12 +55,12 @@ Core operation of \code{epi_slide_opt.epi_archive} for a single epikey's history library(dplyr) grp_updates <- bind_rows( - tibble(version = 10, time_value = 1:20, value = 1:20), - tibble(version = 12, time_value = 4:5, value = 5:4), - tibble(version = 13, time_value = 8, value = 9), - tibble(version = 14, time_value = 11, value = NA), - tibble(version = 15, time_value = -10, value = -10), - tibble(version = 16, time_value = 50, value = 50) + tibble(version = 30, time_value = 1:20, value = 1:20), + tibble(version = 32, time_value = 4:5, value = 5:4), + tibble(version = 33, time_value = 8, value = 9), + tibble(version = 34, time_value = 11, value = NA), + tibble(version = 35, time_value = -10, value = -10), + tibble(version = 56, time_value = 50, value = 50) ) \%>\% mutate(across(c(version, time_value), ~ as.Date("2020-01-01") - 1 + .x)) diff --git a/man/update_is_locf.Rd b/man/update_is_locf.Rd index 722f3d5c..aaa70bad 100644 --- a/man/update_is_locf.Rd +++ b/man/update_is_locf.Rd @@ -2,7 +2,8 @@ % Please edit documentation in R/archive.R \name{update_is_locf} \alias{update_is_locf} -\title{Internal helper; lgl; which updates are LOCF} +\title{Internal helper; lgl; which updates are LOCF and should thus be dropped when +compactifying} \usage{ update_is_locf(arranged_updates_df, ukey_names, abs_tol) } diff --git a/tests/testthat/test-epi_slide_opt_archive.R b/tests/testthat/test-epi_slide_opt_archive.R index 6e6fd247..99923421 100644 --- a/tests/testthat/test-epi_slide_opt_archive.R +++ b/tests/testthat/test-epi_slide_opt_archive.R @@ -173,13 +173,14 @@ test_that("epi_slide_opt.epi_archive gives expected results on example data; als group_by(geo_value) ) - mini_case_death_rate_archive_b <- mini_case_death_rate_archive %>% - { - as_tibble(as.data.frame(.$DT)) - } %>% + mini_case_death_rate_archive_b <- + mini_case_death_rate_archive$DT %>% + as.data.frame() %>% + as_tibble() %>% mutate(age_group = "overall") %>% as_epi_archive(other_keys = "age_group") + # grouping shouldn't change the outcome expect_equal( mini_case_death_rate_archive_b %>% group_by(geo_value, age_group) %>%