comments and some nit rewrites #652
Merged · 6 commits · Apr 5, 2025

12 changes: 10 additions & 2 deletions R/archive.R
@@ -438,7 +438,8 @@ removed_by_compactify <- function(updates_df, ukey_names, abs_tol) {
   updates_df[update_is_locf(updates_df, ukey_names, abs_tol), ]
 }

-#' Internal helper; lgl; which updates are LOCF
+#' Internal helper; lgl; which updates are LOCF and should thus be dropped when
+#' compactifying
 #'
 #' (Not validated:) Must be called inside certain dplyr data masking verbs (e.g.,
 #' `filter` or `mutate`) being run on an `epi_archive`'s `DT` or a data frame
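
To make the LOCF notion concrete, here is a minimal sketch (toy data, invented for illustration; not from the package) of update rows for a single key where the middle row is LOCF:

```r
library(tibble)
# Hypothetical updates for one (geo_value, time_value), sorted by version:
updates <- tribble(
  ~geo_value, ~time_value, ~version, ~value,
  "ca",       1L,          1L,       7, # first report: kept
  "ca",       1L,          2L,       7, # LOCF: repeats the previous value, dropped when compactifying
  "ca",       1L,          3L,       8  # genuine revision: kept
)
```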
@@ -470,12 +471,18 @@ update_is_locf <- function(arranged_updates_df, ukey_names, abs_tol) {
   } else {
     ekts_tbl <- new_tibble(updates_col_refs[ekt_names])
     vals_tbl <- new_tibble(updates_col_refs[val_names])
+    # grab the data and a shifted version of the data, and compute the
+    # entry-wise difference to see if the value has changed
     # n_updates >= 2L so we can use `:` naturally (this is the reason for
     # separating out n_updates == 1L from this case):
     inds1 <- 2L:n_updates
     inds2 <- 1L:(n_updates - 1L)
     c(
       FALSE, # first observation is not LOCF
+      # for the rest, check that both the keys are exactly the same, and the
+      # values are within abs_tol
+      # the key comparison effectively implements a group_by, so that when the
+      # key changes we're guaranteed the value is correct
       vec_approx_equal0(ekts_tbl,
         inds1 = inds1, ekts_tbl, inds2 = inds2,
         # check ekt (key) cols with 0 tolerance:
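
For intuition, a self-contained sketch of the same shifted-comparison idea, using the toy `updates` tibble from above. `toy_update_is_locf` is a hypothetical stand-in, ignoring NA handling and the `n_updates == 1L` special case that the real code separates out:

```r
# Compare each row to the previous one: exactly-equal keys plus values within
# abs_tol means the row is LOCF and can be dropped.
toy_update_is_locf <- function(df, key_names, val_names, abs_tol) {
  n <- nrow(df)
  inds1 <- 2L:n
  inds2 <- 1L:(n - 1L)
  same_key <- Reduce(`&`, lapply(df[key_names], function(col) {
    col[inds1] == col[inds2]
  }))
  vals_close <- Reduce(`&`, lapply(df[val_names], function(col) {
    abs(col[inds1] - col[inds2]) <= abs_tol
  }))
  c(FALSE, same_key & vals_close) # first observation is never LOCF
}

toy_update_is_locf(updates, c("geo_value", "time_value"), "value", abs_tol = 0)
#> [1] FALSE  TRUE FALSE
```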
@@ -493,7 +500,8 @@
 #' `epi_archive` object.
 #'
 #' @param x A data.frame, data.table, or tibble, with columns `geo_value`,
-#'   `time_value`, `version`, and then any additional number of columns.
+#'   `time_value`, `version`, and then any additional number of columns, either
+#'   keys or values.
 #' @param ... used for specifying column names, as in [`dplyr::rename`]. For
 #'   example `version = release_date`
 #' @param .versions_end location based versions_end, used to avoid prefix
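
A usage sketch of the renaming behavior documented above (object and column names invented for illustration):

```r
library(dplyr)
raw <- tibble(
  geo_value = "ca",
  time_value = as.Date("2020-01-01"),
  release_date = as.Date("2020-01-02"), # nonstandard name for the version column
  value = 1
)
# `...` renames on the way in, `new_name = old_name` as in `dplyr::rename`:
archive <- raw %>% as_epi_archive(version = release_date)
```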
14 changes: 7 additions & 7 deletions R/epi_slide_opt_archive.R
@@ -27,12 +27,12 @@
 #'
 #' library(dplyr)
 #' grp_updates <- bind_rows(
-#'   tibble(version = 10, time_value = 1:20, value = 1:20),
-#'   tibble(version = 12, time_value = 4:5, value = 5:4),
-#'   tibble(version = 13, time_value = 8, value = 9),
-#'   tibble(version = 14, time_value = 11, value = NA),
-#'   tibble(version = 15, time_value = -10, value = -10),
-#'   tibble(version = 16, time_value = 50, value = 50)
+#'   tibble(version = 30, time_value = 1:20, value = 1:20),
+#'   tibble(version = 32, time_value = 4:5, value = 5:4),
+#'   tibble(version = 33, time_value = 8, value = 9),
+#'   tibble(version = 34, time_value = 11, value = NA),
+#'   tibble(version = 35, time_value = -10, value = -10),
+#'   tibble(version = 56, time_value = 50, value = 50)
 #' ) %>%
 #'   mutate(across(c(version, time_value), ~ as.Date("2020-01-01") - 1 + .x))
 #'
@@ -108,7 +108,7 @@ epi_slide_opt_archive_one_epikey <- function(
       slide[[out_colnames[[col_i]]]] <- f_dots_baked(slide[[in_colnames[[col_i]]]], before = before, after = after)
     }
   } else {
-    cli_abort("epiprocess internal error: `f_from_package` was {format_chr_deparse(f_from_package)}",
+    cli_abort("epiprocess internal error: `f_from_package` was {format_chr_deparse(f_from_package)}, which is unsupported",
       class = "epiprocess__epi_slide_opt_archive__f_from_package_invalid"
     )
   }
3 changes: 2 additions & 1 deletion R/epi_slide_opt_edf.R
@@ -73,7 +73,8 @@ upstream_slide_f_info <- function(.f, ...) {
   )
 }

-#' Calculate input and output column names for an `{epiprocess}` [`dplyr::across`]-like operations
+#' Calculate input and output column names for an `{epiprocess}`
+#' [`dplyr::across`]-like operation
 #'
 #' @param .x data.frame to perform input column tidyselection on
 #' @param time_type as in [`new_epi_df`]
34 changes: 22 additions & 12 deletions R/patch.R
@@ -121,17 +121,22 @@ vec_approx_equal0 <- function(vec1, vec2, na_equal, abs_tol, inds1 = NULL, inds2
   na_or_nan1 <- is.na(vec1)
   na_or_nan2 <- is.na(vec2)
   # Since above are bare logical vectors, we can use `fifelse`
-  res <- fifelse(
-    !na_or_nan1 & !na_or_nan2,
-    abs(vec1 - vec2) <= abs_tol,
-    if (na_equal) {
+  if (na_equal) {
+    res <- fifelse(
+      !na_or_nan1 & !na_or_nan2,
+      abs(vec1 - vec2) <= abs_tol,
       na_or_nan1 & na_or_nan2 & (is.nan(vec1) == is.nan(vec2))
-    } else {
-      # Like `==` and `vec_equal`, we consider NaN == {NA, NaN, anything else}
-      # to be NA.
+    )
+  } else {
+    # Like `==` and `vec_equal`, we consider NaN == {NA, NaN, anything else}
+    # to be NA.
+    res <- fifelse(
+      !na_or_nan1 & !na_or_nan2,
+      abs(vec1 - vec2) <= abs_tol,
       NA
-    }
-  )
+    )
+  }

   if (!is.null(dim(vec1))) {
     dim(res) <- dim(vec1)
     res <- rowSums(res) == ncol(res)
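
A quick console check of the `na_equal = TRUE` semantics this restructuring preserves (inputs invented; `fifelse` is `data.table`'s):

```r
library(data.table)
vec1 <- c(1, NA, NaN, 2)
vec2 <- c(1.05, NA, NA, 3)
abs_tol <- 0.1
# NA matches NA and NaN matches NaN, but NA does not match NaN:
fifelse(
  !is.na(vec1) & !is.na(vec2),
  abs(vec1 - vec2) <= abs_tol,
  is.na(vec1) & is.na(vec2) & (is.nan(vec1) == is.nan(vec2))
)
#> [1]  TRUE  TRUE FALSE FALSE
```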
@@ -278,9 +283,9 @@ tbl_diff2 <- function(earlier_snapshot, later_tbl,
   # ukey+val duplicates (cases 2. and 3.).)

   # Row indices of first occurrence of each ukey; will be the same as
-  # seq_len(combined_n) except for when that ukey has been re-reported in
-  # `later_tbl`, in which case (3. or 4.) it will point back to the row index of
-  # the same ukey in `earlier_snapshot`:
+  # seq_len(combined_n) for each ukey's first appearance (cases 1., 2., or 5.);
+  # for re-reported ukeys in `later_tbl` (cases 3. or 4.), it will point back to
+  # the row index of the same ukey in `earlier_snapshot`:
   combined_ukey_firsts <- vec_duplicate_id(combined_ukeys)

   # Which rows from combined are cases 3. or 4.?
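
The `vctrs::vec_duplicate_id()` behavior the comment leans on, demonstrated on an invented key table:

```r
library(vctrs)
ukeys <- data.frame(geo = c("ca", "tx", "ca"), time = c(1L, 1L, 1L))
# Each element is the row index where that ukey first appeared:
vec_duplicate_id(ukeys)
#> [1] 1 2 1
```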
@@ -368,6 +373,11 @@ tbl_patch <- function(snapshot, update, ukey_names) {
   result_tbl <- vec_rbind(update, snapshot)

   dup_ids <- vec_duplicate_id(result_tbl[ukey_names])
+  # Find the "first" appearance of each ukey; since `update` is ordered before
+  # `snapshot`, this means favoring the rows from `update` over those in
+  # `snapshot`. This is like `!duplicated()` but faster, and like
+  # `vec_unique_loc()` but with the guarantee (from `vec_duplicate_id()`) that
+  # we get the *first* appearance of each ukey.
   not_overwritten <- dup_ids == vec_seq_along(result_tbl)
   result_tbl <- result_tbl[not_overwritten, ]

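
A toy run of the overwrite trick described in the new comment: `update` is stacked above `snapshot`, then only each ukey's first appearance is kept (data invented for illustration):

```r
library(vctrs)
update <- data.frame(k = "a", v = 10)
snapshot <- data.frame(k = c("a", "b"), v = c(1, 2))
result_tbl <- vec_rbind(update, snapshot)
# First appearance wins, so the `update` row for "a" overwrites `snapshot`'s:
not_overwritten <- vec_duplicate_id(result_tbl["k"]) == vec_seq_along(result_tbl)
result_tbl[not_overwritten, ]
#>   k  v
#> 1 a 10
#> 3 b  2
```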
6 changes: 4 additions & 2 deletions man/across_ish_names_info.Rd


12 changes: 6 additions & 6 deletions man/epi_slide_opt_archive_one_epikey.Rd


3 changes: 2 additions & 1 deletion man/update_is_locf.Rd


9 changes: 5 additions & 4 deletions tests/testthat/test-epi_slide_opt_archive.R
@@ -173,13 +173,14 @@ test_that("epi_slide_opt.epi_archive gives expected results on example data; als
       group_by(geo_value)
   )

-  mini_case_death_rate_archive_b <- mini_case_death_rate_archive %>%
-    {
-      as_tibble(as.data.frame(.$DT))
-    } %>%
+  mini_case_death_rate_archive_b <-
+    mini_case_death_rate_archive$DT %>%
+    as.data.frame() %>%
+    as_tibble() %>%
     mutate(age_group = "overall") %>%
     as_epi_archive(other_keys = "age_group")

+  # grouping shouldn't change the outcome
   expect_equal(
     mini_case_death_rate_archive_b %>%
       group_by(geo_value, age_group) %>%
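
To see that the old and new spellings are interchangeable (a sketch assuming the test's `mini_case_death_rate_archive` fixture is in scope):

```r
library(magrittr)
library(tibble)
# Old: braces give `.` a home so the pipe can reach into `$DT`.
a <- mini_case_death_rate_archive %>%
  {
    as_tibble(as.data.frame(.$DT))
  }
# New: start the pipe at the DT itself.
b <- mini_case_death_rate_archive$DT %>%
  as.data.frame() %>%
  as_tibble()
identical(a, b) # TRUE
```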