comments and some nit rewrites #652
Merged · 6 commits · Apr 5, 2025

12 changes: 10 additions & 2 deletions R/archive.R
@@ -438,7 +438,8 @@ removed_by_compactify <- function(updates_df, ukey_names, abs_tol) {
   updates_df[update_is_locf(updates_df, ukey_names, abs_tol), ]
 }

-#' Internal helper; lgl; which updates are LOCF
+#' Internal helper; lgl; which updates are LOCF and should thus be dropped when
+#' compactifying
 #'
 #' (Not validated:) Must be called inside certain dplyr data masking verbs (e.g.,
 #' `filter` or `mutate`) being run on an `epi_archive`'s `DT` or a data frame
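
To make the LOCF notion concrete, here is a minimal sketch (toy data, invented for illustration; not from the package) of update rows for a single key where the middle row is LOCF:

```r
library(tibble)
# Hypothetical updates for one (geo_value, time_value), sorted by version:
updates <- tribble(
  ~geo_value, ~time_value, ~version, ~value,
  "ca",       1L,          1L,       7, # first report: kept
  "ca",       1L,          2L,       7, # LOCF: repeats the previous value, dropped when compactifying
  "ca",       1L,          3L,       8  # genuine revision: kept
)
```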
@@ -470,12 +471,18 @@ update_is_locf <- function(arranged_updates_df, ukey_names, abs_tol) {
   } else {
     ekts_tbl <- new_tibble(updates_col_refs[ekt_names])
     vals_tbl <- new_tibble(updates_col_refs[val_names])
+    # grab the data and a shifted version of the data, and compute the
+    # entry-wise difference to see if the value has changed
     # n_updates >= 2L so we can use `:` naturally (this is the reason for
     # separating out n_updates == 1L from this case):
     inds1 <- 2L:n_updates
     inds2 <- 1L:(n_updates - 1L)
     c(
       FALSE, # first observation is not LOCF
+      # for the rest, check that both the keys are exactly the same, and the
+      # values are within abs_tol
+      # the key comparison effectively implements a group_by, so that when the
+      # key changes we're guaranteed the value is correct
       vec_approx_equal0(ekts_tbl,
         inds1 = inds1, ekts_tbl, inds2 = inds2,
         # check ekt (key) cols with 0 tolerance:
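
For intuition, a self-contained sketch of the same shifted-comparison idea, using the toy `updates` tibble from above. `toy_update_is_locf` is a hypothetical stand-in, ignoring NA handling and the `n_updates == 1L` special case that the real code separates out:

```r
# Compare each row to the previous one: exactly-equal keys plus values within
# abs_tol means the row is LOCF and can be dropped.
toy_update_is_locf <- function(df, key_names, val_names, abs_tol) {
  n <- nrow(df)
  inds1 <- 2L:n
  inds2 <- 1L:(n - 1L)
  same_key <- Reduce(`&`, lapply(df[key_names], function(col) {
    col[inds1] == col[inds2]
  }))
  vals_close <- Reduce(`&`, lapply(df[val_names], function(col) {
    abs(col[inds1] - col[inds2]) <= abs_tol
  }))
  c(FALSE, same_key & vals_close) # first observation is never LOCF
}

toy_update_is_locf(updates, c("geo_value", "time_value"), "value", abs_tol = 0)
#> [1] FALSE  TRUE FALSE
```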
@@ -493,7 +500,8 @@
 #' `epi_archive` object.
 #'
 #' @param x A data.frame, data.table, or tibble, with columns `geo_value`,
-#'   `time_value`, `version`, and then any additional number of columns.
+#'   `time_value`, `version`, and then any additional number of columns, either
+#'   keys or values.
 #' @param ... used for specifying column names, as in [`dplyr::rename`]. For
 #'   example `version = release_date`
 #' @param .versions_end location based versions_end, used to avoid prefix
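
A usage sketch of the renaming behavior documented above (object and column names invented for illustration):

```r
library(dplyr)
raw <- tibble(
  geo_value = "ca",
  time_value = as.Date("2020-01-01"),
  release_date = as.Date("2020-01-02"), # nonstandard name for the version column
  value = 1
)
# `...` renames on the way in, `new_name = old_name` as in `dplyr::rename`:
archive <- raw %>% as_epi_archive(version = release_date)
```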
14 changes: 7 additions & 7 deletions R/epi_slide_opt_archive.R
@@ -27,12 +27,12 @@
 #'
 #' library(dplyr)
 #' grp_updates <- bind_rows(
-#'   tibble(version = 10, time_value = 1:20, value = 1:20),
-#'   tibble(version = 12, time_value = 4:5, value = 5:4),
-#'   tibble(version = 13, time_value = 8, value = 9),
-#'   tibble(version = 14, time_value = 11, value = NA),
-#'   tibble(version = 15, time_value = -10, value = -10),
-#'   tibble(version = 16, time_value = 50, value = 50)
+#'   tibble(version = 30, time_value = 1:20, value = 1:20),
+#'   tibble(version = 32, time_value = 4:5, value = 5:4),
+#'   tibble(version = 33, time_value = 8, value = 9),
+#'   tibble(version = 34, time_value = 11, value = NA),
+#'   tibble(version = 35, time_value = -10, value = -10),
+#'   tibble(version = 56, time_value = 50, value = 50)
 #' ) %>%
 #'   mutate(across(c(version, time_value), ~ as.Date("2020-01-01") - 1 + .x))
 #'
@@ -108,7 +108,7 @@ epi_slide_opt_archive_one_epikey <- function(
       slide[[out_colnames[[col_i]]]] <- f_dots_baked(slide[[in_colnames[[col_i]]]], before = before, after = after)
     }
   } else {
-    cli_abort("epiprocess internal error: `f_from_package` was {format_chr_deparse(f_from_package)}",
+    cli_abort("epiprocess internal error: `f_from_package` was {format_chr_deparse(f_from_package)}, which is unsupported",
       class = "epiprocess__epi_slide_opt_archive__f_from_package_invalid"
     )
   }
3 changes: 2 additions & 1 deletion R/epi_slide_opt_edf.R
@@ -73,7 +73,8 @@ upstream_slide_f_info <- function(.f, ...) {
   )
 }

-#' Calculate input and output column names for an `{epiprocess}` [`dplyr::across`]-like operations
+#' Calculate input and output column names for an `{epiprocess}`
+#' [`dplyr::across`]-like operation
 #'
 #' @param .x data.frame to perform input column tidyselection on
 #' @param time_type as in [`new_epi_df`]
34 changes: 22 additions & 12 deletions R/patch.R
@@ -121,17 +121,22 @@ vec_approx_equal0 <- function(vec1, vec2, na_equal, abs_tol, inds1 = NULL, inds2
   na_or_nan1 <- is.na(vec1)
   na_or_nan2 <- is.na(vec2)
   # Since above are bare logical vectors, we can use `fifelse`
-  res <- fifelse(
-    !na_or_nan1 & !na_or_nan2,
-    abs(vec1 - vec2) <= abs_tol,
-    if (na_equal) {
+  if (na_equal) {
+    res <- fifelse(
+      !na_or_nan1 & !na_or_nan2,
+      abs(vec1 - vec2) <= abs_tol,
       na_or_nan1 & na_or_nan2 & (is.nan(vec1) == is.nan(vec2))
-    } else {
-      # Like `==` and `vec_equal`, we consider NaN == {NA, NaN, anything else}
-      # to be NA.
+    )
+  } else {
+    # Like `==` and `vec_equal`, we consider NaN == {NA, NaN, anything else}
+    # to be NA.
+    res <- fifelse(
+      !na_or_nan1 & !na_or_nan2,
+      abs(vec1 - vec2) <= abs_tol,
       NA
-    }
-  )
+    )
+  }

   if (!is.null(dim(vec1))) {
     dim(res) <- dim(vec1)
     res <- rowSums(res) == ncol(res)
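
A quick console check of the `na_equal = TRUE` semantics this restructuring preserves (inputs invented; `fifelse` is `data.table`'s):

```r
library(data.table)
vec1 <- c(1, NA, NaN, 2)
vec2 <- c(1.05, NA, NA, 3)
abs_tol <- 0.1
# NA matches NA and NaN matches NaN, but NA does not match NaN:
fifelse(
  !is.na(vec1) & !is.na(vec2),
  abs(vec1 - vec2) <= abs_tol,
  is.na(vec1) & is.na(vec2) & (is.nan(vec1) == is.nan(vec2))
)
#> [1]  TRUE  TRUE FALSE FALSE
```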
@@ -278,9 +283,9 @@ tbl_diff2 <- function(earlier_snapshot, later_tbl,
   # ukey+val duplicates (cases 2. and 3.).)

   # Row indices of first occurrence of each ukey; will be the same as
-  # seq_len(combined_n) except for when that ukey has been re-reported in
-  # `later_tbl`, in which case (3. or 4.) it will point back to the row index of
-  # the same ukey in `earlier_snapshot`:
+  # seq_len(combined_n) for each ukey's first appearance (cases 1., 2., or 5.);
+  # for re-reported ukeys in `later_tbl` (cases 3. or 4.), it will point back to
+  # the row index of the same ukey in `earlier_snapshot`:
   combined_ukey_firsts <- vec_duplicate_id(combined_ukeys)

   # Which rows from combined are cases 3. or 4.?
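
The `vctrs::vec_duplicate_id()` behavior the comment leans on, demonstrated on an invented key table:

```r
library(vctrs)
ukeys <- data.frame(geo = c("ca", "tx", "ca"), time = c(1L, 1L, 1L))
# Each element is the row index where that ukey first appeared:
vec_duplicate_id(ukeys)
#> [1] 1 2 1
```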
@@ -368,6 +373,11 @@ tbl_patch <- function(snapshot, update, ukey_names) {
   result_tbl <- vec_rbind(update, snapshot)

   dup_ids <- vec_duplicate_id(result_tbl[ukey_names])
+  # Find the "first" appearance of each ukey; since `update` is ordered before
+  # `snapshot`, this means favoring the rows from `update` over those in
+  # `snapshot`. This is like `!duplicated()` but faster, and like
+  # `vec_unique_loc()` but with the guarantee (from `vec_duplicate_id()`) that
+  # we get the *first* appearance of each ukey.
   not_overwritten <- dup_ids == vec_seq_along(result_tbl)
   result_tbl <- result_tbl[not_overwritten, ]

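
A toy run of the overwrite trick described in the new comment: `update` is stacked above `snapshot`, then only each ukey's first appearance is kept (data invented for illustration):

```r
library(vctrs)
update <- data.frame(k = "a", v = 10)
snapshot <- data.frame(k = c("a", "b"), v = c(1, 2))
result_tbl <- vec_rbind(update, snapshot)
# First appearance wins, so the `update` row for "a" overwrites `snapshot`'s:
not_overwritten <- vec_duplicate_id(result_tbl["k"]) == vec_seq_along(result_tbl)
result_tbl[not_overwritten, ]
#>   k  v
#> 1 a 10
#> 3 b  2
```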
6 changes: 4 additions & 2 deletions man/across_ish_names_info.Rd


12 changes: 6 additions & 6 deletions man/epi_slide_opt_archive_one_epikey.Rd


3 changes: 2 additions & 1 deletion man/update_is_locf.Rd


9 changes: 5 additions & 4 deletions tests/testthat/test-epi_slide_opt_archive.R
@@ -173,13 +173,14 @@ test_that("epi_slide_opt.epi_archive gives expected results on example data; als
       group_by(geo_value)
   )

-  mini_case_death_rate_archive_b <- mini_case_death_rate_archive %>%
-    {
-      as_tibble(as.data.frame(.$DT))
-    } %>%
+  mini_case_death_rate_archive_b <-
+    mini_case_death_rate_archive$DT %>%
+    as.data.frame() %>%
+    as_tibble() %>%
     mutate(age_group = "overall") %>%
     as_epi_archive(other_keys = "age_group")

+  # grouping shouldn't change the outcome
   expect_equal(
     mini_case_death_rate_archive_b %>%
       group_by(geo_value, age_group) %>%
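
To see that the old and new spellings are interchangeable (a sketch assuming the test's `mini_case_death_rate_archive` fixture is in scope):

```r
library(magrittr)
library(tibble)
# Old: braces give `.` a home so the pipe can reach into `$DT`.
a <- mini_case_death_rate_archive %>%
  {
    as_tibble(as.data.frame(.$DT))
  }
# New: start the pipe at the DT itself.
b <- mini_case_death_rate_archive$DT %>%
  as.data.frame() %>%
  as_tibble()
identical(a, b) # TRUE
```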