Skip to content

Fix epix_merge sync="truncate" err (with=F no j), add tests #215

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Aug 23, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 28 additions & 10 deletions R/methods-epi_archive.R
Original file line number Diff line number Diff line change
Expand Up @@ -152,7 +152,7 @@ epix_fill_through_version = function(x, fill_versions_end,
#' # vs. mutating x to hold the merge result:
#' x$merge(y)
#'
#' @importFrom data.table key set
#' @importFrom data.table key set setkeyv
#' @export
epix_merge = function(x, y,
sync = c("forbid","na","locf","truncate"),
Expand Down Expand Up @@ -215,18 +215,36 @@ epix_merge = function(x, y,
y_DT = epix_fill_through_version(y, new_versions_end, sync)$DT
} else if (sync == "truncate") {
new_versions_end = min(x$versions_end, y$versions_end)
x_DT = x$DT[x[["DT"]][["version"]] <= new_versions_end, with=FALSE]
y_DT = y$DT[y[["DT"]][["version"]] <= new_versions_end, with=FALSE]
x_DT = x$DT[x[["DT"]][["version"]] <= new_versions_end, names(x$DT), with=FALSE]
y_DT = y$DT[y[["DT"]][["version"]] <= new_versions_end, names(y$DT), with=FALSE]
} else Abort("unimplemented")

if (!identical(key(x$DT), key(x_DT)) || !identical(key(y$DT), key(y_DT))) {
Abort("preprocessing of data tables in merge changed the key unexpectedly",
internal=TRUE)
# key(x_DT) should be the same as key(x$DT) and key(y_DT) should be the same
# as key(y$DT). Below, we only use {x,y}_DT in the code (making it easier to
# split the code into separate functions if we wish), but still refer to
# {x,y}$DT in the error messages (further relying on this assumption).
#
# Check&ensure that the above assumption; if it didn't already hold, we likely
# have a bug in the preprocessing, a weird/invalid archive as input, and/or a
# data.table version with different semantics (which may break other parts of
# our code).
x_DT_key_as_expected = identical(key(x$DT), key(x_DT))
y_DT_key_as_expected = identical(key(y$DT), key(y_DT))
if (!x_DT_key_as_expected || !y_DT_key_as_expected) {
Warn("
`epiprocess` internal warning (please report): pre-processing for
epix_merge unexpectedly resulted in an intermediate data table (or
tables) with a different key than the corresponding input archive.
Manually setting intermediate data table keys to the expected values.
", internal=TRUE)
setkeyv(x_DT, key(x$DT))
setkeyv(y_DT, key(y$DT))
}
## key(x_DT) should be the same as key(x$DT) and key(y_DT) should be the same
## as key(y$DT). If we want to break this function into parts it makes sense
## to use {x,y}_DT below, but this makes the error checks and messages look a
## little weird and rely on the key-matching assumption above.
# Without some sort of annotations of what various columns represent, we can't
# do something that makes sense when merging archives with mismatched keys.
# E.g., even if we assume extra keys represent demographic breakdowns, a
# sensible default treatment of count-type and rate-type value columns would
# differ.
if (!identical(sort(key(x_DT)), sort(key(y_DT)))) {
Abort("
The archives must have the same set of key column names; if the
Expand Down
30 changes: 26 additions & 4 deletions tests/testthat/test-epix_merge.R
Original file line number Diff line number Diff line change
Expand Up @@ -121,9 +121,13 @@ local({
})

local({
x = as_epi_archive(tibble::tibble(geo_value=1L, time_value=1L, version=1L, x_value=10L))
y = as_epi_archive(tibble::tibble(geo_value=1L, time_value=1L, version=5L, y_value=20L))
print(epix_merge(x,y, sync = "na"))
x = as_epi_archive(
tibble::tibble(geo_value=1L, time_value=1L, version=1L, x_value=10L),
versions_end = 3L
)
y = as_epi_archive(
tibble::tibble(geo_value=1L, time_value=1L, version=5L, y_value=20L)
)
test_that('epix_merge forbids on sync default or "forbid"', {
expect_error(epix_merge(x,y),
class="epiprocess__epix_merge_unresolved_sync")
Expand All @@ -136,8 +140,10 @@ local({
as_epi_archive(tibble::tribble(
~geo_value, ~time_value, ~version, ~x_value, ~y_value,
1L, 1L, 1L, 10L, NA_integer_, # x updated, y not observed yet
1L, 1L, 2L, NA_integer_, NA_integer_, # NA-ing out x, y not observed yet
1L, 1L, 4L, NA_integer_, NA_integer_, # NA-ing out x, y not observed yet
1L, 1L, 5L, NA_integer_, 20L, # x still NA, y updated
# (we should not have a y vals -> NA update here; version 5 should be
# the `versions_end` of the result)
), clobberable_versions_start=1L)
)
})
Expand All @@ -151,6 +157,16 @@ local({
), clobberable_versions_start=1L)
)
})
test_that('epix_merge sync="truncate" works', {
expect_equal(
epix_merge(x,y, sync = "truncate"),
as_epi_archive(tibble::tribble(
~geo_value, ~time_value, ~version, ~x_value, ~y_value,
1L, 1L, 1L, 10L, NA_integer_, # x updated, y not observed yet
# y's update beyond x's last update has been truncated
), clobberable_versions_start=1L, versions_end=3L)
)
})
x_no_conflict = as_epi_archive(tibble::tibble(geo_value=1L, time_value=1L, version=1L, x_value=10L))
y_no_conflict = as_epi_archive(tibble::tibble(geo_value=1L, time_value=1L, version=1L, y_value=20L))
xy_no_conflict_expected = as_epi_archive(tibble::tribble(
Expand Down Expand Up @@ -178,6 +194,12 @@ local({
xy_no_conflict_expected
)
})
test_that('epix_merge sync="truncate" on no-conflict works', {
expect_equal(
epix_merge(x_no_conflict, y_no_conflict, sync = "truncate"),
xy_no_conflict_expected
)
})
})


Expand Down