Make epix_slide more like reframe #290
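This PR makes `epix_slide` computations behave like `dplyr::reframe()`: a slide computation may return any number of rows per reference time value, instead of a count pinned down in advance by the removed bookkeeping below (see also the new `all_rows` deprecation message). A minimal sketch of the kind of usage this enables, assuming the exported `as_epi_archive()`/`epix_slide()` interface; the toy archive, the grouping, and the quantile computation are illustrative only and not part of this PR:

library(dplyr)
library(epiprocess)

# Hypothetical toy archive: one location, daily values, each reported one day late.
toy_archive <- tibble(
  geo_value  = "ca",
  time_value = as.Date("2020-06-01") + 0:9,
  version    = as.Date("2020-06-02") + 0:9,
  cases      = 1:10
) %>%
  as_epi_archive()

toy_archive %>%
  group_by(geo_value) %>%
  epix_slide(
    # Reframe-like: one output row per quantile level, so each
    # (geo_value, ref time value) pair contributes several rows.
    function(x, gk) {
      data.frame(level = c(0.1, 0.5, 0.9),
                 value = unname(quantile(x$cases, c(0.1, 0.5, 0.9))))
    },
    before = 6,
    new_col_name = "q"
  )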

2 changes: 1 addition & 1 deletion .github/workflows/R-CMD-check.yaml
@@ -4,7 +4,7 @@ on:
push:
branches: [main, master]
pull_request:
branches: [main, master]
branches: [main, master, dev]

name: R-CMD-check

2 changes: 1 addition & 1 deletion .github/workflows/pkgdown.yaml
@@ -4,7 +4,7 @@ on:
push:
branches: [main, master]
pull_request:
branches: [main, master]
branches: [main, master, dev]
release:
types: [published]
workflow_dispatch:
4 changes: 2 additions & 2 deletions R/archive.R
@@ -640,7 +640,7 @@ epi_archive =
slide = function(f, ..., before, ref_time_values,
time_step, new_col_name = "slide_value",
as_list_col = FALSE, names_sep = "_",
all_rows = FALSE, all_versions = FALSE) {
all_versions = FALSE) {
# For an "ungrouped" slide, treat all rows as belonging to one big
# group (group by 0 vars), like `dplyr::summarize`, and let the
# resulting `grouped_epi_archive` handle the slide:
@@ -649,7 +649,7 @@ epi_archive =
before = before, ref_time_values = ref_time_values,
time_step = time_step, new_col_name = new_col_name,
as_list_col = as_list_col, names_sep = names_sep,
all_rows = all_rows, all_versions = all_versions
all_versions = all_versions
) %>%
# We want a slide on ungrouped archives to output something
# ungrouped, rather than retaining the trivial (0-variable)
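The comment in this hunk treats an ungrouped slide as a slide grouped by zero variables, the same way `dplyr::summarize()` on an ungrouped data frame is equivalent to a grouped summarize with no grouping columns. A stand-alone dplyr illustration of that equivalence (not package code):

library(dplyr)

# Grouping by zero variables and then summarizing gives the same single-row
# result as summarizing the ungrouped data frame directly.
mtcars %>% group_by() %>% summarize(mean_mpg = mean(mpg))
mtcars %>% summarize(mean_mpg = mean(mpg))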
123 changes: 28 additions & 95 deletions R/grouped_epi_archive.R
@@ -190,7 +190,11 @@ grouped_epi_archive =
slide = function(f, ..., before, ref_time_values,
time_step, new_col_name = "slide_value",
as_list_col = FALSE, names_sep = "_",
all_rows = FALSE, all_versions = FALSE) {
all_versions = FALSE) {
# Perform some deprecated argument checks without using `<param> =
# deprecated()` in the function signature, because they are from
# early development versions and much more likely to be clutter than
# informative in the signature.
if ("group_by" %in% nse_dots_names(...)) {
Abort("
The `group_by` argument to `slide` has been removed; please use
@@ -200,7 +204,15 @@ grouped_epi_archive =
this check is a false positive, but you will still need to use a
different column name here and rename the resulting column after
the slide.)
")
", class = "epiprocess__epix_slide_group_by_parameter_deprecated")
}
if ("all_rows" %in% nse_dots_names(...)) {
Abort("
The `all_rows` argument has been removed from `epix_slide` (but
is still supported in `epi_slide`). Since `epix_slide` now
allows any number of rows out of slide computations, it's
unclear how `all_rows=TRUE` should fill in missing results.
", class = "epiprocess__epix_slide_all_rows_parameter_deprecated")
}

if (missing(ref_time_values)) {
@@ -247,30 +259,14 @@
if (! (rlang::is_string(names_sep) || is.null(names_sep)) ) {
Abort("`names_sep` must be a (single) string or NULL.")
}
if (!rlang::is_bool(all_rows)) {
Abort("`all_rows` must be TRUE or FALSE.")
}
if (!rlang::is_bool(all_versions)) {
Abort("`all_versions` must be TRUE or FALSE.")
}

# Each computation is expected to output a data frame with either
# one element/row total or one element/row per encountered
# nongrouping, nontime, nonversion key value. These nongrouping,
# nontime, nonversion key columns can be seen as the "effective" key
# of the computation; the computation might return an object that
# reports a different key or no key, but the "effective" key should
# still be a valid unique key for the data, and is something that we
# could use even with `.keep = FALSE`.
comp_effective_key_vars =
setdiff(key(private$ungrouped$DT),
c(private$vars, "time_value", "version"))

# Computation for one group, one time value
comp_one_grp = function(.data_group, .group_key,
f, ...,
ref_time_value,
comp_effective_key_vars,
new_col) {
# Carry out the specified computation
comp_value = f(.data_group, .group_key, ...)
@@ -282,77 +278,12 @@
.data_group = .data_group$DT
}

# Calculate the number of output elements/rows we expect the
# computation to output: one per distinct "effective computation
# key variable" value encountered in the input.
#
# Note: this mirrors how `epi_slide` does things if we're using
# unique keys, but can diverge if using nonunique keys. The
# `epi_slide` approach of counting occurrences of the
`ref_time_value` in the `time_value` column, which helps line
# up the computation results with corresponding rows of the
# input data, wouldn't quite apply here: we'd want to line up
# with rows (from the same group) with `version` matching the
# `ref_time_value`, but would still need to summarize these rows
# somehow and drop the `time_value` input column, but this
# summarization requires something like a to-be-unique output
# key to determine a sensible number of rows to output (and the
# contents of those rows).
count =
if (length(comp_effective_key_vars) != 0L) {
comp_effective_key_vals_in_comp_input =
if (data.table::is.data.table(.data_group)) {
.data_group[, comp_effective_key_vars, with=FALSE]
} else {
.data_group[, comp_effective_key_vars]
}
sum(!duplicated(comp_effective_key_vals_in_comp_input))
} else {
# Same idea as above, but accounting for `duplicated` working
# differently (outputting `logical(0)`) on 0-column inputs
# rather than matching the number of rows. (Instead, we use
# the same count we would get if we were counting distinct
# values of a column defined as `rep(val, target_n_rows)`.)
if (nrow(.data_group) == 0L) {
0L
} else {
1L
}
}

# If we get back an atomic vector
if (is.atomic(comp_value)) {
if (length(comp_value) == 1) {
comp_value = rep(comp_value, count)
}
# If not a singleton, should be the right length, else abort
else if (length(comp_value) != count) {
Abort('If the slide computation returns an atomic vector, then it must have either (a) a single element, or (b) one element per distinct combination of key variables, excluding the `time_value`, `version`, and grouping variables, that is present in the first argument to the computation.')
}
}

# If we get back a data frame
else if (is.data.frame(comp_value)) {
if (nrow(comp_value) == 1) {
comp_value = rep(list(comp_value), count)
}
# If not a single row, should be the right length, else abort
else if (nrow(comp_value) != count) {
Abort("If the slide computation returns a data frame, then it must have a single row, or else one row per appearance of the reference time value in the local window.")
}
# Make into a list
else {
comp_value = split(comp_value, seq_len(nrow(comp_value)))
}
}

# If neither an atomic vector nor a data frame, then abort
else {
if (! (is.atomic(comp_value) || is.data.frame(comp_value))) {
Abort("The slide computation must return an atomic vector or a data frame.")
}

# Label every result row with the `ref_time_value`:
return(tibble::tibble(time_value = rep(.env$ref_time_value, count),
return(tibble::tibble(time_value = .env$ref_time_value,
!!new_col := .env$comp_value))
}

@@ -391,7 +322,6 @@
group_modify_fn = function(.data_group, .group_key,
f, ...,
ref_time_value,
comp_effective_key_vars,
new_col) {
# .data_group is coming from as_of_df as a tibble, but we
# want to feed `comp_one_grp` an `epi_archive` backed by a
@@ -402,7 +332,6 @@
.data_group_archive$DT = .data_group
comp_one_grp(.data_group_archive, .group_key, f = f, ...,
ref_time_value = ref_time_value,
comp_effective_key_vars = comp_effective_key_vars,
new_col = new_col
)
}
@@ -414,7 +343,6 @@
dplyr::group_modify(group_modify_fn,
f = f, ...,
ref_time_value = ref_time_value,
comp_effective_key_vars = comp_effective_key_vars,
new_col = new_col,
.keep = TRUE)
)
@@ -501,13 +429,18 @@
if (!as_list_col) {
x = tidyr::unnest(x, !!new_col, names_sep = names_sep)
}

# Join to get all rows, if we need to, then return
if (all_rows) {
cols = c(private$vars, "time_value")
y = unique(private$ungrouped$DT[, ..cols])
x = dplyr::left_join(y, x, by = cols)

if (is_epi_df(x)) {
# The analogue of `epi_df`'s `as_of` metadata for an archive is
# `<archive>$versions_end`, at least in the current absence of
# separate fields/columns denoting the "archive version" with a
# different resolution, or from the perspective of a different
# stage of a data pipeline. The `as_of` that is automatically
# derived won't always match; override:

attr(x, "metadata")[["as_of"]] <- private$ungrouped$versions_end
}

return(x)
}
)
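The deprecation checks added near the top of this file detect removed arguments passed through `...` and signal classed errors, so tests and callers can match on the class. A generic sketch of that pattern in plain rlang, with hypothetical names (`nse_dots_names()` itself is an internal epiprocess helper, not shown here):

library(rlang)

# Hypothetical helper: abort with a classed condition if a removed argument
# shows up among the names of `...`.
check_removed_dot_arg <- function(..., removed = "all_rows") {
  if (removed %in% names2(enquos(...))) {
    abort(
      sprintf("The `%s` argument has been removed.", removed),
      class = "mypkg__removed_argument"
    )
  }
}

# check_removed_dot_arg(all_rows = TRUE)  # would signal "mypkg__removed_argument"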
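For callers that relied on `all_rows = TRUE`, the join deleted above points at a manual workaround: enumerate the group/time combinations from the archive's `DT` and left-join the slide output onto them. A hedged sketch along those lines; `toy_archive` and `slide_result` are illustrative names (as in the example near the top), and whether NA-padding is the right semantics for multi-row computations is left to the caller:

library(dplyr)

# `slide_result` is assumed to be the output of an epix_slide() call on
# `toy_archive` grouped by geo_value.
all_combos <- toy_archive$DT %>%
  as_tibble() %>%
  distinct(geo_value, time_value)

padded_result <- all_combos %>%
  left_join(slide_result, by = c("geo_value", "time_value"))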