Skip to content

Commit 214100d

Browse files
authored
Merge pull request #519 from cmu-delphi/ds/epi-slide-group
undefined
2 parents c167ddf + 409dcac commit 214100d

14 files changed

+1039
-937
lines changed

NAMESPACE

+6
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,10 @@ S3method("names<-",epi_df)
55
S3method(Summary,epi_df)
66
S3method(arrange_canonical,default)
77
S3method(arrange_canonical,epi_df)
8+
S3method(arrange_col_canonical,default)
9+
S3method(arrange_col_canonical,epi_df)
10+
S3method(arrange_row_canonical,default)
11+
S3method(arrange_row_canonical,epi_df)
812
S3method(as_epi_df,data.frame)
913
S3method(as_epi_df,epi_df)
1014
S3method(as_epi_df,tbl_df)
@@ -76,6 +80,7 @@ export(filter)
7680
export(full_seq)
7781
export(geo_column_names)
7882
export(group_by)
83+
export(group_epi_df)
7984
export(group_modify)
8085
export(growth_rate)
8186
export(guess_period)
@@ -91,6 +96,7 @@ export(relocate)
9196
export(rename)
9297
export(revision_summary)
9398
export(slice)
99+
export(sum_groups_epi_df)
94100
export(time_column_names)
95101
export(ungroup)
96102
export(unnest)

R/epi_df.R

+18-2
Original file line numberDiff line numberDiff line change
@@ -245,10 +245,10 @@ as_epi_df.tbl_df <- function(
245245
)
246246
}
247247
if (lifecycle::is_present(geo_type)) {
248-
cli_warn("epi_archive constructor argument `geo_type` is now ignored. Consider removing.")
248+
cli_warn("epi_df constructor argument `geo_type` is now ignored. Consider removing.")
249249
}
250250
if (lifecycle::is_present(time_type)) {
251-
cli_warn("epi_archive constructor argument `time_type` is now ignored. Consider removing.")
251+
cli_warn("epi_df constructor argument `time_type` is now ignored. Consider removing.")
252252
}
253253

254254
# If geo type is missing, then try to guess it
@@ -277,6 +277,22 @@ as_epi_df.tbl_df <- function(
277277
}
278278

279279
assert_character(other_keys)
280+
281+
if (".time_value_counts" %in% other_keys) {
282+
cli_abort("as_epi_df: `other_keys` can't include \".time_value_counts\"")
283+
}
284+
duplicated_time_values <- x %>%
285+
group_by(across(all_of(c("geo_value", "time_value", other_keys)))) %>%
286+
filter(dplyr::n() > 1) %>%
287+
ungroup()
288+
if (nrow(duplicated_time_values) > 0) {
289+
bad_data <- capture.output(duplicated_time_values)
290+
cli_abort(
291+
"as_epi_df: some groups in the data have duplicated time values. epi_df requires a unique time_value per group.",
292+
body = c("Sample groups:", bad_data)
293+
)
294+
}
295+
280296
new_epi_df(x, geo_type, time_type, as_of, other_keys)
281297
}
282298

R/grouped_epi_archive.R

+3-2
Original file line numberDiff line numberDiff line change
@@ -278,7 +278,7 @@ epix_slide.grouped_epi_archive <- function(
278278
.versions <- sort(.versions)
279279
}
280280

281-
validate_slide_window_arg(.before, .x$private$ungrouped$time_type)
281+
validate_slide_window_arg(.before, .x$private$ungrouped$time_type, lower = 0) # nolint: object_usage_linter
282282

283283
checkmate::assert_string(.new_col_name, null.ok = TRUE)
284284
if (!is.null(.new_col_name)) {
@@ -292,7 +292,8 @@ epix_slide.grouped_epi_archive <- function(
292292
))
293293
}
294294
if (identical(.new_col_name, "version")) {
295-
cli_abort('`.new_col_name` must not be `"version"`; `epix_slide()` uses that column name to attach the element of `.versions` associated with each slide computation') # nolint: line_length_linter
295+
cli_abort('`.new_col_name` must not be `"version"`; `epix_slide()` uses that column name to attach the element
296+
of `.versions` associated with each slide computation')
296297
}
297298
}
298299

R/methods-epi_archive.R

+20-17
Original file line numberDiff line numberDiff line change
@@ -623,26 +623,29 @@ epix_detailed_restricted_mutate <- function(.data, ...) {
623623
#' @param .f Function, formula, or missing; together with `...` specifies the
624624
#' computation to slide. To "slide" means to apply a computation over a
625625
#' sliding (a.k.a. "rolling") time window for each data group. The window is
626-
#' determined by the `before` parameter described below. One time step is
627-
#' typically one day or one week; see [`epi_slide`] details for more
628-
#' explanation. If a function, `.f` must take an `epi_df` with the same
629-
#' column names as the archive's `DT`, minus the `version` column; followed
630-
#' by a one-row tibble containing the values of the grouping variables for
631-
#' the associated group; followed by a reference time value, usually as a
632-
#' `Date` object; followed by any number of named arguments. If a formula,
633-
#' `.f` can operate directly on columns accessed via `.x$var` or `.$var`, as
634-
#' in `~ mean (.x$var)` to compute a mean of a column `var` for each
635-
#' group-`ref_time_value` combination. The group key can be accessed via
636-
#' `.y` or `.group_key`, and the reference time value can be accessed via
637-
#' `.z` or `.ref_time_value`. If `.f` is missing, then `...` will specify the
626+
#' determined by the `.before` parameter (see details for more). If a
627+
#' function, `.f` must have the form `function(x, g, t, ...)`, where
628+
#'
629+
#' - "x" is an epi_df with the same column names as the archive's `DT`, minus
630+
#' the `version` column
631+
#' - "g" is a one-row tibble containing the values of the grouping variables
632+
#' for the associated group
633+
#' - "t" is the ref_time_value for the current window
634+
#' - "..." are additional arguments
635+
#'
636+
#' If a formula, `.f` can operate directly on columns accessed via `.x$var` or
637+
#' `.$var`, as in `~ mean (.x$var)` to compute a mean of a column `var` for
638+
#' each group-`ref_time_value` combination. The group key can be accessed via
639+
#' `.y` or `.group_key`, and the reference time value can be accessed via `.z`
640+
#' or `.ref_time_value`. If `.f` is missing, then `...` will specify the
638641
#' computation.
639642
#' @param ... Additional arguments to pass to the function or formula specified
640-
#' via `f`. Alternatively, if `.f` is missing, then the `...` is interpreted as
641-
#' a ["data-masking"][rlang::args_data_masking] expression or expressions for
642-
#' tidy evaluation; in addition to referring columns directly by name, the
643+
#' via `f`. Alternatively, if `.f` is missing, then the `...` is interpreted
644+
#' as a ["data-masking"][rlang::args_data_masking] expression or expressions
645+
#' for tidy evaluation; in addition to referring columns directly by name, the
643646
#' expressions have access to `.data` and `.env` pronouns as in `dplyr` verbs,
644-
#' and can also refer to `.x`, `.group_key`, and `.ref_time_value`. See
645-
#' details.
647+
#' and can also refer to `.x` (not the same as the input epi_archive),
648+
#' `.group_key`, and `.ref_time_value`. See details for more.
646649
#' @param .before How many time values before the `.ref_time_value`
647650
#' should each snapshot handed to the function `.f` contain? If provided, it
648651
#' should be a single value that is compatible with the time_type of the

R/methods-epi_df.R

+100-6
Original file line numberDiff line numberDiff line change
@@ -255,9 +255,10 @@ group_modify.epi_df <- function(.data, .f, ..., .keep = FALSE) {
255255

256256
#' Complete epi_df
257257
#'
258-
#' A [tidyr::complete()] analogue for `epi_df` objects. This function fills in
259-
#' missing combinations of `geo_value` and `time_value` with `NA` values. See
260-
#' the examples for usage details.
258+
#' A ‘tidyr::complete()’ analogue for ‘epi_df’ objects. This function
259+
#' can be used, for example, to add rows for missing combinations
260+
#' of ‘geo_value’ and ‘time_value’, filling other columns with `NA`s.
261+
#' See the examples for usage details.
261262
#'
262263
#' @param data an `epi_df`
263264
#' @param ... see [`tidyr::complete`]
@@ -378,8 +379,101 @@ arrange_canonical.default <- function(x, ...) {
378379
#' @export
379380
arrange_canonical.epi_df <- function(x, ...) {
380381
rlang::check_dots_empty()
381-
keys <- key_colnames(x)
382382
x %>%
383-
dplyr::relocate(dplyr::all_of(keys), .before = 1) %>%
384-
dplyr::arrange(dplyr::across(dplyr::all_of(keys)))
383+
arrange_row_canonical() %>%
384+
arrange_col_canonical()
385+
}
386+
387+
arrange_row_canonical <- function(x, ...) {
388+
UseMethod("arrange_row_canonical")
389+
}
390+
391+
#' @export
392+
arrange_row_canonical.default <- function(x, ...) {
393+
rlang::check_dots_empty()
394+
cli::cli_abort(c(
395+
"`arrange_row_canonical()` is only meaningful for an {.cls epi_df}."
396+
))
397+
return(x)
398+
}
399+
400+
#' @export
401+
arrange_row_canonical.epi_df <- function(x, ...) {
402+
rlang::check_dots_empty()
403+
x %>% dplyr::arrange(dplyr::across(dplyr::all_of(key_colnames(.))))
404+
}
405+
406+
arrange_col_canonical <- function(x, ...) {
407+
UseMethod("arrange_col_canonical")
408+
}
409+
410+
#' @export
411+
arrange_col_canonical.default <- function(x, ...) {
412+
rlang::check_dots_empty()
413+
cli::cli_abort(c(
414+
"`arrange_col_canonical()` is only meaningful for an {.cls epi_df}."
415+
))
416+
return(x)
417+
}
418+
419+
#' @export
420+
arrange_col_canonical.epi_df <- function(x, ...) {
421+
rlang::check_dots_empty()
422+
x %>% dplyr::relocate(dplyr::all_of(key_colnames(.)), .before = 1)
423+
}
424+
425+
#' @export
426+
group_epi_df <- function(x) {
427+
x %>% group_by(across(all_of(kill_time_value(key_colnames(.)))))
428+
}
429+
430+
#' Aggregate an `epi_df` object
431+
#'
432+
#' Aggregates an `epi_df` object by the specified group columns, summing the
433+
#' `value` column, and returning an `epi_df`. If aggregating over `geo_value`,
434+
#' the resulting `epi_df` will have `geo_value` set to `"total"`.
435+
#'
436+
#' @param .x an `epi_df`
437+
#' @param value_col character vector of the columns to aggregate
438+
#' @param group_cols character vector of column names to group by. "time_value" is
439+
#' included by default.
440+
#' @return an `epi_df` object
441+
#'
442+
#' @export
443+
sum_groups_epi_df <- function(.x, sum_cols = "value", group_cols = character()) {
444+
assert_class(.x, "epi_df")
445+
assert_character(sum_cols)
446+
assert_character(group_cols)
447+
checkmate::assert_subset(sum_cols, setdiff(names(.x), key_colnames(.x)))
448+
checkmate::assert_subset(group_cols, key_colnames(.x))
449+
if (!"time_value" %in% group_cols) {
450+
group_cols <- c("time_value", group_cols)
451+
}
452+
453+
out <- .x %>%
454+
group_by(across(all_of(group_cols))) %>%
455+
dplyr::summarize(across(all_of(sum_cols), sum), .groups = "drop")
456+
457+
# To preserve epi_df-ness, we need to ensure that the `geo_value` column is
458+
# present.
459+
out <- if (!"geo_value" %in% group_cols) {
460+
out %>%
461+
mutate(geo_value = "total") %>%
462+
relocate(geo_value, .before = 1)
463+
} else {
464+
out
465+
}
466+
467+
# The `geo_type` will be correctly inherited here by the following logic:
468+
# - if `geo_value` is in `group_cols`, then the constructor will see the
469+
# geo_value here and will correctly read the existing values
470+
# - if `geo_value` is not in `group_cols`, then the constructor will see
471+
# the unrecognizeable "total" value and will correctly infer the "custom"
472+
# geo_type.
473+
out %>%
474+
as_epi_df(
475+
as_of = attr(.x, "metadata")$as_of,
476+
other_keys = intersect(attr(.x, "metadata")$other_keys, group_cols)
477+
) %>%
478+
arrange_canonical()
385479
}

0 commit comments

Comments
 (0)