diff --git a/NAMESPACE b/NAMESPACE index 4f9b8151..73db3483 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -66,7 +66,9 @@ importFrom(data.table,as.data.table) importFrom(data.table,between) importFrom(data.table,copy) importFrom(data.table,key) +importFrom(data.table,rbindlist) importFrom(data.table,set) +importFrom(data.table,setDF) importFrom(data.table,setkeyv) importFrom(dplyr,arrange) importFrom(dplyr,bind_rows) @@ -114,6 +116,8 @@ importFrom(rlang,syms) importFrom(stats,cor) importFrom(stats,median) importFrom(tibble,as_tibble) +importFrom(tibble,new_tibble) +importFrom(tibble,validate_tibble) importFrom(tidyr,unnest) importFrom(tidyselect,eval_select) importFrom(tidyselect,starts_with) diff --git a/R/archive.R b/R/archive.R index 5897fc4d..7a7d8d82 100644 --- a/R/archive.R +++ b/R/archive.R @@ -495,7 +495,10 @@ epi_archive = version <= max_version, ] %>% unique(by = c("geo_value", "time_value", other_keys), fromLast = TRUE) %>% - tibble::as_tibble() %>% + tibble::as_tibble() %>% + # (`as_tibble` should de-alias the DT and its columns in any edge + # cases where they are aliased. We don't say we guarantee this + # though.) dplyr::select(-"version") %>% as_epi_df(geo_type = self$geo_type, time_type = self$time_type, diff --git a/R/epi_df.R b/R/epi_df.R index c2b84c83..045c4aaf 100644 --- a/R/epi_df.R +++ b/R/epi_df.R @@ -159,7 +159,13 @@ new_epi_df = function(x = tibble::tibble(), geo_type, time_type, as_of, # Reorder columns (geo_value, time_value, ...) if(sum(dim(x)) != 0){ - x = dplyr::relocate(x, "geo_value", "time_value") + cols_to_put_first <- c("geo_value", "time_value") + x <- x[, c( + cols_to_put_first, + # All other columns + names(x)[!(names(x) %in% cols_to_put_first)] + ) + ] } # Apply epi_df class, attach metadata, and return diff --git a/R/grouped_epi_archive.R b/R/grouped_epi_archive.R index d1ddf5bf..b11bf821 100644 --- a/R/grouped_epi_archive.R +++ b/R/grouped_epi_archive.R @@ -185,7 +185,9 @@ grouped_epi_archive = #' @description Slides a given function over variables in a `grouped_epi_archive` #' object. See the documentation for the wrapper function [`epix_slide()`] for #' details. -#' @importFrom data.table key address +#' @importFrom data.table key address rbindlist setDF +#' @importFrom tibble as_tibble new_tibble validate_tibble +#' @importFrom dplyr group_by groups #' @importFrom rlang !! !!! enquo quo_is_missing enquos is_quosure sym syms #' env missing_arg slide = function(f, ..., before, ref_time_values, @@ -280,16 +282,19 @@ grouped_epi_archive = if (! (is.atomic(comp_value) || is.data.frame(comp_value))) { Abort("The slide computation must return an atomic vector or a data frame.") } + + # Label every result row with the `ref_time_value` + res <- list(time_value = ref_time_value) + # Wrap the computation output in a list and unchop/unnest later if # `as_list_col = FALSE`. This approach means that we will get a # list-class col rather than a data.frame-class col when # `as_list_col = TRUE` and the computations outputs are data # frames. - comp_value <- list(comp_value) - - # Label every result row with the `ref_time_value`: - return(tibble::tibble(time_value = .env$ref_time_value, - !!new_col := .env$comp_value)) + res[[new_col]] <- list(comp_value) + + # Convert the list to a tibble all at once for speed. + return(validate_tibble(new_tibble(res))) } # If `f` is missing, interpret ... as an expression for tidy evaluation @@ -308,7 +313,7 @@ grouped_epi_archive = } f = as_slide_computation(f, ...) - x = purrr::map_dfr(ref_time_values, function(ref_time_value) { + x = lapply(ref_time_values, function(ref_time_value) { # Ungrouped as-of data; `epi_df` if `all_versions` is `FALSE`, # `epi_archive` if `all_versions` is `TRUE`: as_of_raw = private$ungrouped$as_of(ref_time_value, min_time_value = ref_time_value - before, all_versions = all_versions) @@ -331,6 +336,13 @@ grouped_epi_archive = # copies. if (address(as_of_archive$DT) == address(private$ungrouped$DT)) { # `as_of` aliased its the full `$DT`; copy before mutating: + # + # Note: this step is probably unneeded; we're fine with + # aliasing of the DT or its columns: vanilla operations aren't + # going to mutate them in-place if they are aliases, and we're + # not performing mutation (unlike the situation with + # `fill_through_version` where we do mutate a `DT` and don't + # want aliasing). as_of_archive$DT <- copy(as_of_archive$DT) } dt_key = data.table::key(as_of_archive$DT) @@ -357,15 +369,20 @@ grouped_epi_archive = } return( - dplyr::group_by(as_of_df, dplyr::across(tidyselect::all_of(private$vars)), - .drop=private$drop) %>% - dplyr::group_modify(group_modify_fn, - f = f, ..., - ref_time_value = ref_time_value, - new_col = new_col, - .keep = TRUE) + dplyr::group_modify( + dplyr::group_by(as_of_df, !!!syms(private$vars), .drop=private$drop), + group_modify_fn, + f = f, ..., + ref_time_value = ref_time_value, + new_col = new_col, + .keep = TRUE + ) ) }) + # Combine output into a single tibble + x <- as_tibble(setDF(rbindlist(x))) + # Reconstruct groups + x <- group_by(x, !!!syms(private$vars), .drop=private$drop) # Unchop/unnest if we need to if (!as_list_col) { diff --git a/R/methods-epi_archive.R b/R/methods-epi_archive.R index c110555c..367fe759 100644 --- a/R/methods-epi_archive.R +++ b/R/methods-epi_archive.R @@ -32,13 +32,18 @@ #' x$as_of(max_version = v) #' ``` #' -#' @export +#' Mutation and aliasing: `epix_as_of` and `$as_of` will not mutate the input +#' archives, but may in some edge cases alias parts of the inputs, so copy the +#' outputs if needed before using mutating operations like `data.table`'s `:=` +#' operator. Currently, the only situation where there is potentially aliasing +#' is of the `DT` in edge cases with `all_versions = TRUE`, but this may change +#' in the future. +#' #' @examples #' # warning message of data latency shown #' epix_as_of(x = archive_cases_dv_subset, #' max_version = max(archive_cases_dv_subset$DT$version)) #' -#' @export #' @examples #' #' range(archive_cases_dv_subset$DT$version) # 2020-06-02 -- 2021-12-01 @@ -60,6 +65,8 @@ #' }, epiprocess__snapshot_as_of_clobberable_version = function(wrn) invokeRestart("muffleWarning")) #' # Since R 4.0, there is a `globalCallingHandlers` function that can be used #' # to globally toggle these warnings. +#' +#' @export epix_as_of = function(x, max_version, min_time_value = -Inf, all_versions = FALSE) { if (!inherits(x, "epi_archive")) Abort("`x` must be of class `epi_archive`.") return(x$as_of(max_version, min_time_value, all_versions = all_versions)) @@ -798,6 +805,14 @@ group_by.epi_archive = function(.data, ..., .add=FALSE, .drop=dplyr::group_by_dr #' x$slide(new_var = comp(old_var), before = 119) #' ``` #' +#' Mutation and aliasing: `epix_slide` and `$slide` will not perform in-place +#' mutation of the input archives on their own. In some edge cases the inputs it +#' feeds to the slide computations may alias parts of the input archive, so copy +#' the slide computation inputs if needed before using mutating operations like +#' `data.table`'s `:=` operator. Similarly, in some edge cases, the output of +#' the slide operation may alias parts of the input archive, so similarly, make +#' sure to clone and/or copy appropriately before using in-place mutation. +#' #' @examples #' library(dplyr) #' diff --git a/R/slide.R b/R/slide.R index 7467f219..0feb689a 100644 --- a/R/slide.R +++ b/R/slide.R @@ -300,8 +300,7 @@ epi_slide = function(x, f, ..., before, after, ref_time_values, # Count the number of appearances of each reference time value (these # appearances should all be real for now, but if we allow ref time values # outside of .data_group's time values): - counts = .data_group %>% - dplyr::filter(.data$time_value %in% time_values) %>% + counts = dplyr::filter(.data_group, .data$time_value %in% time_values) %>% dplyr::count(.data$time_value) %>% dplyr::pull(n) @@ -375,8 +374,7 @@ epi_slide = function(x, f, ..., before, after, ref_time_values, .x$.real <- NULL f(.x, .group_key, .ref_time_value, ...) } - x = x %>% - group_modify(slide_one_grp, + x = group_modify(x, slide_one_grp, f = f_wrapper, ..., starts = starts, stops = stops, diff --git a/man/epix_as_of.Rd b/man/epix_as_of.Rd index dcdf167d..51884597 100644 --- a/man/epix_as_of.Rd +++ b/man/epix_as_of.Rd @@ -45,6 +45,13 @@ is equivalent to: \if{html}{\out{