diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml
index f4b17a4b..2fca5dbd 100644
--- a/.github/workflows/R-CMD-check.yaml
+++ b/.github/workflows/R-CMD-check.yaml
@@ -4,7 +4,7 @@ on:
push:
branches: [main, master]
pull_request:
- branches: [main, master]
+ branches: [main, master, dev]
name: R-CMD-check
diff --git a/.github/workflows/pkgdown.yaml b/.github/workflows/pkgdown.yaml
index 087f0b05..847176d3 100644
--- a/.github/workflows/pkgdown.yaml
+++ b/.github/workflows/pkgdown.yaml
@@ -4,7 +4,7 @@ on:
push:
branches: [main, master]
pull_request:
- branches: [main, master]
+ branches: [main, master, dev]
release:
types: [published]
workflow_dispatch:
diff --git a/R/archive.R b/R/archive.R
index d5be0ed4..086d31b3 100644
--- a/R/archive.R
+++ b/R/archive.R
@@ -640,7 +640,7 @@ epi_archive =
slide = function(f, ..., before, ref_time_values,
time_step, new_col_name = "slide_value",
as_list_col = FALSE, names_sep = "_",
- all_rows = FALSE, all_versions = FALSE) {
+ all_versions = FALSE) {
# For an "ungrouped" slide, treat all rows as belonging to one big
# group (group by 0 vars), like `dplyr::summarize`, and let the
# resulting `grouped_epi_archive` handle the slide:
@@ -649,7 +649,7 @@ epi_archive =
before = before, ref_time_values = ref_time_values,
time_step = time_step, new_col_name = new_col_name,
as_list_col = as_list_col, names_sep = names_sep,
- all_rows = all_rows, all_versions = all_versions
+ all_versions = all_versions
) %>%
# We want a slide on ungrouped archives to output something
# ungrouped, rather than retaining the trivial (0-variable)
diff --git a/R/grouped_epi_archive.R b/R/grouped_epi_archive.R
index 1c6bd311..ddd4527f 100644
--- a/R/grouped_epi_archive.R
+++ b/R/grouped_epi_archive.R
@@ -190,7 +190,11 @@ grouped_epi_archive =
slide = function(f, ..., before, ref_time_values,
time_step, new_col_name = "slide_value",
as_list_col = FALSE, names_sep = "_",
- all_rows = FALSE, all_versions = FALSE) {
+ all_versions = FALSE) {
+ # Perform some deprecated argument checks without using ` =
+ # deprecated()` in the function signature, because they are from
+ # early development versions and much more likely to be clutter than
+ # informative in the signature.
if ("group_by" %in% nse_dots_names(...)) {
Abort("
The `group_by` argument to `slide` has been removed; please use
@@ -200,7 +204,15 @@ grouped_epi_archive =
this check is a false positive, but you will still need to use a
different column name here and rename the resulting column after
the slide.)
- ")
+ ", class = "epiprocess__epix_slide_group_by_parameter_deprecated")
+ }
+ if ("all_rows" %in% nse_dots_names(...)) {
+ Abort("
+ The `all_rows` argument has been removed from `epix_slide` (but
+ is still supported in `epi_slide`). Since `epix_slide` now
+ allows any number of rows out of slide computations, it's
+ unclear how `all_rows=TRUE` should fill in missing results.
+ ", class = "epiprocess__epix_slide_all_rows_parameter_deprecated")
}
if (missing(ref_time_values)) {
@@ -247,30 +259,14 @@ grouped_epi_archive =
if (! (rlang::is_string(names_sep) || is.null(names_sep)) ) {
Abort("`names_sep` must be a (single) string or NULL.")
}
- if (!rlang::is_bool(all_rows)) {
- Abort("`all_rows` must be TRUE or FALSE.")
- }
if (!rlang::is_bool(all_versions)) {
Abort("`all_versions` must be TRUE or FALSE.")
}
- # Each computation is expected to output a data frame with either
- # one element/row total or one element/row per encountered
- # nongrouping, nontime, nonversion key value. These nongrouping,
- # nontime, nonversion key columns can be seen as the "effective" key
- # of the computation; the computation might return an object that
- # reports a different key or no key, but the "effective" key should
- # still be a valid unique key for the data, and is something that we
- # could use even with `.keep = FALSE`.
- comp_effective_key_vars =
- setdiff(key(private$ungrouped$DT),
- c(private$vars, "time_value", "version"))
-
# Computation for one group, one time value
comp_one_grp = function(.data_group, .group_key,
f, ...,
ref_time_value,
- comp_effective_key_vars,
new_col) {
# Carry out the specified computation
comp_value = f(.data_group, .group_key, ...)
@@ -282,77 +278,12 @@ grouped_epi_archive =
.data_group = .data_group$DT
}
- # Calculate the number of output elements/rows we expect the
- # computation to output: one per distinct "effective computation
- # key variable" value encountered in the input.
- #
- # Note: this mirrors how `epi_slide` does things if we're using
- # unique keys, but can diverge if using nonunique keys. The
- # `epi_slide` approach of counting occurrences of the
- # `ref_time_value` in the `time_value` column, which helps lines
- # up the computation results with corresponding rows of the
- # input data, wouldn't quite apply here: we'd want to line up
- # with rows (from the same group) with `version` matching the
- # `ref_time_value`, but would still need to summarize these rows
- # somehow and drop the `time_value` input column, but this
- # summarization requires something like a to-be-unique output
- # key to determine a sensible number of rows to output (and the
- # contents of those rows).
- count =
- if (length(comp_effective_key_vars) != 0L) {
- comp_effective_key_vals_in_comp_input =
- if (data.table::is.data.table(.data_group)) {
- .data_group[, comp_effective_key_vars, with=FALSE]
- } else {
- .data_group[, comp_effective_key_vars]
- }
- sum(!duplicated(comp_effective_key_vals_in_comp_input))
- } else {
- # Same idea as above, but accounting for `duplicated` working
- # differently (outputting `logical(0)`) on 0-column inputs
- # rather than matching the number of rows. (Instead, we use
- # the same count we would get if we were counting distinct
- # values of a column defined as `rep(val, target_n_rows)`.)
- if (nrow(.data_group) == 0L) {
- 0L
- } else {
- 1L
- }
- }
-
- # If we get back an atomic vector
- if (is.atomic(comp_value)) {
- if (length(comp_value) == 1) {
- comp_value = rep(comp_value, count)
- }
- # If not a singleton, should be the right length, else abort
- else if (length(comp_value) != count) {
- Abort('If the slide computation returns an atomic vector, then it must have either (a) a single element, or (b) one element per distinct combination of key variables, excluding the `time_value`, `version`, and grouping variables, that is present in the first argument to the computation.')
- }
- }
-
- # If we get back a data frame
- else if (is.data.frame(comp_value)) {
- if (nrow(comp_value) == 1) {
- comp_value = rep(list(comp_value), count)
- }
- # If not a single row, should be the right length, else abort
- else if (nrow(comp_value) != count) {
- Abort("If the slide computation returns a data frame, then it must have a single row, or else one row per appearance of the reference time value in the local window.")
- }
- # Make into a list
- else {
- comp_value = split(comp_value, seq_len(nrow(comp_value)))
- }
- }
-
- # If neither an atomic vector data frame, then abort
- else {
+ if (! (is.atomic(comp_value) || is.data.frame(comp_value))) {
Abort("The slide computation must return an atomic vector or a data frame.")
}
-
+
# Label every result row with the `ref_time_value`:
- return(tibble::tibble(time_value = rep(.env$ref_time_value, count),
+ return(tibble::tibble(time_value = .env$ref_time_value,
!!new_col := .env$comp_value))
}
@@ -391,7 +322,6 @@ grouped_epi_archive =
group_modify_fn = function(.data_group, .group_key,
f, ...,
ref_time_value,
- comp_effective_key_vars,
new_col) {
# .data_group is coming from as_of_df as a tibble, but we
# want to feed `comp_one_grp` an `epi_archive` backed by a
@@ -402,7 +332,6 @@ grouped_epi_archive =
.data_group_archive$DT = .data_group
comp_one_grp(.data_group_archive, .group_key, f = f, ...,
ref_time_value = ref_time_value,
- comp_effective_key_vars = comp_effective_key_vars,
new_col = new_col
)
}
@@ -414,7 +343,6 @@ grouped_epi_archive =
dplyr::group_modify(group_modify_fn,
f = f, ...,
ref_time_value = ref_time_value,
- comp_effective_key_vars = comp_effective_key_vars,
new_col = new_col,
.keep = TRUE)
)
@@ -501,13 +429,18 @@ grouped_epi_archive =
if (!as_list_col) {
x = tidyr::unnest(x, !!new_col, names_sep = names_sep)
}
-
- # Join to get all rows, if we need to, then return
- if (all_rows) {
- cols = c(private$vars, "time_value")
- y = unique(private$ungrouped$DT[, ..cols])
- x = dplyr::left_join(y, x, by = cols)
+
+ if (is_epi_df(x)) {
+ # The analogue of `epi_df`'s `as_of` metadata for an archive is
+ # `$versions_end`, at least in the current absence of
+ # separate fields/columns denoting the "archive version" with a
+ # different resolution, or from the perspective of a different
+ # stage of a data pipeline. The `as_of` that is automatically
+ # derived won't always match; override:
+
+ attr(x, "metadata")[["as_of"]] <- private$ungrouped$versions_end
}
+
return(x)
}
)
diff --git a/R/methods-epi_archive.R b/R/methods-epi_archive.R
index 348c5c01..420e5f1c 100644
--- a/R/methods-epi_archive.R
+++ b/R/methods-epi_archive.R
@@ -478,16 +478,17 @@ epix_detailed_restricted_mutate = function(.data, ...) {
} else {
# Have `dplyr` do the `dplyr_col_modify`, keeping the column-level-aliasing
# and must-copy-on-write-if-refcount-more-than-1 model, obtaining a tibble,
- # then `setDT`-ing it in place to be a `data.table`. The key should still be
- # valid (assuming that the user did not explicitly alter `key(.data$DT)` or
- # the columns by reference somehow within `...` tidyeval-style computations,
- # or trigger refcount-1 alterations due to still having >1 refcounts on the
- # columns), so in between, set the "sorted" attribute accordingly to prevent
- # attempted sorting (including potential extra copies) or sortedness
- # checking, then `setDT`.
- out_DT = dplyr::dplyr_col_modify(in_tbl, col_modify_cols) # tibble
- data.table::setattr(out_DT, "sorted", data.table::key(.data$DT))
- data.table::setDT(out_DT, key=key(.data$DT))
+ # then convert it into a `data.table`. The key should still be valid
+ # (assuming that the user did not explicitly alter `key(.data$DT)` or the
+ # columns by reference somehow within `...` tidyeval-style computations, or
+ # trigger refcount-1 alterations due to still having >1 refcounts on the
+ # columns), set the "sorted" attribute accordingly to prevent attempted
+ # sorting (including potential extra copies) or sortedness checking, then
+ # `setDT` (rather than `as.data.table`, in order to prevent column copying
+ # to establish ownership according to `data.table`'s memory model).
+ out_DT = dplyr::dplyr_col_modify(in_tbl, col_modify_cols) %>%
+ data.table::setattr("sorted", data.table::key(.data$DT)) %>%
+ data.table::setDT(key=key(.data$DT))
out_archive = .data$clone()
out_archive$DT <- out_DT
request_names = names(col_modify_cols)
@@ -512,13 +513,13 @@ epix_detailed_restricted_mutate = function(.data, ...) {
#'
#' @param .data An `epi_archive` or `grouped_epi_archive`
#' @param ... Similar to [`dplyr::group_by`] (see "Details:" for edge cases);
-#' * In `group_by`: unquoted variable name(s) or other ["data
-#' masking"][dplyr::dplyr_data_masking] expression(s). It's possible to use
-#' [`dplyr::mutate`]-like syntax here to calculate new columns on which to
+#' * For `group_by`: unquoted variable name(s) or other
+#' ["data masking"][dplyr::dplyr_data_masking] expression(s). It's possible to
+#' use [`dplyr::mutate`]-like syntax here to calculate new columns on which to
#' perform grouping, but note that, if you are regrouping an already-grouped
#' `.data` object, the calculations will be carried out ignoring such grouping
#' (same as [in dplyr][dplyr::group_by]).
-#' * In `ungroup`: either
+#' * For `ungroup`: either
#' * empty, in order to remove the grouping and output an `epi_archive`; or
#' * variable name(s) or other ["tidy-select"][dplyr::dplyr_tidy_select]
#' expression(s), in order to remove the matching variables from the list of
@@ -527,12 +528,13 @@ epix_detailed_restricted_mutate = function(.data, ...) {
#' the variable selection from `...` only; if `TRUE`, the output will be
#' grouped by the current grouping variables plus the variable selection from
#' `...`.
-#' @param .drop As in [`dplyr::group_by`]; determines treatment of factor
-#' columns.
-#' @param x a `grouped_epi_archive`, or, in `is_grouped_epi_archive`, any object
-#' @param .tbl An `epi_archive` or `grouped_epi_archive` (`epi_archive`
-#' dispatches to the S3 default method, and `grouped_epi_archive` dispatches
-#' its own S3 method)
+#' @param .drop As described in [`dplyr::group_by`]; determines treatment of
+#' factor columns.
+#' @param x For `groups` or `ungroup`: a `grouped_epi_archive`; for
+#' `is_grouped_epi_archive`: any object
+#' @param .tbl (For `group_by_drop_default`:) an `epi_archive` or
+#' `grouped_epi_archive` (`epi_archive` dispatches to the S3 default method;
+#' `grouped_epi_archive` dispatches its own S3 method)
#'
#' @details
#'
@@ -614,19 +616,10 @@ epix_detailed_restricted_mutate = function(.data, ...) {
#' # To get the grouping variable names as a `list` of `name`s (a.k.a. symbols):
#' toy_archive %>% group_by(geo_value) %>% groups()
#'
-#' # `.drop = FALSE` is supported in a sense; `f` is called on 0-row inputs for
-#' # the missing groups identified by `dplyr`, but the row-recycling rules will
-#' # exclude the corresponding outputs of `f` from the output of the slide:
-#' all.equal(
-#' toy_archive %>%
-#' group_by(geo_value, age_group, .drop=FALSE) %>%
-#' epix_slide(f = ~ sum(.x$value), before = 20) %>%
-#' ungroup(),
-#' toy_archive %>%
-#' group_by(geo_value, age_group, .drop=TRUE) %>%
-#' epix_slide(f = ~ sum(.x$value), before = 20) %>%
-#' ungroup()
-#' )
+#' toy_archive %>%
+#' group_by(geo_value, age_group, .drop=FALSE) %>%
+#' epix_slide(f = ~ sum(.x$value), before = 20) %>%
+#' ungroup()
#'
#' @importFrom dplyr group_by
#' @export
@@ -635,6 +628,20 @@ epix_detailed_restricted_mutate = function(.data, ...) {
group_by.epi_archive = function(.data, ..., .add=FALSE, .drop=dplyr::group_by_drop_default(.data)) {
# `add` makes no difference; this is an ungrouped `epi_archive`.
detailed_mutate = epix_detailed_restricted_mutate(.data, ...)
+ if (!rlang::is_bool(.drop)) {
+ Abort("`.drop` must be TRUE or FALSE")
+ } else if (!.drop) {
+ grouping_cols = as.list(detailed_mutate[["archive"]][["DT"]])[detailed_mutate[["request_names"]]]
+ grouping_col_is_factor = purrr::map_lgl(grouping_cols, is.factor)
+ # ^ Use `as.list` to try to avoid any possibility of a deep copy.
+ if (!any(grouping_col_is_factor)) {
+ Warn("`.drop=FALSE` but there are no factor grouping columns; did you mean to convert one of the columns to a factor beforehand?",
+ class = "epiprocess__group_by_epi_archive_drop_FALSE_no_factors")
+ } else if (any(diff(grouping_col_is_factor) == -1L)) {
+ Warn("`.drop=FALSE` but there are one or more non-factor grouping columns listed after a factor grouping column; this may produce groups with `NA`s for these columns; see https://github.com/tidyverse/dplyr/issues/5369#issuecomment-683762553; depending on how you want completion to work, you might instead want to convert all grouping columns to factors beforehand, specify the non-factor grouping columns first, or use `.drop=TRUE` and add a call to `tidyr::complete`.",
+ class = "epiprocess__group_by_epi_archive_drop_FALSE_nonfactor_after_factor")
+ }
+ }
grouped_epi_archive$new(detailed_mutate[["archive"]],
detailed_mutate[["request_names"]],
drop = .drop)
@@ -707,15 +714,12 @@ group_by.epi_archive = function(.data, ..., .add=FALSE, .drop=dplyr::group_by_dr
#' @param names_sep String specifying the separator to use in `tidyr::unnest()`
#' when `as_list_col = FALSE`. Default is "_". Using `NULL` drops the prefix
#' from `new_col_name` entirely.
-#' @param all_rows If `all_rows = TRUE`, then the output will have one row per
-#' combination of grouping variables and unique time values in the underlying
-#' data table. Otherwise, there will be one row in the output for each time
-#' value in `x` that acts as a reference time value. Default is `FALSE`.
-#' @param all_versions If `all_versions = TRUE`, then `f` will be passed the
-#' version history (all `version <= ref_time_value`) for rows having
-#' `time_value` between `ref_time_value - before` and `ref_time_value`.
-#' Otherwise, `f` will be passed only the most recent `version` for every
-#' unique `time_value`. Default is `FALSE`.
+#' @param all_versions (Not the same as `all_rows` parameter of `epi_slide`.) If
+#' `all_versions = TRUE`, then `f` will be passed the version history (all
+#' `version <= ref_time_value`) for rows having `time_value` between
+#' `ref_time_value - before` and `ref_time_value`. Otherwise, `f` will be
+#' passed only the most recent `version` for every unique `time_value`.
+#' Default is `FALSE`.
#' @return A tibble whose columns are: the grouping variables, `time_value`,
#' containing the reference time values for the slide computation, and a
#' column named according to the `new_col_name` argument, containing the slide
@@ -736,32 +740,34 @@ group_by.epi_archive = function(.data, ..., .add=FALSE, .drop=dplyr::group_by_dr
#' ends); `epi_slide` windows extend from `before` time steps before a
#' `ref_time_value` through `after` time steps after `ref_time_value`.
#' 3. The input class and columns are similar but different: `epix_slide`
-#' keeps all columns and the `epi_df`-ness of the first input to the
-#' computation; `epi_slide` only provides the grouping variables in the second
-#' input, and will convert the first input into a regular tibble if the
-#' grouping variables include the essential `geo_value` column.
+#' (with the default `all_versions=FALSE`) keeps all columns and the
+#' `epi_df`-ness of the first argument to each computation; `epi_slide` only
+#' provides the grouping variables in the second input, and will convert the
+#' first input into a regular tibble if the grouping variables include the
+#' essential `geo_value` column. (With `all_versions=TRUE`, `epix_slide` will
+#' will provide an `epi_archive` rather than an `epi-df` to each
+#' computation.)
#' 4. The output class and columns are similar but different: `epix_slide()`
-#' returns a tibble containing only the grouping variables, `time_value`, and
-#' the new column(s) from the slide computation `f`, whereas `epi_slide()`
-#' returns an `epi_df` with all original variables plus the new columns from
-#' the slide computation.
-#' 5. Unless grouping by `geo_value` and all `other_keys`, there will be
-#' row-recyling behavior meant to resemble `epi_slide`'s results, based on the
-#' distinct combinations of `geo_value`, `time_value`, and all `other_keys`
-#' present in the version data with `time_value` matching one of the
-#' `ref_time_values`. However, due to reporting latency or reporting dropping
-#' in and out, this may not exactly match the behavior of "corresponding"
-#' `epi_df`s.
-#' 6. Similar to the row recyling, while `all_rows=TRUE` is designed to mimic
-#' `epi_slide` by completing based on distinct combinations of `geo_value`,
-#' `time_value`, and all `other_keys` present in the version data with
-#' `time_value` matching one of the `ref_time_values`, this can have unexpected
-#' behaviors due reporting latency or reporting dropping in and out.
+#' returns an `epi_df` or tibble containing only the grouping variables,
+#' `time_value`, and the new column(s) from the slide computations, whereas
+#' `epi_slide()` returns an `epi_df` with all original variables plus the new
+#' columns from the slide computations.
+#' 5. There are no size stability checks or element/row recycling to maintain
+#' size stability in `epix_slide`, unlike in `epi_slide`. (`epix_slide` is
+#' roughly analogous to [`dplyr::reframe`] in `dplyr` 1.1.0
+#' or[`dplyr::summarize`] in `dplyr` 1.0.0, while `epi_slide` is roughly
+#' analogous to `dplyr::mutate` followed by `dplyr::arrange`) This is detailed
+#' in the "advanced" vignette.
+#' 6. `all_rows` is not supported in `epix_slide`; since the slide
+#' computations are allowed more flexibility in their outputs than in
+#' `epi_slide`, we can't guess a good representation for missing computations
+#' for excluded group-`ref_time_value` pairs.
#' 7. The `ref_time_values` default for `epix_slide` is based on making an
#' evenly-spaced sequence out of the `version`s in the `DT` plus the
#' `versions_end`, rather than the `time_value`s.
-#' Apart from this, the interfaces between `epix_slide()` and `epi_slide()` are
-#' the same.
+#'
+#' Apart from the above distinctions, the interfaces between `epix_slide()` and
+#' `epi_slide()` are the same.
#'
#' Furthermore, the current function can be considerably slower than
#' `epi_slide()`, for two reasons: (1) it must repeatedly fetch
@@ -840,7 +846,7 @@ group_by.epi_archive = function(.data, ..., .add=FALSE, .drop=dplyr::group_by_dr
epix_slide = function(x, f, ..., before, ref_time_values,
time_step, new_col_name = "slide_value",
as_list_col = FALSE, names_sep = "_",
- all_rows = FALSE, all_versions = FALSE) {
+ all_versions = FALSE) {
if (!is_epi_archive(x, grouped_okay=TRUE)) {
Abort("`x` must be of class `epi_archive` or `grouped_epi_archive`.")
}
@@ -850,7 +856,6 @@ epix_slide = function(x, f, ..., before, ref_time_values,
new_col_name = new_col_name,
as_list_col = as_list_col,
names_sep = names_sep,
- all_rows = all_rows,
all_versions = all_versions
))
}
diff --git a/_pkgdown.yml b/_pkgdown.yml
index d9e2ec79..bba3ea8d 100644
--- a/_pkgdown.yml
+++ b/_pkgdown.yml
@@ -56,7 +56,7 @@ reference:
desc: Details on `epi_archive`, and basic functionality.
- contents:
- matches("archive")
-- title: `epix_*()` functions
+- title: "`epix_*()` functions"
desc: Functions that act on an `epi_archive` and/or `grouped_epi_archive` object.
- contents:
- starts_with("epix")
diff --git a/man/epi_archive.Rd b/man/epi_archive.Rd
index 8ae412a6..a4a58645 100644
--- a/man/epi_archive.Rd
+++ b/man/epi_archive.Rd
@@ -322,7 +322,6 @@ details.
new_col_name = "slide_value",
as_list_col = FALSE,
names_sep = "_",
- all_rows = FALSE,
all_versions = FALSE
)}\if{html}{\out{}}
}
diff --git a/man/epix_slide.Rd b/man/epix_slide.Rd
index 54d16e86..116dd657 100644
--- a/man/epix_slide.Rd
+++ b/man/epix_slide.Rd
@@ -14,7 +14,6 @@ epix_slide(
new_col_name = "slide_value",
as_list_col = FALSE,
names_sep = "_",
- all_rows = FALSE,
all_versions = FALSE
)
}
@@ -84,16 +83,12 @@ by prepending \code{new_col_name} to the names of the list elements.}
when \code{as_list_col = FALSE}. Default is "_". Using \code{NULL} drops the prefix
from \code{new_col_name} entirely.}
-\item{all_rows}{If \code{all_rows = TRUE}, then the output will have one row per
-combination of grouping variables and unique time values in the underlying
-data table. Otherwise, there will be one row in the output for each time
-value in \code{x} that acts as a reference time value. Default is \code{FALSE}.}
-
-\item{all_versions}{If \code{all_versions = TRUE}, then \code{f} will be passed the
-version history (all \code{version <= ref_time_value}) for rows having
-\code{time_value} between \code{ref_time_value - before} and \code{ref_time_value}.
-Otherwise, \code{f} will be passed only the most recent \code{version} for every
-unique \code{time_value}. Default is \code{FALSE}.}
+\item{all_versions}{(Not the same as \code{all_rows} parameter of \code{epi_slide}.) If
+\code{all_versions = TRUE}, then \code{f} will be passed the version history (all
+\code{version <= ref_time_value}) for rows having \code{time_value} between
+\code{ref_time_value - before} and \code{ref_time_value}. Otherwise, \code{f} will be
+passed only the most recent \code{version} for every unique \code{time_value}.
+Default is \code{FALSE}.}
}
\value{
A tibble whose columns are: the grouping variables, \code{time_value},
@@ -126,34 +121,36 @@ time interval (e.g., day) are only published after that time interval
ends); \code{epi_slide} windows extend from \code{before} time steps before a
\code{ref_time_value} through \code{after} time steps after \code{ref_time_value}.
\item The input class and columns are similar but different: \code{epix_slide}
-keeps all columns and the \code{epi_df}-ness of the first input to the
-computation; \code{epi_slide} only provides the grouping variables in the second
-input, and will convert the first input into a regular tibble if the
-grouping variables include the essential \code{geo_value} column.
+(with the default \code{all_versions=FALSE}) keeps all columns and the
+\code{epi_df}-ness of the first argument to each computation; \code{epi_slide} only
+provides the grouping variables in the second input, and will convert the
+first input into a regular tibble if the grouping variables include the
+essential \code{geo_value} column. (With \code{all_versions=TRUE}, \code{epix_slide} will
+will provide an \code{epi_archive} rather than an \code{epi-df} to each
+computation.)
\item The output class and columns are similar but different: \code{epix_slide()}
-returns a tibble containing only the grouping variables, \code{time_value}, and
-the new column(s) from the slide computation \code{f}, whereas \code{epi_slide()}
-returns an \code{epi_df} with all original variables plus the new columns from
-the slide computation.
-\item Unless grouping by \code{geo_value} and all \code{other_keys}, there will be
-row-recyling behavior meant to resemble \code{epi_slide}'s results, based on the
-distinct combinations of \code{geo_value}, \code{time_value}, and all \code{other_keys}
-present in the version data with \code{time_value} matching one of the
-\code{ref_time_values}. However, due to reporting latency or reporting dropping
-in and out, this may not exactly match the behavior of "corresponding"
-\code{epi_df}s.
-\item Similar to the row recyling, while \code{all_rows=TRUE} is designed to mimic
-\code{epi_slide} by completing based on distinct combinations of \code{geo_value},
-\code{time_value}, and all \code{other_keys} present in the version data with
-\code{time_value} matching one of the \code{ref_time_values}, this can have unexpected
-behaviors due reporting latency or reporting dropping in and out.
+returns an \code{epi_df} or tibble containing only the grouping variables,
+\code{time_value}, and the new column(s) from the slide computations, whereas
+\code{epi_slide()} returns an \code{epi_df} with all original variables plus the new
+columns from the slide computations.
+\item There are no size stability checks or element/row recycling to maintain
+size stability in \code{epix_slide}, unlike in \code{epi_slide}. (\code{epix_slide} is
+roughly analogous to \code{\link[dplyr:reframe]{dplyr::reframe}} in \code{dplyr} 1.1.0
+or\code{\link[dplyr:summarise]{dplyr::summarize}} in \code{dplyr} 1.0.0, while \code{epi_slide} is roughly
+analogous to \code{dplyr::mutate} followed by \code{dplyr::arrange}) This is detailed
+in the "advanced" vignette.
+\item \code{all_rows} is not supported in \code{epix_slide}; since the slide
+computations are allowed more flexibility in their outputs than in
+\code{epi_slide}, we can't guess a good representation for missing computations
+for excluded group-\code{ref_time_value} pairs.
\item The \code{ref_time_values} default for \code{epix_slide} is based on making an
evenly-spaced sequence out of the \code{version}s in the \code{DT} plus the
\code{versions_end}, rather than the \code{time_value}s.
-Apart from this, the interfaces between \code{epix_slide()} and \code{epi_slide()} are
-the same.
}
+Apart from the above distinctions, the interfaces between \code{epix_slide()} and
+\code{epi_slide()} are the same.
+
Furthermore, the current function can be considerably slower than
\code{epi_slide()}, for two reasons: (1) it must repeatedly fetch
properly-versioned snapshots from the data archive (via its \code{as_of()}
diff --git a/man/group_by.epi_archive.Rd b/man/group_by.epi_archive.Rd
index dac7ba0d..aee0a07b 100644
--- a/man/group_by.epi_archive.Rd
+++ b/man/group_by.epi_archive.Rd
@@ -27,12 +27,13 @@ is_grouped_epi_archive(x)
\item{...}{Similar to \code{\link[dplyr:group_by]{dplyr::group_by}} (see "Details:" for edge cases);
\itemize{
-\item In \code{group_by}: unquoted variable name(s) or other \link[dplyr:dplyr_data_masking]{"data masking"} expression(s). It's possible to use
-\code{\link[dplyr:mutate]{dplyr::mutate}}-like syntax here to calculate new columns on which to
+\item For \code{group_by}: unquoted variable name(s) or other
+\link[dplyr:dplyr_data_masking]{"data masking"} expression(s). It's possible to
+use \code{\link[dplyr:mutate]{dplyr::mutate}}-like syntax here to calculate new columns on which to
perform grouping, but note that, if you are regrouping an already-grouped
\code{.data} object, the calculations will be carried out ignoring such grouping
(same as \link[dplyr:group_by]{in dplyr}).
-\item In \code{ungroup}: either
+\item For \code{ungroup}: either
\itemize{
\item empty, in order to remove the grouping and output an \code{epi_archive}; or
\item variable name(s) or other \link[dplyr:dplyr_tidy_select]{"tidy-select"}
@@ -46,14 +47,15 @@ the variable selection from \code{...} only; if \code{TRUE}, the output will be
grouped by the current grouping variables plus the variable selection from
\code{...}.}
-\item{.drop}{As in \code{\link[dplyr:group_by]{dplyr::group_by}}; determines treatment of factor
-columns.}
+\item{.drop}{As described in \code{\link[dplyr:group_by]{dplyr::group_by}}; determines treatment of
+factor columns.}
-\item{x}{a \code{grouped_epi_archive}, or, in \code{is_grouped_epi_archive}, any object}
+\item{x}{For \code{groups} or \code{ungroup}: a \code{grouped_epi_archive}; for
+\code{is_grouped_epi_archive}: any object}
-\item{.tbl}{An \code{epi_archive} or \code{grouped_epi_archive} (\code{epi_archive}
-dispatches to the S3 default method, and \code{grouped_epi_archive} dispatches
-its own S3 method)}
+\item{.tbl}{(For \code{group_by_drop_default}:) an \code{epi_archive} or
+\code{grouped_epi_archive} (\code{epi_archive} dispatches to the S3 default method;
+\code{grouped_epi_archive} dispatches its own S3 method)}
}
\description{
\code{group_by} and related methods for \code{epi_archive}, \code{grouped_epi_archive}
@@ -137,18 +139,9 @@ toy_archive \%>\% group_by(geo_value, age_group) \%>\% ungroup(age_group)
# To get the grouping variable names as a `list` of `name`s (a.k.a. symbols):
toy_archive \%>\% group_by(geo_value) \%>\% groups()
-# `.drop = FALSE` is supported in a sense; `f` is called on 0-row inputs for
-# the missing groups identified by `dplyr`, but the row-recycling rules will
-# exclude the corresponding outputs of `f` from the output of the slide:
-all.equal(
- toy_archive \%>\%
- group_by(geo_value, age_group, .drop=FALSE) \%>\%
- epix_slide(f = ~ sum(.x$value), before = 20) \%>\%
- ungroup(),
- toy_archive \%>\%
- group_by(geo_value, age_group, .drop=TRUE) \%>\%
- epix_slide(f = ~ sum(.x$value), before = 20) \%>\%
- ungroup()
-)
+toy_archive \%>\%
+ group_by(geo_value, age_group, .drop=FALSE) \%>\%
+ epix_slide(f = ~ sum(.x$value), before = 20) \%>\%
+ ungroup()
}
diff --git a/tests/testthat/test-deprecations.R b/tests/testthat/test-deprecations.R
new file mode 100644
index 00000000..334b4488
--- /dev/null
+++ b/tests/testthat/test-deprecations.R
@@ -0,0 +1,48 @@
+
+test_that("epix_slide group_by= deprecation works",{
+ expect_error(
+ archive_cases_dv_subset %>%
+ epix_slide(function(...) {}, before=2L, group_by=c()),
+ class = "epiprocess__epix_slide_group_by_parameter_deprecated"
+ )
+ expect_error(
+ archive_cases_dv_subset$
+ slide(function(...) {}, before=2L, group_by=c()),
+ class = "epiprocess__epix_slide_group_by_parameter_deprecated"
+ )
+ expect_error(
+ archive_cases_dv_subset %>%
+ group_by(geo_value) %>%
+ epix_slide(function(...) {}, before=2L, group_by=c()),
+ class = "epiprocess__epix_slide_group_by_parameter_deprecated"
+ )
+ expect_error(
+ archive_cases_dv_subset$
+ group_by(geo_value)$
+ slide(function(...) {}, before=2L, group_by=c()),
+ class = "epiprocess__epix_slide_group_by_parameter_deprecated"
+ )
+ #
+ expect_error(
+ archive_cases_dv_subset %>%
+ epix_slide(function(...) {}, before=2L, all_rows=TRUE),
+ class = "epiprocess__epix_slide_all_rows_parameter_deprecated"
+ )
+ expect_error(
+ archive_cases_dv_subset$
+ slide(function(...) {}, before=2L, all_rows=TRUE),
+ class = "epiprocess__epix_slide_all_rows_parameter_deprecated"
+ )
+ expect_error(
+ archive_cases_dv_subset %>%
+ group_by(geo_value) %>%
+ epix_slide(function(...) {}, before=2L, all_rows=TRUE),
+ class = "epiprocess__epix_slide_all_rows_parameter_deprecated"
+ )
+ expect_error(
+ archive_cases_dv_subset$
+ group_by(geo_value)$
+ slide(function(...) {}, before=2L, all_rows=TRUE),
+ class = "epiprocess__epix_slide_all_rows_parameter_deprecated"
+ )
+})
diff --git a/tests/testthat/test-epix_slide.R b/tests/testthat/test-epix_slide.R
index 9ef2f9af..c352f90a 100644
--- a/tests/testthat/test-epix_slide.R
+++ b/tests/testthat/test-epix_slide.R
@@ -27,7 +27,7 @@ test_that("epix_slide works as intended",{
2^6+2^3,
2^10+2^9,
2^15+2^14)) %>%
- as_epi_df(as_of = 4) %>% # Also a bug (issue #213)
+ as_epi_df(as_of = 7) %>%
group_by(geo_value)
expect_identical(xx1,xx2) # *
@@ -348,3 +348,18 @@ test_that("epix_slide with all_versions option works as intended",{
expect_identical(xx1,xx3) # This and * Imply xx2 and xx3 are identical
})
+
+test_that("`epix_slide` uses `versions_end` as a resulting `epi_df`'s `as_of`", {
+ ea_updated_stale = ea$clone()
+ ea_updated_stale$versions_end <- ea_updated_stale$versions_end + 3 # (dbl)
+ #
+ expect_identical(
+ ea_updated_stale %>%
+ group_by(geo_value) %>%
+ epix_slide(~ slice_head(.x, n = 1L), before = 10L) %>%
+ ungroup() %>%
+ attr("metadata") %>%
+ .$as_of,
+ 10
+ )
+})
diff --git a/tests/testthat/test-grouped_epi_archive.R b/tests/testthat/test-grouped_epi_archive.R
new file mode 100644
index 00000000..64c482bf
--- /dev/null
+++ b/tests/testthat/test-grouped_epi_archive.R
@@ -0,0 +1,80 @@
+test_that("Grouping, regrouping, and ungrouping archives works as intended", {
+ # From an example:
+ library(dplyr)
+ toy_archive =
+ tribble(
+ ~geo_value, ~age_group, ~time_value, ~version, ~value,
+ "us", "adult", "2000-01-01", "2000-01-02", 121,
+ "us", "pediatric", "2000-01-02", "2000-01-03", 5, # (addition)
+ "us", "adult", "2000-01-01", "2000-01-03", 125, # (revision)
+ "us", "adult", "2000-01-02", "2000-01-03", 130 # (addition)
+ ) %>%
+ mutate(age_group = ordered(age_group, c("pediatric", "adult")),
+ time_value = as.Date(time_value),
+ version = as.Date(version)) %>%
+ as_epi_archive(other_keys = "age_group")
+
+ # Ensure that we're using testthat edition 3's idea of "identical", which is
+ # not as strict as `identical`:
+ testthat::local_edition(3)
+
+ # Test equivalency claims in example:
+ by_both_keys = toy_archive %>% group_by(geo_value, age_group)
+ expect_identical(
+ by_both_keys,
+ toy_archive %>% group_by(geo_value) %>% group_by(age_group, .add=TRUE)
+ )
+ grouping_cols = c("geo_value", "age_group")
+ expect_identical(
+ by_both_keys,
+ toy_archive %>% group_by(across(all_of(grouping_cols)))
+ )
+
+ expect_identical(
+ toy_archive %>% group_by(geo_value),
+ toy_archive %>% group_by(geo_value, age_group) %>% ungroup(age_group)
+ )
+
+ # Test `.drop` behavior:
+ expect_error(toy_archive %>% group_by(.drop = "bogus"),
+ regexp = "\\.drop.*TRUE or FALSE")
+ expect_warning(toy_archive %>% group_by(.drop=FALSE),
+ class="epiprocess__group_by_epi_archive_drop_FALSE_no_factors")
+ expect_warning(toy_archive %>% group_by(geo_value, .drop=FALSE),
+ class="epiprocess__group_by_epi_archive_drop_FALSE_no_factors")
+ expect_warning(grouped_factor_then_nonfactor <-
+ toy_archive %>% group_by(age_group, geo_value, .drop=FALSE),
+ class="epiprocess__group_by_epi_archive_drop_FALSE_nonfactor_after_factor")
+ expect_identical(grouped_factor_then_nonfactor %>%
+ epix_slide(before = 10, s = sum(value)) %>%
+ ungroup(),
+ tibble::tribble(
+ ~age_group, ~geo_value, ~time_value, ~s,
+ "pediatric", NA_character_, "2000-01-02", 0,
+ "adult", "us", "2000-01-02", 121,
+ "pediatric", "us", "2000-01-03", 5,
+ "adult", "us", "2000-01-03", 255) %>%
+ mutate(age_group = ordered(age_group, c("pediatric", "adult")),
+ time_value = as.Date(time_value)) %>%
+ as_epi_df(geo_type = "nation", # bug; want "custom" from NA; issue #242
+ as_of = as.Date("2000-01-03"),
+ additional_metadata = list(other_keys = "age_group")) %>%
+ # put back in expected order; see issue #166:
+ select(age_group, geo_value, time_value, s))
+ expect_identical(toy_archive %>%
+ group_by(geo_value, age_group, .drop=FALSE) %>%
+ epix_slide(before = 10, s = sum(value)) %>%
+ ungroup(),
+ tibble::tribble(
+ ~geo_value, ~age_group, ~time_value, ~s,
+ "us", "pediatric", "2000-01-02", 0,
+ "us", "adult", "2000-01-02", 121,
+ "us", "pediatric", "2000-01-03", 5,
+ "us", "adult", "2000-01-03", 255) %>%
+ mutate(age_group = ordered(age_group, c("pediatric", "adult")),
+ time_value = as.Date(time_value)) %>%
+ as_epi_df(as_of = as.Date("2000-01-03"),
+ additional_metadata = list(other_keys = "age_group")) %>%
+ # put back in expected order; see issue #166:
+ select(geo_value, age_group, time_value, s))
+})
diff --git a/vignettes/advanced.Rmd b/vignettes/advanced.Rmd
index ee0751e2..9580bee7 100644
--- a/vignettes/advanced.Rmd
+++ b/vignettes/advanced.Rmd
@@ -7,21 +7,62 @@ vignette: >
%\VignetteEncoding{UTF-8}
---
-In this vignette, we discuss how to use the sliding functionality in the
-`epiprocess` package with computations that have advanced output structures.
-
-In general, the functions `epi_slide()` and `epix_slide()` do what they can to
-ensure the result of a slide operation is *size stable*, meaning, it will return
-something whose length is the same as the number of appearances of reference
-time values for the slide computation in the given data frame/table (this
-defaults to all time values, but can be some given subset when `ref_time_values`
-is specified).
-
-The output of a slide computation should either be an atomic value/vector, or a
-data frame. This data frame can have multiple columns, multiple rows, or both.
-Below we demonstrate some advanced use cases of sliding with these output
-structures. We focus on `epi_slide()` for the most part, though the behavior we
-demonstrate also carries over to `epix_slide()`.
+In this vignette, we discuss how to use the sliding functionality in the
+`epiprocess` package with less common grouping schemes or with computations that
+have advanced output structures.
+The output of a slide computation should either be an atomic value/vector, or a
+data frame. This data frame can have multiple columns, multiple rows, or both.
+
+During basic usage (e.g., when all optional arguments are set to their defaults):
+
+* `epi_slide(edf, , .....)`:
+
+ * keeps **all** columns of `edf`, adds computed column(s)
+ * outputs one row **for every row in `edf`** (recycling outputs from
+ computations appropriately if there are multiple time series bundled
+ together inside any group(s))
+ * is roughly analogous to (the non-sliding) **`dplyr::mutate` followed by
+ `dplyr::arrange(time_value, .by_group = TRUE)`**
+
+* `epix_slide(ea, , .....)`:
+
+ * keeps **grouping and `time_value`** columns of `ea`, adds computed
+ column(s)
+ * outputs one row **for element/row output from the computations**
+ * is roughly analogous to (the non-sliding) **`dplyr::reframe`** (or
+ `dplyr::summarize`, before the `dplyr` 1.1.0
+ [update](https://www.tidyverse.org/blog/2023/02/dplyr-1-1-0-pick-reframe-arrange/#reframe))
+
+These differences in basic behavior make some common slide operations require less boilerplate:
+* predictors and targets calculated with `epi_slide` are automatically lined up
+ with each other and with the signals from which they were calculated; and
+* computations for an `epix_slide` can output data frames with any number of
+ rows, containing models, forecasts, evaluations, etc., and will not be
+ recycled.
+
+When using more advanced features, more complex rules apply:
+
+* Generalization: `epi_slide(edf, ....., ref_time_values=my_ref_time_values)`
+ will output one row for every row in `edf` with `time_value` appearing inside
+ `ref_time_values`, and is analogous to a `dplyr::mutate`&`dplyr::arrange`
+ followed by `dplyr::filter` to those `ref_time_values`. We call this property
+ **size stability**, and describe how it is achieved in the following sections.
+ The default behavior described above is a special case of this general rule
+ based on a default value of `ref_time_values`.
+* Exception/feature: `epi_slide(edf, ....., ref_time_values=my_ref_time_values,
+ all_rows=TRUE)` will not just output rows for the given `ref_time_values`, but
+ instead will output one row per row in `edf`.
+* Exception/feature: `epix_slide(ea, ....., as_list_col=TRUE)` will output one
+ row per computation that outputs a data frame, even when these data frames
+ have arbitrary numbers of rows.
+* Clarification: `ea %>% group_by(....., .drop=FALSE) %>%
+ epix_slide(, .....)` will call the computation on any missing
+ groups according to `dplyr`'s `.drop=FALSE` rules, resulting in additional
+ output rows.
+
+Below we demonstrate some advanced use cases of sliding with different output
+structures. We focus on `epi_slide()` for the most part, though some of the
+behavior we demonstrate also carries over to `epix_slide()`.
## Recycling outputs
@@ -35,7 +76,7 @@ simple synthetic example.
library(epiprocess)
library(dplyr)
-df <- tibble(
+edf <- tibble(
geo_value = rep(c("ca", "fl", "pa"), each = 3),
time_value = rep(seq(as.Date("2020-06-01"), as.Date("2020-06-03"),
by = "day"), length.out = length(geo_value)),
@@ -44,29 +85,31 @@ df <- tibble(
as_epi_df()
# 2-day trailing average, per geo value
-df %>%
+edf %>%
group_by(geo_value) %>%
epi_slide(x_2dav = mean(x), before = 1) %>%
ungroup()
# 2-day trailing average, marginally
-df %>%
+edf %>%
epi_slide(x_2dav = mean(x), before = 1)
```
```{r, include = FALSE}
# More checks (not included)
-df %>%
+edf %>%
epi_slide(x_2dav = mean(x), before = 1, ref_time_values = as.Date("2020-06-02"))
-df %>%
+edf %>%
+ # pretend that observations about time_value t are reported in version t (nowcasts)
mutate(version = time_value) %>%
as_epi_archive() %>%
group_by(geo_value) %>%
epix_slide(x_2dav = mean(x), before = 1, ref_time_values = as.Date("2020-06-02")) %>%
ungroup()
-df %>%
+edf %>%
+ # pretend that observations about time_value t are reported in version t (nowcasts)
mutate(version = time_value) %>%
as_epi_archive() %>%
group_by(geo_value) %>%
@@ -80,7 +123,7 @@ so, uses it to fill the new column. For example, this next computation gives the
same result as the last one.
```{r}
-df %>%
+edf %>%
epi_slide(y_2dav = rep(mean(x), 3), before = 1)
```
@@ -89,7 +132,7 @@ is *not* size stable, then `epi_slide()` throws an error. For example, below we
are trying to return 2 things for 3 states.
```{r, error = TRUE}
-df %>%
+edf %>%
epi_slide(x_2dav = rep(mean(x), 2), before = 1)
```
@@ -103,15 +146,15 @@ we set `as_list_col = TRUE` in the call to `epi_slide()`, the resulting `epi_df`
object returned by `epi_slide()` has a list column containing the slide values.
```{r}
-df2 <- df %>%
+edf2 <- edf %>%
group_by(geo_value) %>%
epi_slide(a = data.frame(x_2dav = mean(x), x_2dma = mad(x)),
before = 1, as_list_col = TRUE) %>%
ungroup()
-class(df2$a)
-length(df2$a)
-df2$a[[2]]
+class(edf2$a)
+length(edf2$a)
+edf2$a[[2]]
```
When we use `as_list_col = FALSE` (the default in `epi_slide()`), the function
@@ -122,7 +165,7 @@ list column (here `a`) onto the column names of the output data frame from the
slide computation (here `x_2dav` and `x_2dma`) separated by "_".
```{r}
-df %>%
+edf %>%
group_by(geo_value) %>%
epi_slide(a = data.frame(x_2dav = mean(x), x_2dma = mad(x)),
before = 1, as_list_col = FALSE) %>%
@@ -133,7 +176,7 @@ We can use `names_sep = NULL` (which gets passed to `tidyr::unnest()`) to drop
the prefix associated with list column name, in naming the unnested columns.
```{r}
-df %>%
+edf %>%
group_by(geo_value) %>%
epi_slide(a = data.frame(x_2dav = mean(x), x_2dma = mad(x)),
before = 1, as_list_col = FALSE, names_sep = NULL) %>%
@@ -144,19 +187,19 @@ Furthermore, `epi_slide()` will recycle the single row data frame as needed in
order to make the result size stable, just like the case for atomic values.
```{r}
-df %>%
+edf %>%
epi_slide(a = data.frame(x_2dav = mean(x), x_2dma = mad(x)),
before = 1, as_list_col = FALSE, names_sep = NULL)
```
```{r, include = FALSE}
# More checks (not included)
-df %>%
+edf %>%
epi_slide(a = data.frame(x_2dav = mean(x), x_2dma = mad(x)),
ref_time_values = as.Date("2020-06-02"),
before = 1, as_list_col = FALSE, names_sep = NULL)
-df %>%
+edf %>%
mutate(version = time_value) %>%
as_epi_archive() %>%
group_by(geo_value) %>%
@@ -174,32 +217,51 @@ Meaning, `epi_slide()` will check that the result is size stable, and if so,
will fill the new column(s) in the resulting `epi_df` object appropriately.
This can be convenient for modeling in the following sense: we can, for example,
-fit a sliding forecasting model by pooling data from different locations, and
-then return separate forecasts from this common model for each location. We use
-our synthetic example to demonstrate this idea abstractly but simply.
+fit a sliding, data-versioning-unaware nowcasting or forecasting model by
+pooling data from different locations, and then return separate forecasts from
+this common model for each location. We use our synthetic example to demonstrate
+this idea abstractly but simply by forecasting (actually, nowcasting) `y` from
+`x` by fitting a time-windowed linear model that pooling data across all
+locations.
```{r}
-df$y <- 2 * df$x + 0.05 * rnorm(length(df$x))
+edf$y <- 2 * edf$x + 0.05 * rnorm(length(edf$x))
-df %>%
+edf %>%
epi_slide(function(d, ...) {
obj <- lm(y ~ x, data = d)
return(
as.data.frame(
predict(obj, newdata = d %>%
- group_by(geo_value) %>%
- filter(time_value == max(time_value)),
+ group_by(geo_value) %>%
+ filter(time_value == max(time_value)),
interval = "prediction", level = 0.9)
))
}, before = 1, new_col_name = "fc", names_sep = NULL)
```
+The above example focused on simplicity to show how to work with multi-row
+outputs. Note however, the following issues in this example:
+
+* The `lm` fitting data includes the testing instances, as no training-test split was performed.
+* Adding a simple training-test split would not factor in reporting latency properly.
+* Data revisions are not taken into account.
+
+All three of these factors contribute to unrealistic retrospective forecasts and
+overly optimistic retrospective performance evaluations. Instead, one should
+favor an `epix_slide` for more realistic "pseudoprospective" forecasts. Using
+`epix_slide` also makes it easier to express certain types of forecasts; while
+in `epi_slide`, forecasts for additional aheads or quantile levels would need to
+be expressed as additional columns, or nested inside list columns, `epix_slide`
+does not perform size stability checks or recycling, allowing computations to
+output any number of rows.
+
## Version-aware forecasting, revisited
-Finally, we revisit the COVID-19 forecasting example from the [archive
+We revisit the COVID-19 forecasting example from the [archive
vignette](https://cmu-delphi.github.io/epiprocess/articles/slide.html) in order
-to demonstrate the last point in a more realistic setting. First, we fetch the
-versioned data and build the archive.
+to demonstrate the preceding points regarding forecast evaluation in a more
+realistic setting. First, we fetch the versioned data and build the archive.
```{r, message = FALSE, warning = FALSE, eval =FALSE}
library(epidatr)