cmu-delphi · nmdefries · Jan 19, 2024 · Jan 17, 2024 · Jan 17, 2024 · Jan 18, 2024
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Type: Package
 Package: epiprocess
 Title: Tools for basic signal processing in epidemiology
-Version: 0.7.1.9999
+Version: 0.7.2.9999
 Authors@R: c(
     person("Jacob", "Bien", role = "ctb"),
     person("Logan", "Brooks", role = "aut"),

diff --git a/NEWS.md b/NEWS.md
@@ -1,3 +1,11 @@
+# epiprocess 0.7.2.9999
+
+## Improvements
+
+* `epi_slide` computations are now 2-4 times faster after changing how
+  reference time values, made accessible within sliding functions, are
+  calculated (#397).
+
 # epiprocess 0.7.1.9999
 
 Note that `epiprocess` uses the [Semantic Versioning

diff --git a/R/slide.R b/R/slide.R
@@ -230,84 +230,56 @@ epi_slide <- function(x, f, ..., before, after, ref_time_values,
     after <- time_step(after)
   }
 
-  min_ref_time_values <- ref_time_values - before
-  min_ref_time_values_not_in_x <- min_ref_time_values[!(min_ref_time_values %in% unique(x$time_value))]
-
-  # Do set up to let us recover `ref_time_value`s later.
-  # A helper column marking real observations.
-  x$.real <- TRUE
-
-  # Create df containing phony data. Df has the same columns and attributes as
-  # `x`, but filled with `NA`s aside from grouping columns. Number of rows is
-  # equal to the number of `min_ref_time_values_not_in_x` we have * the
-  # number of unique levels seen in the grouping columns.
-  before_time_values_df <- data.frame(time_value = min_ref_time_values_not_in_x)
-  if (length(group_vars(x)) != 0) {
-    before_time_values_df <- dplyr::cross_join(
-      # Get unique combinations of grouping columns seen in real data.
-      unique(x[, group_vars(x)]),
-      before_time_values_df
-    )
-  }
-  # Automatically fill in all other columns from `x` with `NA`s, and carry
-  # attributes over to new df.
-  before_time_values_df <- bind_rows(x[0, ], before_time_values_df)
-  before_time_values_df$.real <- FALSE
-
-  x <- bind_rows(before_time_values_df, x)
-
   # Arrange by increasing time_value
   x <- arrange(x, time_value)
 
   # Now set up starts and stops for sliding/hopping
-  time_range <- range(unique(x$time_value))
-  starts <- in_range(ref_time_values - before, time_range)
-  stops <- in_range(ref_time_values + after, time_range)
-
-  if (length(starts) == 0 || length(stops) == 0) {
-    Abort("The starting and/or stopping times for sliding are out of bounds with respect to the range of times in your data. Check your settings for ref_time_values and align (and before, if specified).")
-  }
+  starts <- ref_time_values - before
+  stops <- ref_time_values + after
 
   # Symbolize new column name
   new_col <- sym(new_col_name)
 
   # Computation for one group, all time values
   slide_one_grp <- function(.data_group,
-                            f, ...,
+                            .group_key, # see `?group_modify`
+                            ..., # `...` to `epi_slide` forwarded here
+                            f_factory,
                             starts,
                             stops,
-                            time_values,
+                            ref_time_values,
                             all_rows,
                             new_col) {
     # Figure out which reference time values appear in the data group in the
     # first place (we need to do this because it could differ based on the
     # group, hence the setup/checks for the reference time values based on all
-    # the data could still be off)
-    o <- time_values %in% .data_group$time_value
+    # the data could still be off):
+    o <- ref_time_values %in% .data_group$time_value
     starts <- starts[o]
     stops <- stops[o]
-    time_values <- time_values[o]
+    kept_ref_time_values <- ref_time_values[o]
+
+    f <- f_factory(kept_ref_time_values)
 
     # Compute the slide values
     slide_values_list <- slider::hop_index(
       .x = .data_group,
       .i = .data_group$time_value,
-      .f = f, ...,
       .starts = starts,
-      .stops = stops
+      .stops = stops,
+      .f = f,
+      .group_key, ...
     )
 
     # Now figure out which rows in the data group are in the reference time
     # values; this will be useful for all sorts of checks that follow
-    o <- .data_group$time_value %in% time_values
+    o <- .data_group$time_value %in% kept_ref_time_values
     num_ref_rows <- sum(o)
 
-    # Count the number of appearances of each reference time value (these
-    # appearances should all be real for now, but if we allow ref time values
-    # outside of .data_group's time values):
-    counts <- dplyr::filter(.data_group, .data$time_value %in% time_values) %>%
+    # Count the number of appearances of each kept reference time value.
+    counts <- dplyr::filter(.data_group, .data$time_value %in% kept_ref_time_values) %>%
       dplyr::count(.data$time_value) %>%
-      dplyr::pull(n)
+      `[[`("n")
 
     if (!all(purrr::map_lgl(slide_values_list, is.atomic)) &&
       !all(purrr::map_lgl(slide_values_list, is.data.frame))) {
@@ -349,7 +321,6 @@ epi_slide <- function(x, f, ..., before, after, ref_time_values,
       # fills with NA equivalent.
       vctrs::vec_slice(slide_values, o) <- orig_values
     } else {
-      # This implicitly removes phony (`.real` == FALSE) observations.
       .data_group <- filter(.data_group, o)
     }
     return(mutate(.data_group, !!new_col := slide_values))
@@ -372,18 +343,24 @@ epi_slide <- function(x, f, ..., before, after, ref_time_values,
 
   f <- as_slide_computation(f, ...)
   # Create a wrapper that calculates and passes `.ref_time_value` to the
-  # computation.
-  f_wrapper <- function(.x, .group_key, ...) {
-    .ref_time_value <- min(.x$time_value) + before
-    .x <- .x[.x$.real, ]
-    .x$.real <- NULL
-    f(.x, .group_key, .ref_time_value, ...)
+  # computation. `i` is contained in the `f_wrapper_factory` environment such
+  # that when called within `slide_one_grp` `i` is reset for every group.
+  f_wrapper_factory <- function(kept_ref_time_values) {
+    # Use `i` to advance through list of start dates.
+    i <- 1L
+    f_wrapper <- function(.x, .group_key, ...) {
+      .ref_time_value <- kept_ref_time_values[[i]]
+      i <<- i + 1L
+      f(.x, .group_key, .ref_time_value, ...)
+    }
+    return(f_wrapper)
   }
   x <- group_modify(x, slide_one_grp,
-    f = f_wrapper, ...,
+    ...,
+    f_factory = f_wrapper_factory,
     starts = starts,
     stops = stops,
-    time_values = ref_time_values,
+    ref_time_values = ref_time_values,
     all_rows = all_rows,
     new_col = new_col,
     .keep = FALSE
@@ -394,14 +371,5 @@ epi_slide <- function(x, f, ..., before, after, ref_time_values,
     x <- unnest(x, !!new_col, names_sep = names_sep)
   }
 
-  # Remove any remaining phony observations. When `all_rows` is TRUE, phony
-  # observations aren't necessarily removed in `slide_one_grp`.
-  if (all_rows) {
-    x <- x[x$.real, ]
-  }
-
-  # Drop helper column `.real`.
-  x$.real <- NULL
-
   return(x)
 }
diff --git a/R/utils.R b/R/utils.R
@@ -361,10 +361,6 @@ as_slide_computation <- function(f, ...) {
 
 ##########
 
-in_range <- function(x, rng) pmin(pmax(x, rng[1]), rng[2])
-
-##########
-
 Min <- function(x) min(x, na.rm = TRUE)
 Max <- function(x) max(x, na.rm = TRUE)
 Sum <- function(x) sum(x, na.rm = TRUE)

diff --git a/man/as_epi_archive.Rd b/man/as_epi_archive.Rd
diff --git a/man/as_epi_df.Rd b/man/as_epi_df.Rd