cmu-delphi · brookslogan · Nov 13, 2022 · Jul 26, 2022 · Jul 26, 2022 · Jul 26, 2022
diff --git a/R/slide.R b/R/slide.R
@@ -19,24 +19,14 @@
 #' @param ... Additional arguments to pass to the function or formula specified
 #'   via `f`. Alternatively, if `f` is missing, then the current argument is
 #'   interpreted as an expression for tidy evaluation. See details.  
-#' @param n Number of time steps to use in the running window. For example, if
-#'   `n = 7`, one time step is one day, and the alignment is "right", then to
-#'   produce a value on January 7 we apply the given function or formula to data
-#'   in between January 1 and 7.
+#' @param before A nonnegative integer specifying the number of days before to
+#'   extract data from. Set to 0 for a "left" alignment in slide.
+#' @param after A nonnegative integer specifying the number of days after to
+#'   extract data from. Set to 0 for a "right" alignment in slide.
 #' @param ref_time_values Time values for sliding computations, meaning, each
 #'   element of this vector serves as the reference time point for one sliding
 #'   window. If missing, then this will be set to all unique time values in the
 #'   underlying data table, by default.
-#' @param align One of "right", "center", or "left", indicating the alignment of
-#'   the sliding window relative to the reference time point. If the alignment
-#'   is "center" and `n` is even, then one more time point will be used after
-#'   the reference time point than before. Default is "right".
-#' @param before Positive integer less than `n`, specifying the number of time
-#'   points to use in the sliding window strictly before the reference time
-#'   point. For example, setting `before = n-1` would be the same as setting
-#'   `align = "right"`. The `before` argument allows for more flexible
-#'   specification of alignment than the `align` parameter, and if specified,
-#'   overrides `align`. 
 #' @param time_step Optional function used to define the meaning of one time
 #'   step, which if specified, overrides the default choice based on the
 #'   `time_value` column. This function must take a positive integer and return
@@ -76,11 +66,11 @@
 #' If `f` is missing, then an expression for tidy evaluation can be specified,
 #'   for example, as in: 
 #'   ```
-#'   epi_slide(x, cases_7dav = mean(cases), n = 7)
+#'   epi_slide(x, cases_7dav = mean(cases), before = 7)
 #'   ```
 #'   which would be equivalent to:
 #'   ```
-#'   epi_slide(x, function(x, ...) mean(x$cases), n = 7,
+#'   epi_slide(x, function(x, ...) mean(x$cases), before = 7,
 #'             new_col_name = "cases_7dav")
 #'   ```
 #'   Thus, to be clear, when the computation is specified via an expression for
@@ -95,16 +85,14 @@
 #'  # slide a 7-day trailing average formula on cases
 #'   jhu_csse_daily_subset %>%
 #'   group_by(geo_value) %>%
-#'   epi_slide(cases_7dav = mean(cases), n = 7, 
-#'             align = "right") %>% 
+#'   epi_slide(cases_7dav = mean(cases), before = 6) %>% 
 #'   # rmv a nonessential var. to ensure new col is printed
 #'   dplyr::select(-death_rate_7d_av) 
 #'  
 #'  # slide a left-aligned 7-day average
 #'   jhu_csse_daily_subset %>%
 #'   group_by(geo_value) %>%
-#'   epi_slide(cases_7dav = mean(cases), n = 7, 
-#'             align = "left") %>% 
+#'   epi_slide(cases_7dav = mean(cases), before = 6) %>% 
 #'   # rmv a nonessential var. to ensure new col is printed
 #'   dplyr::select(-death_rate_7d_av) 
 #'  
@@ -113,9 +101,9 @@
 #'  group_by(geo_value) %>%
 #'  epi_slide(a = data.frame(cases_2dav = mean(cases), 
 #'                           cases_2dma = mad(cases)),
-#'            n = 2, as_list_col = TRUE)
-epi_slide = function(x, f, ..., n, ref_time_values,
-                     align = c("right", "center", "left"), before, time_step, 
+#'            before = 1, as_list_col = TRUE)
+epi_slide = function(x, f, ..., before = 0, after = 0, ref_time_values,
+                     time_step, 
                      new_col_name = "slide_value", as_list_col = FALSE,
                      names_sep = "_", all_rows = FALSE) { 
   # Check we have an `epi_df` object
@@ -133,33 +121,18 @@ epi_slide = function(x, f, ..., n, ref_time_values,
     ref_time_values = ref_time_values[ref_time_values %in%
                                       unique(x$time_value)] 
   }
-
-  # If before is missing, then use align to set up alignment
-  if (missing(before)) {
-    align = match.arg(align)
-    if (align == "right") {
-      before_num = n-1
-      after_num = 0
-    }
-    else if (align == "center") {
-      before_num = floor((n-1)/2)
-      after_num = ceiling((n-1)/2)
-    }
-    else {
-      before_num = 0
-      after_num = n-1
-    }
-  }
 
   # Otherwise set up alignment based on passed before value
-  else {
-    if (before < 0 || before > n-1) {
-      Abort("`before` must be in between 0 and n-1`.")
-    }
-
-    before_num = before
-    after_num = n-1-before
+  if (before < 0 || after < 0) {
+    Abort("`before` and `after` must be at least 0.")
   }
+
+  if (floor(before) < ceiling(before) || floor(after) < ceiling(after)) {
+    Abort("`before` and `after` must be integers.")
+  }
+
+  before_num = before
+  after_num = after
 
   # If a custom time step is specified, then redefine units 
   if (!missing(time_step)) {

diff --git a/man/epi_slide.Rd b/man/epi_slide.Rd
diff --git a/tests/testthat/test-epi_slide.R b/tests/testthat/test-epi_slide.R
@@ -10,25 +10,25 @@ f = function(x, ...) dplyr::tibble(value=mean(x$value), count=length(x$value))
 
 ## --- These cases generate the error: ---
 test_that("`ref_time_values` + `align` that result in no slide data, generate the error", {
-  expect_error(edf %>% group_by(geo_value) %>% epi_slide(f, n=3L, ref_time_values=as.Date("2020-01-01")), 
+  expect_error(edf %>% group_by(geo_value) %>% epi_slide(f, before=3L, ref_time_values=as.Date("2020-01-01")), 
                "starting and/or stopping times for sliding are out of bounds") # before the first, no data in the slide windows
-  expect_error(edf %>% group_by(geo_value) %>% epi_slide(f, n=3L, ref_time_values=as.Date("2020-01-01")+207L), 
+  expect_error(edf %>% group_by(geo_value) %>% epi_slide(f, before=3L, ref_time_values=as.Date("2020-01-01")+207L), 
                "starting and/or stopping times for sliding are out of bounds") # beyond the last, no data in window
 })
 
 test_that("`ref_time_values` + `align` that have some slide data, but generate the error due to ref. time being out of time range", {
-  expect_error(edf %>% group_by(geo_value) %>% epi_slide(f, n=3L, ref_time_values=as.Date("2020-01-01"), align="left"), 
+  expect_error(edf %>% group_by(geo_value) %>% epi_slide(f, after=3L, ref_time_values=as.Date("2020-01-01")), 
                "starting and/or stopping times for sliding are out of bounds") # before the first, but we'd expect there to be data in the window
-  expect_error(edf %>% group_by(geo_value) %>% epi_slide(f, n=3L, ref_time_values=as.Date("2020-01-01")+201L), 
+  expect_error(edf %>% group_by(geo_value) %>% epi_slide(f, before=3L, ref_time_values=as.Date("2020-01-01")+201L), 
                "starting and/or stopping times for sliding are out of bounds") # beyond the last, but still with data in window
 })
 
 ## --- These cases doesn't generate the error: ---
 test_that("these doesn't produce an error; the error appears only if the ref time values are out of the range for every group", {
-  expect_identical(edf %>% group_by(geo_value) %>% epi_slide(f, n=3L, ref_time_values=as.Date("2020-01-01")+200L) %>% 
+  expect_identical(edf %>% group_by(geo_value) %>% epi_slide(f, before=2L, ref_time_values=as.Date("2020-01-01")+200L) %>% 
                      dplyr::select("geo_value","slide_value_value"), 
                    dplyr::tibble(geo_value = "ak", slide_value_value = 199)) # out of range for one group
-  expect_identical(edf %>% group_by(geo_value) %>% epi_slide(f, n=3L, ref_time_values=as.Date("2020-01-04")) %>% 
+  expect_identical(edf %>% group_by(geo_value) %>% epi_slide(f, before=3L, ref_time_values=as.Date("2020-01-04")) %>% 
                      dplyr::select("geo_value","slide_value_value"), 
                    dplyr::tibble(geo_value = c("ak", "al"), slide_value_value = c(2, -2))) # not out of range for either group
 })
diff --git a/vignettes/advanced.Rmd b/vignettes/advanced.Rmd
@@ -46,17 +46,17 @@ df <- tibble(
 # 2-day trailing average, per geo value
 df %>% 
   group_by(geo_value) %>%
-  epi_slide(x_2dav = mean(x), n = 2)
+  epi_slide(x_2dav = mean(x), before = 1)
 
 # 2-day trailing average, marginally 
 df %>% 
-  epi_slide(x_2dav = mean(x), n = 2)
+  epi_slide(x_2dav = mean(x), before = 1)
 ```
 
 ```{r, include = FALSE}
 # More checks (not included)
 df %>% 
-  epi_slide(x_2dav = mean(x), n = 2, ref_time_values = as.Date("2020-06-02"))
+  epi_slide(x_2dav = mean(x), before = 1, ref_time_values = as.Date("2020-06-02"))
 
 df %>% 
   mutate(version = time_value) %>% 
@@ -76,7 +76,7 @@ same result as the last one.
 
 ```{r}
 df %>% 
-  epi_slide(y_2dav = rep(mean(x), 3), n = 2)
+  epi_slide(y_2dav = rep(mean(x), 3), before = 1)
 ```
 
 However, if the output is an atomic vector (rather than a single value) and it
@@ -85,7 +85,7 @@ are trying to return 2 things for 3 states.
 
 ```{r, error = TRUE}
 df %>% 
-  epi_slide(x_2dav = rep(mean(x), 2), n = 2)
+  epi_slide(x_2dav = rep(mean(x), 2), before = 1)
 ```
 
 ## Multi-column outputs
@@ -101,7 +101,7 @@ object returned by `epi_slide()` has a list column containing the slide values.
 df2 <- df %>% 
   group_by(geo_value) %>%
   epi_slide(a = data.frame(x_2dav = mean(x), x_2dma = mad(x)),
-            n = 2, as_list_col = TRUE)
+            before = 1, as_list_col = TRUE)
 
 class(df2$a)
 length(df2$a)
@@ -119,7 +119,7 @@ slide computation (here `x_2dav` and `x_2dma`) separated by "_".
 df %>% 
   group_by(geo_value) %>%
   epi_slide(a = data.frame(x_2dav = mean(x), x_2dma = mad(x)),
-            n = 2, as_list_col = FALSE)
+            before = 1, as_list_col = FALSE)
 ```
 
 We can use `names_sep = NULL` (which gets passed to `tidyr::unnest()`) to drop
@@ -129,7 +129,7 @@ the prefix associated with list column name, in naming the unnested columns.
 df %>% 
   group_by(geo_value) %>%
   epi_slide(a = data.frame(x_2dav = mean(x), x_2dma = mad(x)),
-            n = 2, as_list_col = FALSE, names_sep = NULL)
+            before = 1, as_list_col = FALSE, names_sep = NULL)
 ```
 
 Furthermore, `epi_slide()` will recycle the single row data frame as needed in
@@ -138,15 +138,15 @@ order to make the result size stable, just like the case for atomic values.
 ```{r}
 df %>% 
   epi_slide(a = data.frame(x_2dav = mean(x), x_2dma = mad(x)),
-            n = 2, as_list_col = FALSE, names_sep = NULL)
+            before = 1, as_list_col = FALSE, names_sep = NULL)
 ```
 
 ```{r, include = FALSE}
 # More checks (not included)
 df %>% 
   epi_slide(a = data.frame(x_2dav = mean(x), x_2dma = mad(x)),
             ref_time_values = as.Date("2020-06-02"),
-            n = 2, as_list_col = FALSE, names_sep = NULL)
+            before = 1, as_list_col = FALSE, names_sep = NULL)
 
 df %>% 
   mutate(version = time_value) %>% 
@@ -181,7 +181,7 @@ df %>%
                   filter(time_value == max(time_value)), 
                 interval = "prediction", level = 0.9)
       ))
-  }, n = 2, new_col_name = "fc", names_sep = NULL)
+  }, before = 1, new_col_name = "fc", names_sep = NULL)
 ```
 
 ## Version-aware forecasting, revisited
@@ -222,15 +222,15 @@ x <- y1 %>%
     version = issue,
     percent_cli = value
   ) %>%
-  as_epi_archive()
+  as_epi_archive(compactify=FALSE)
 
 # mutating merge operation:
 x$merge(y2 %>%
   select(geo_value, time_value,
     version = issue,
     case_rate_7d_av = value
   ) %>%
-  as_epi_archive()
+  as_epi_archive(compactify=FALSE)
 )
 ```
 

diff --git a/vignettes/aggregation.Rmd b/vignettes/aggregation.Rmd
@@ -189,15 +189,15 @@ running `epi_slide()` on the zero-filled data brings these trailing averages
 xt %>% 
   as_epi_df() %>%
   group_by(geo_value) %>%
-  epi_slide(cases_7dav = mean(cases), n = 7) %>%
+  epi_slide(cases_7dav = mean(cases), before = 6) %>%
   filter(geo_value == "Plymouth, MA",
          abs(time_value - as.Date("2021-07-01")) <= 3) %>%
   print(n = 7)
 
 xt_filled %>% 
   as_epi_df() %>%
   group_by(geo_value) %>%
-  epi_slide(cases_7dav = mean(cases), n = 7) %>%
+  epi_slide(cases_7dav = mean(cases), before = 6) %>%
   filter(geo_value == "Plymouth, MA",
          abs(time_value - as.Date("2021-07-01")) <= 3) %>%
   print(n = 7)

diff --git a/vignettes/archive.Rmd b/vignettes/archive.Rmd
@@ -394,7 +394,7 @@ k_week_ahead <- function(x, ahead = 7, as_of = TRUE) {
   else {
     x_latest %>% 
       group_by(geo_value) %>%
-      epi_slide(fc = prob_arx(percent_cli, case_rate_7d_av, ahead = ahead), n = 120,
+      epi_slide(fc = prob_arx(percent_cli, case_rate_7d_av, ahead = ahead), before = 119,
                 ref_time_values = fc_time_values) %>%
       mutate(target_date = time_value + ahead, as_of = FALSE) 
   }