cmu-delphi
diff --git a/‎DESCRIPTION
Lines changed: 1 addition & 0 deletions b/‎DESCRIPTION
Lines changed: 1 addition & 0 deletions
diff --git a/‎NAMESPACE
Lines changed: 5 additions & 0 deletions b/‎NAMESPACE
Lines changed: 5 additions & 0 deletions
diff --git a/‎R/data_transforms.R
Lines changed: 69 additions & 0 deletions b/‎R/data_transforms.R
Lines changed: 69 additions & 0 deletions
diff --git a/‎man/get_trainable_names.Rd
Lines changed: 17 additions & 0 deletions b/‎man/get_trainable_names.Rd
Lines changed: 17 additions & 0 deletions
diff --git a/‎man/rolling_mean.Rd
Lines changed: 21 additions & 0 deletions b/‎man/rolling_mean.Rd
Lines changed: 21 additions & 0 deletions
diff --git a/‎man/rolling_sd.Rd
Lines changed: 33 additions & 0 deletions b/‎man/rolling_sd.Rd
Lines changed: 33 additions & 0 deletions
diff --git a/‎man/smooth_scaled.Rd
Lines changed: 65 additions & 0 deletions b/‎man/smooth_scaled.Rd
Lines changed: 65 additions & 0 deletions
diff --git a/‎tests/testthat/test-transforms.R
Lines changed: 37 additions & 0 deletions b/‎tests/testthat/test-transforms.R
Lines changed: 37 additions & 0 deletions
@@ -27,6 +27,7 @@ Imports:
     purrr,
     recipes (>= 1.0.4),
     rlang,
+    slider,
     targets,
     tibble,
     tidyr
 
@@ -30,12 +30,15 @@ export(make_target_param_grid)
 export(overprediction)
 export(perform_sanity_checks)
 export(read_external_predictions_data)
+export(rolling_mean)
+export(rolling_sd)
 export(run_evaluation_measure)
 export(run_workflow_and_format)
 export(scaled_pop)
 export(sharpness)
 export(single_id)
 export(slide_forecaster)
+export(smooth_scaled)
 export(underprediction)
 export(weighted_interval_score)
 importFrom(assertthat,assert_that)
@@ -96,6 +99,8 @@ importFrom(rlang,.data)
 importFrom(rlang,quo)
 importFrom(rlang,sym)
 importFrom(rlang,syms)
+importFrom(slider,slide2_dbl)
+importFrom(slider,slide_dbl)
 importFrom(targets,tar_config_get)
 importFrom(targets,tar_group)
 importFrom(targets,tar_read)
 
@@ -0,0 +1,69 @@
+# various reusable transforms to apply before handing to epipredict
+
+#' List the columns of `epi_data` that are not keys
+#' @keywords internal
+#' @param epi_data the epi_data tibble
+#' @param cols vector of column names to use; returned untouched unless it is
+#'   `NULL`, in which case every non-key column of `epi_data` is used
+#' @return `cols` itself when it is not `NULL`, otherwise the names of
+#'   `epi_data` minus `geo_value`, `time_value` and any `other_keys` listed in
+#'   its `metadata` attribute
+get_trainable_names <- function(epi_data, cols) {
+  # an explicit selection always wins
+  if (!is.null(cols)) {
+    return(cols)
+  }
+  key_names <- c("geo_value", "time_value", attr(epi_data, "metadata")$other_keys)
+  all_names <- names(epi_data)
+  all_names[!(all_names %in% key_names)]
+}
+
+#' get a rolling average for the named columns
+#' @description
+#' add column(s) that are the rolling means of the specified columns, as
+#'   implemented by slider. Each new column is named `paste0(col, width)`,
+#'   e.g. `a7`.
+#' The sliding is positional, not time-aware: every window is the current row
+#'   plus up to `width` rows before it (`.before = width`), taken in the order
+#'   the rows appear. A full window therefore holds `width + 1` rows, and the
+#'   windows at the start of each `geo_value` are shorter.
+#' Currently only group_by's on the geo_value. Should probably extend to more
+#'   keys if you have them
+#' @param epi_data the dataset
+#' @param width the number of rows before the current one to include in each
+#'   window (rows, not days; the sliding isn't time-aware). Defaults to 7
+#' @param cols_to_mean the non-key columns to take the mean over. `NULL` means all
+#' @return `epi_data`, ungrouped, with one rolling-mean column added per entry
+#'   of `cols_to_mean`
+#' @importFrom slider slide_dbl
+#' @export
+rolling_mean <- function(epi_data, width = 7L, cols_to_mean = NULL) {
+  cols_to_mean <- get_trainable_names(epi_data, cols_to_mean)
+  # Built outside of `mutate()` so that `mean` and `width` are looked up in
+  # this function's environment; written inline they would be evaluated in the
+  # data mask, where a column called `mean` or `width` would shadow them.
+  trailing_mean <- function(values) {
+    slider::slide_dbl(values, mean, .before = width)
+  }
+  epi_data <- group_by(epi_data, geo_value)
+  for (col in cols_to_mean) {
+    mean_name <- paste0(col, width)
+    epi_data <- mutate(epi_data, {{ mean_name }} := trailing_mean(.data[[col]]))
+  }
+  ungroup(epi_data)
+}
+
+#' get a rolling standard deviation for the named columns
+#' @description
+#' A rolling standard deviation, based off of a rolling mean. First it
+#'   calculates a rolling mean governed by `mean_width`, and then takes the
+#'   square root of the squared difference between that and the actual value,
+#'   averaged over a window governed by `sd_width`.
+#' Both windows are positional, not time-aware: each one is the current row
+#'   plus up to `mean_width` (resp. `sd_width`) rows before it, taken in the
+#'   order the rows appear within each `geo_value`, so a full window holds one
+#'   row more than the width.
+#' The result is stored in `<col>_SD<sd_width>` and the intermediate mean in
+#'   `<col>_m<mean_width>`.
+#' @param epi_data the dataset
+#' @param sd_width the number of rows before the current one (rows, not days;
+#'   the sliding isn't time-aware) to use for the standard deviation
+#'   calculation
+#' @param mean_width like `sd_width`, but it governs the mean. Should be less
+#'   than the `sd_width`, and if `NULL` (the default) it is half of `sd_width`,
+#'   rounded up (so 14 in the complete default case)
+#' @param cols_to_sd the non-key columns to take the sd over. `NULL` means all
+#' @param keep_mean bool, if `TRUE`, it keeps the intermediate mean column
+#' @return `epi_data`, ungrouped, with one standard deviation column added per
+#'   entry of `cols_to_sd` (plus the mean columns when `keep_mean` is `TRUE`)
+#' @importFrom slider slide_dbl slide2_dbl
+#' @export
+rolling_sd <- function(epi_data, sd_width = 28L, mean_width = NULL, cols_to_sd = NULL, keep_mean = FALSE) {
+  if (is.null(mean_width)) {
+    mean_width <- as.integer(ceiling(sd_width / 2))
+  }
+  cols_to_sd <- get_trainable_names(epi_data, cols_to_sd)
+  # Both helpers are built outside of `mutate()` so that `mean`, `mean_width`
+  # and `sd_width` are looked up in this function's environment; written
+  # inline they would be evaluated in the data mask, where same-named columns
+  # would shadow them.
+  trailing_mean <- function(values) {
+    slider::slide_dbl(values, mean, .before = mean_width)
+  }
+  # root of the mean squared gap between each value and its rolling mean
+  trailing_deviation <- function(values, centres) {
+    slider::slide2_dbl(values, centres, ~ sqrt(mean((.x - .y)^2)), .before = sd_width)
+  }
+  epi_data <- group_by(epi_data, geo_value)
+  for (col in cols_to_sd) {
+    mean_name <- paste0(col, "_m", mean_width)
+    sd_name <- paste0(col, "_SD", sd_width)
+    epi_data <- mutate(epi_data, {{ mean_name }} := trailing_mean(.data[[col]]))
+    epi_data <- mutate(epi_data, {{ sd_name }} := trailing_deviation(.data[[col]], .data[[mean_name]]))
+    if (!keep_mean) {
+      # the mean was only needed as an input to the deviation
+      epi_data <- select(epi_data, -{{ mean_name }})
+    }
+  }
+  ungroup(epi_data)
+}
@@ -0,0 +1,37 @@
+# shared fixture: two geos observed on the same 40 consecutive days
+n_days <- 40
+simple_dates <- seq(as.Date("2012-01-01"), by = "day", length.out = n_days)
+rand_vals <- rnorm(n_days)
+# `a` counts up for "al" and down for "ca"; `b` is noise, shifted by 10 for "ca"
+epi_data <- epiprocess::as_epi_df(rbind(
+  tibble(geo_value = "al", time_value = simple_dates, a = 1:n_days, b = rand_vals),
+  tibble(geo_value = "ca", time_value = simple_dates, a = n_days:1, b = rand_vals + 10)
+))
+test_that("rolling_mean generates correct mean", {
+  rolled <- rolling_mean(epi_data)
+  expect_equal(names(rolled), c("geo_value", "time_value", "a", "b", "a7", "b7"))
+  # "al" has a = 1:40. Each window is the current row plus up to 7 rows before
+  # it, so the mean climbs by 0.5 while the window fills (mean(1:7) = 4,
+  # mean(1:8) = 4.5) and by 1 per row afterwards, ending at mean(33:40) = 36.5
+  expected_al <- c(seq(from = 1, to = 4, by = 0.5), seq(from = 4.5, to = 36.5, by = 1))
+  observed_al <- rolled %>% filter(geo_value == "al") %>% pull("a7")
+  expect_equal(observed_al, expected_al)
+  # "ca" is the same series reversed, so it falls instead: mean(40:33) = 36.5
+  # is the first full window
+  expected_ca <- c(seq(from = 40, to = 36.5, by = -0.5), seq(from = 35.5, to = 4.5, by = -1))
+  observed_ca <- rolled %>% filter(geo_value == "ca") %>% pull("a7")
+  expect_equal(observed_ca, expected_ca)
+})
+
+test_that("rolling_sd generates correct standard deviation", {
+  rolled <- rolling_sd(epi_data)
+  expect_equal(names(rolled), c("geo_value", "time_value", "a", "b", "a_SD28", "b_SD28"))
+  # hand specified rolling mean for the default mean_width of 14 (current row
+  # plus up to 14 rows before it), noting that mean(1:14) = 7.5 and
+  # mean(1:15) = 8, after which it advances by 1 per row
+  expected_mean <- c(seq(from = 1, to = 7.5, by = 0.5), seq(from = 8, to = 33, by = 1))
+  # and the standard deviation is the root of the mean squared gap to that
+  # mean, over the current row plus up to 28 rows before it
+  expected_sd <- sqrt(slider::slide_dbl((1:40 - expected_mean)^2, mean, .before = 28))
+  observed_al <- rolled %>% filter(geo_value == "al") %>% pull("a_SD28")
+  expect_equal(observed_al, expected_sd)
+  # even though ca is reversed, the gaps only change sign, so the squared
+  # gaps and hence the standard deviation are *exactly* the same values
+  observed_ca <- rolled %>% filter(geo_value == "ca") %>% pull("a_SD28")
+  expect_equal(observed_ca, expected_sd)
+})
+# TODO example with NA's, example with missing days, only one column, keep_mean