cmu-delphi
diff --git a/‎NAMESPACE
Lines changed: 4 additions & 0 deletions b/‎NAMESPACE
Lines changed: 4 additions & 0 deletions
diff --git a/‎R/data_transforms.R
Lines changed: 58 additions & 2 deletions b/‎R/data_transforms.R
Lines changed: 58 additions & 2 deletions
diff --git a/‎R/epipredict_utilities.R
Lines changed: 4 additions & 4 deletions b/‎R/epipredict_utilities.R
Lines changed: 4 additions & 4 deletions
diff --git a/‎R/forecaster_smoothed_scaled.R
Lines changed: 2 additions & 2 deletions b/‎R/forecaster_smoothed_scaled.R
Lines changed: 2 additions & 2 deletions
diff --git a/‎man/arx_preprocess.Rd
Lines changed: 3 additions & 3 deletions b/‎man/arx_preprocess.Rd
Lines changed: 3 additions & 3 deletions
diff --git a/‎man/cache_metadata.Rd
Lines changed: 11 additions & 0 deletions b/‎man/cache_metadata.Rd
Lines changed: 11 additions & 0 deletions
diff --git a/‎man/get_nonkey_names.Rd
Lines changed: 14 additions & 0 deletions b/‎man/get_nonkey_names.Rd
Lines changed: 14 additions & 0 deletions
diff --git a/‎man/update_predictors.Rd
Lines changed: 16 additions & 0 deletions b/‎man/update_predictors.Rd
Lines changed: 16 additions & 0 deletions
diff --git a/‎tests/testthat/test-forecasters-basics.R
Lines changed: 1 addition & 0 deletions b/‎tests/testthat/test-forecasters-basics.R
Lines changed: 1 addition & 0 deletions
diff --git a/‎tests/testthat/test-transforms.R
Lines changed: 18 additions & 1 deletion b/‎tests/testthat/test-transforms.R
Lines changed: 18 additions & 1 deletion
@@ -40,6 +40,7 @@ export(single_id)
 export(slide_forecaster)
 export(smoothed_scaled)
 export(underprediction)
+export(update_predictors)
 export(weighted_interval_score)
 importFrom(assertthat,assert_that)
 importFrom(cli,cli_abort)
@@ -88,9 +89,12 @@ importFrom(epiprocess,epix_slide)
 importFrom(magrittr,"%<>%")
 importFrom(magrittr,"%>%")
 importFrom(purrr,imap)
+importFrom(purrr,list_modify)
 importFrom(purrr,map)
 importFrom(purrr,map2_vec)
+importFrom(purrr,map_chr)
 importFrom(purrr,map_vec)
+importFrom(purrr,reduce)
 importFrom(purrr,transpose)
 importFrom(recipes,all_numeric)
 importFrom(rlang,"!!")
 
@@ -6,14 +6,48 @@
 #' @param cols vector of column names to use. If `NULL`, fill with all non-key columns
 get_trainable_names <- function(epi_data, cols) {
   if (is.null(cols)) {
-    cols <- names(epi_data)
-    cols <- cols[!(cols %in% c("geo_value", "time_value", attr(epi_data, "metadata")$other_keys))]
+    cols <- get_nonkey_names(epi_data)
     # exclude anything with the same naming schema as the rolling average/sd created below
     cols <- cols[!grepl("_\\w{1,2}\\d+", cols)]
   }
   return(cols)
 }
 
+#' just the names which aren't keys for an epi_df
+#' @description
+#' names, but it excludes keys: `geo_value`, `time_value` and anything listed
+#' in the `other_keys` entry of the `metadata` attribute
+#' @param epi_data the epi_df
+#' @return a character vector of the non-key column names, in their original
+#'   order
+get_nonkey_names <- function(epi_data) {
+  key_names <- c("geo_value", "time_value", attr(epi_data, "metadata")$other_keys)
+  cols <- names(epi_data)
+  # end on the value itself (not an assignment) so the result is returned visibly
+  cols[!(cols %in% key_names)]
+}
+
+
+#' update the predictors to only contain the smoothed/sd versions of cols
+#' @description
+#' Should only be applied after both rolling_mean and rolling_sd. Predictors
+#' that were not modified are kept as they are; in addition, every non-key
+#' column of `epi_data` that is derived from a predictor is included, i.e.
+#' columns named `<predictor>_<suffix>` where the suffix follows the rolling
+#' mean/sd naming schema (such as `_m7` or `_SD28`).
+#' @param epi_data the epi_df
+#' @param cols_modified the names of the columns that were smoothed; if
+#'   `NULL`, every predictor is treated as modified
+#' @param predictors the names of the predictors before smoothing
+#' @return a character vector: the unmodified predictors, followed by the
+#'   derived columns in the order they appear in `epi_data`
+#' @importFrom purrr map map_chr reduce
+#' @export
+update_predictors <- function(epi_data, cols_modified, predictors) {
+  if (is.null(cols_modified)) {
+    other_predictors <- c()
+  } else {
+    # if cols_modified isn't null, make sure we include predictors that weren't modified
+    # exact name comparison: column names are not regular expressions, and a
+    # substring match would also drop e.g. "ab" when only "a" was modified
+    other_predictors <- predictors[!(predictors %in% cols_modified)]
+  }
+  # all the non-key names
+  col_names <- get_nonkey_names(epi_data)
+  is_derived_from <- function(predictor) {
+    # what follows the predictor's name must be exactly one schema suffix
+    suffix <- substring(col_names, nchar(predictor) + 1L)
+    startsWith(col_names, predictor) &
+      grepl("^_\\w{1,2}\\d+$", suffix) &
+      !(col_names %in% predictors)
+  }
+  # .init keeps this well-defined (nothing selected) when predictors is empty
+  is_modified <- map(predictors, is_derived_from) %>%
+    reduce(`|`, .init = rep(FALSE, length(col_names)))
+  new_predictors <- col_names[is_modified]
+  return(c(other_predictors, new_predictors))
+}
+
 #' get a rolling average for the named columns
 #' @description
 #' add column(s) that are the rolling means of the specified columns, as
@@ -36,6 +70,25 @@ rolling_mean <- function(epi_data, width = 7L, cols_to_mean = NULL) {
   return(epi_data)
 }
 
+#' snapshot the metadata in an easy to reapply way
+#' @description
+#' Splits the `metadata` attribute into its `as_of`, `geo_type` and
+#' `time_type` entries plus a list holding every remaining entry.
+#' @param epi_data the object whose `metadata` attribute is read
+#' @return a list with elements `as_of`, `geo_type`, `time_type` and
+#'   `all_others`
+#' @importFrom purrr list_modify
+cache_metadata <- function(epi_data) {
+  meta <- attributes(epi_data)$metadata
+  # whatever is left once the three named fields are removed
+  remainder <- meta
+  for (field in c("geo_type", "time_type", "as_of")) {
+    remainder[field] <- NULL
+  }
+  if (length(remainder) == 0) {
+    remainder <- list()
+  }
+  return(list(
+    as_of = meta$as_of,
+    geo_type = meta$geo_type,
+    time_type = meta$time_type,
+    all_others = remainder
+  ))
+}
+
 #' get a rolling standard deviation for the named columns
 #' @description
 #' A rolling standard deviation, based off of a rolling mean. First it
@@ -56,15 +109,18 @@ rolling_sd <- function(epi_data, sd_width = 28L, mean_width = NULL, cols_to_sd =
     mean_width <- as.integer(ceiling(sd_width / 2))
   }
   cols_to_sd <- get_trainable_names(epi_data, cols_to_sd)
+  metadata <- cache_metadata(epi_data)
   epi_data %<>% group_by(geo_value)
   for (col in cols_to_sd) {
     mean_name <- paste0(col, "_m", mean_width)
     sd_name <- paste0(col, "_SD", sd_width)
     epi_data %<>% mutate({{ mean_name }} := slider::slide_dbl(.data[[col]], mean, .before = mean_width))
     epi_data %<>% mutate({{ sd_name }} := slider::slide2_dbl(.data[[col]], .data[[mean_name]], ~ sqrt(mean((.x - .y)^2)), .before = sd_width))
     if (!keep_mean) {
+      # TODO make sure the extra info sticks around
       epi_data %<>% select(-{{ mean_name }})
     }
+    epi_data %<>%  as_epi_df(metadata$geo_type, metadata$time_type, metadata$as_of, metadata$all_others)
   }
   epi_data %<>% ungroup()
   return(epi_data)
 
@@ -9,18 +9,18 @@
 #' @seealso [arx_postprocess] for the layer equivalent
 #' @importFrom epipredict step_epi_lag step_epi_ahead step_epi_naomit step_training_window
 #' @export
-arx_preprocess <- function(rec, outcome, predictors, args_list) {
+arx_preprocess <- function(preproc, outcome, predictors, args_list) {
   # input already validated
   lags <- args_list$lags
   for (l in seq_along(lags)) {
     p <- predictors[l]
-    rec %<>% step_epi_lag(!!p, lag = lags[[l]])
+    preproc %<>% step_epi_lag(!!p, lag = lags[[l]])
   }
-  rec %<>%
+  preproc %<>%
     step_epi_ahead(!!outcome, ahead = args_list$ahead) %>%
     step_epi_naomit() %>%
     step_training_window(n_recent = args_list$n_training)
-  return(rec)
+  return(preproc)
 }
 
 # TODO replace with `layer_arx_forecaster`
 
@@ -100,8 +100,8 @@ smoothed_scaled <- function(epi_data,
       keep_mean = keep_mean
     )
   }
-  # even
-
+  # and need to make sure we exclude the original variables as predictors
+  predictors <- update_predictors(epi_data, c(smooth_cols, sd_cols), predictors)
   # preprocessing supported by epipredict
   preproc <- epi_recipe(epi_data)
   if (pop_scaling) {
 
@@ -5,6 +5,7 @@ forecasters <- list(
   c("flatline_fc", flatline_fc),
   c("smoothed_scaled", smoothed_scaled)
 )
+forecaster <- forecasters[[3]]
 for (forecaster in forecasters) {
   test_that(paste(forecaster[[1]], "gets the date and columns right"), {
     jhu <- epipredict::case_death_rate_subset %>%
 
@@ -21,6 +21,7 @@ test_that("rolling_mean generates correct mean", {
   # same, but "ca" is reversed, noting mean(40:(40-7)) =36.5
   linear_reverse_roll_mean <- c(seq(from = 40, to = 36.5, by = -0.5), seq(from = 35.5, to = 4.5, by = -1))
   expect_equal(rolled %>% filter(geo_value == "ca") %>% pull("a_m7"), linear_reverse_roll_mean)
+  expect_true("epi_df" %in% class(rolled))
 })
 
 test_that("rolling_sd generates correct standard deviation", {
@@ -33,10 +34,26 @@ test_that("rolling_sd generates correct standard deviation", {
   expect_equal(rolled %>% filter(geo_value == "al") %>% pull("a_SD28"), linear_roll_sd)
   # even though ca is reversed, the changes are all the same, so the standard deviation is *exactly* the same values
   expect_equal(rolled %>% filter(geo_value == "ca") %>% pull("a_SD28"), linear_roll_sd)
+  # doesn't break types
+  expect_true("epi_df" %in% class(rolled))
 })
-testthat("get_trainable_names pulls out mean and sd columns", {
+
+test_that("get_trainable_names pulls out mean and sd columns", {
   rolled <- rolling_sd(epi_data, keep_mean = TRUE)
   expect_equal(names(rolled), c("geo_value", "time_value", "a", "b", "a_m14", "a_SD28", "b_m14", "b_SD28"))
   expect_equal(get_trainable_names(rolled, NULL), c("a", "b"))
 })
 # TODO example with NA's, example with missing days, only one column, keep_mean
+
+test_that("update_predictors keeps unmodified predictors", {
+  # extra plain columns (c, d) plus the rolled versions of b
+  for (new_col in c("c", "d", "b_m14", "b_SD28")) {
+    epi_data[new_col] <- NaN
+  }
+  # everything but d is a predictor
+  predictors <- c("a", "b", "c")
+  # we want to exclude b but not its modified versions
+  modified <- c("b", "c")
+  expect_equal(
+    update_predictors(epi_data, modified, predictors),
+    c("a", "b_m14", "b_SD28")
+  )
+  # with NULL every predictor counts as modified, so only rolled columns remain
+  expect_equal(
+    update_predictors(epi_data, NULL, predictors),
+    c("b_m14", "b_SD28")
+  )
+})
Original file line number	Diff line number	Diff line change
`@@ -100,8 +100,8 @@ smoothed_scaled <- function(epi_data,`
`100`	`100`	`keep_mean = keep_mean`
`101`	`101`	`)`
`102`	`102`	`}`
`103`		`- # even`
`104`		`-`
	`103`	`+ # and need to make sure we exclude the original variables as predictors`
	`104`	`+ predictors <- update_predictors(epi_data, c(smooth_cols, sd_cols), predictors)`
`105`	`105`	`# preprocessing supported by epipredict`
`106`	`106`	`preproc <- epi_recipe(epi_data)`
`107`	`107`	`if (pop_scaling) {`
Original file line number	Diff line number	Diff line change
`@@ -5,6 +5,7 @@ forecasters <- list(`
`5`	`5`	`c("flatline_fc", flatline_fc),`
`6`	`6`	`c("smoothed_scaled", smoothed_scaled)`
`7`	`7`	`)`
	`8`	`+forecaster <- forecasters[[3]]`
`8`	`9`	`for (forecaster in forecasters) {`
`9`	`10`	`test_that(paste(forecaster[[1]], "gets the date and columns right"), {`
`10`	`11`	`jhu <- epipredict::case_death_rate_subset %>%`