
feat: add covid hospitalizations production pipeline #113


Merged: 6 commits, Apr 24, 2024
1 change: 1 addition & 0 deletions .gitignore
@@ -9,3 +9,4 @@ scripts/**.html
nohup.out
run.Rout
tmp.R
reports/
1 change: 1 addition & 0 deletions DESCRIPTION
@@ -21,6 +21,7 @@ Imports:
epipredict,
epiprocess,
here,
jsonlite,
lubridate,
magrittr,
parsnip (>= 1.0.0),
8 changes: 2 additions & 6 deletions Makefile
@@ -14,16 +14,12 @@ run-nohup:
sync:
Rscript scripts/sync.R

download:
pull:
Rscript scripts/sync.R download

pull: download

upload:
push:
Rscript scripts/sync.R upload

push: upload

dashboard:
Rscript scripts/dashboard.R

1 change: 1 addition & 0 deletions NAMESPACE
@@ -16,6 +16,7 @@ export(extend_ahead)
export(flatline_fc)
export(forecaster_lookup)
export(format_storage)
export(get_exclusions)
export(id_ahead_ensemble_grid)
export(interval_coverage)
export(make_data_targets)
2 changes: 1 addition & 1 deletion R/epipredict_utilities.R
@@ -65,7 +65,7 @@ arx_postprocess <- function(postproc,
return(postproc)
}

#' helper function to run a epipredict model and reformat to hub format
#' run_workflow_and_format
#' @description
#' helper function to run a epipredict model and reformat to hub format
#' @param preproc the preprocessing steps
9 changes: 0 additions & 9 deletions R/latency_adjusting.R
@@ -24,12 +24,3 @@ extend_ahead <- function(epi_data, ahead) {
}
return(list(epi_data, effective_ahead))
}

#' last observation carried forward
#' @description
#' instead of modifying `ahead`, interpolate `epi_data` to contain last
#' observation carried forward
#' @param epi_data the dataset
#' @param ahead how many units (depending on the dataset, normally days or weeks) to predict ahead of the `forecast_date`
locf_latency <- function(epi_data, ahead) {
}
1 change: 1 addition & 0 deletions R/targets_utils.R
@@ -155,6 +155,7 @@ make_shared_grids <- function() {
)
)
}

#' Make list of common ensembles for forecasting experiments across projects
#' @export
make_shared_ensembles <- function() {
133 changes: 77 additions & 56 deletions R/utils.R
@@ -9,33 +9,6 @@ covidhub_probs <- function(type = c("standard", "inc_case")) {
)
}


#' add a unique id based on the column contents
#' @description
#' create a string of `n_adj` that is a hash of the parameters
#' and append the `ahead` at the end.
#' @param df the df to add a column to. everything should be convertable to a string
#' @param n_adj the number of adjectives to use; default of 2.
#' @importFrom cli hash_animal
#' @export
add_id <- function(df, n_adj = 2) {
no_ahead <- df %>%
select(-ahead)
stringified <- no_ahead %>%
select(order(colnames(no_ahead))) %>%
rowwise() %>%
mutate(id = paste(across(everything()), sep = "", collapse = ""), .keep = "none") %>%
mutate(id = hash_animal(id, n_adj = n_adj)$words) %>%
mutate(id = paste(id[1:n_adj], sep = "", collapse = "."))
df %<>%
ungroup() %>%
mutate(parent_id = stringified$id) %>%
rowwise() %>%
mutate(id = paste(parent_id, ahead, sep = ".", collapse = " ")) %>%
ungroup()
return(df)
}

#' look up forecasters by name
#' @description
#' given a (partial) forecaster name, look up all forecasters in the given project which contain part of that name.
@@ -105,6 +78,31 @@ ensemble_missing_forecasters_details <- function(ensemble_grid = NULL, param_gri
return(unique_missing)
}

#' add a unique id based on the column contents
#' @description
#' create a string of `n_adj` that is a hash of the parameters
#' and append the `ahead` at the end.
#' @param df the df to add a column to. everything should be convertable to a string
#' @param n_adj the number of adjectives to use; default of 2.
#' @importFrom cli hash_animal
#' @export
add_id <- function(df, n_adj = 2) {
no_ahead <- df %>%
select(-ahead)
stringified <- no_ahead %>%
select(order(colnames(no_ahead))) %>%
rowwise() %>%
mutate(id = paste(across(everything()), sep = "", collapse = ""), .keep = "none") %>%
mutate(id = hash_animal(id, n_adj = n_adj)$words) %>%
mutate(id = paste(id[1:n_adj], sep = "", collapse = "."))
df %<>%
ungroup() %>%
mutate(parent_id = stringified$id) %>%
rowwise() %>%
mutate(id = paste(parent_id, ahead, sep = ".", collapse = " ")) %>%
ungroup()
return(df)
}
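
For orientation, a minimal sketch of calling `add_id` on a hypothetical parameter grid (the column names and values below are illustrative, not taken from this PR; assumes the package is loaded, e.g. via `devtools::load_all()`):

```r
library(tibble)

# Hypothetical grid: every column except `ahead` feeds the hash.
param_grid <- tribble(
  ~forecaster,   ~lags, ~ahead,
  "scaled_pop",  7,     1,
  "scaled_pop",  7,     2,
  "flatline_fc", 7,     1
)

with_ids <- add_id(param_grid, n_adj = 2)
# Rows 1 and 2 share all non-`ahead` columns, so they get the same hashed
# `parent_id`; each row's `id` is that parent_id with its `ahead` appended,
# i.e. paste(parent_id, ahead, sep = ".").
```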

#' generate an id from a simple list of parameters
#' @param param_list the list of parameters. must include `ahead` if `ahead = NULL`
@@ -153,28 +151,6 @@ id_ahead_ensemble_grid <- function(ensemble_grid, aheads, n_adj = 2) {
return(ensemble_grid)
}


#' temporary patch that pulls `NA`'s out of an epi_df
#' @description
#' just delete rows that have NA's in them. eventually epipredict should directly handle this so we don't have to
#' @param epi_data the epi_df to be fixed
#' @param outcome the column name containing the target variable
#' @param extra_sources any other columns used as predictors
#' @importFrom tidyr drop_na
#' @importFrom epiprocess as_epi_df
#' @export
clear_lastminute_nas <- function(epi_data, outcome, extra_sources) {
meta_data <- attr(epi_data, "metadata")
if (extra_sources == c("")) {
extra_sources <- character(0L)
}
epi_data %<>%
drop_na(c(!!outcome, !!!extra_sources)) %>%
as_epi_df()
attr(epi_data, "metadata") <- meta_data
return(epi_data)
}

#' convert a list of forecasters
#' @description
#' the required format for targets is a little jank; this takes a human legible tibble and makes it targets legible.
@@ -197,6 +173,17 @@ make_target_param_grid <- function(param_grid) {
param_names = list_names
)
}

#' helper function for `make_target_param_grid`
#' @keywords internal
lists_of_real_values <- function(param_grid) {
full_lists <- transpose(param_grid %>% select(-forecaster, -id))
filter_nonvalues <- function(x) {
Filter(function(a) !all(is.null(a)) && !all(is.na(a)), x)
}
map(full_lists, filter_nonvalues)
}

#' convert a list of forecasters
#' @description
#' the required format for targets is a little jank; this takes a human legible tibble and makes it targets legible.
@@ -215,19 +202,53 @@ make_target_ensemble_grid <- function(param_grid, ONE_AHEAD_FORECASTER_NAME = "f
mutate(forecaster_ids = list(syms(paste(ONE_AHEAD_FORECASTER_NAME, forecaster_ids, sep = "_"))))
return(param_grid)
}

#' function to map
#' @keywords internal
#' @param sym_names a list of the parameter names that should be turned into symbols
sym_subset <- function(param_list, sym_names = list("average_type")) {
imap(param_list, \(x, y) if (y %in% sym_names) sym(x) else x)
}

#' helper function for `make_target_param_grid`
#' @keywords internal
lists_of_real_values <- function(param_grid) {
full_lists <- transpose(param_grid %>% select(-forecaster, -id))
filter_nonvalues <- function(x) {
Filter(function(a) !all(is.null(a)) && !all(is.na(a)), x)
#' temporary patch that pulls `NA`'s out of an epi_df
#' @description
#' just delete rows that have NA's in them. eventually epipredict should directly handle this so we don't have to
#' @param epi_data the epi_df to be fixed
#' @param outcome the column name containing the target variable
#' @param extra_sources any other columns used as predictors
#' @importFrom tidyr drop_na
#' @importFrom epiprocess as_epi_df
#' @export
clear_lastminute_nas <- function(epi_data, outcome, extra_sources) {
meta_data <- attr(epi_data, "metadata")
if (extra_sources == c("")) {
extra_sources <- character(0L)
}
map(full_lists, filter_nonvalues)
epi_data %<>%
drop_na(c(!!outcome, !!!extra_sources)) %>%
as_epi_df()
attr(epi_data, "metadata") <- meta_data
return(epi_data)
}
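
A quick usage sketch for `clear_lastminute_nas`; the `epi_data` object and the `"hhs"` outcome column are assumptions for illustration:

```r
# epi_data: an epiprocess::epi_df whose outcome column may contain NAs.
# Rows with NA in the outcome (or in any extra source column) are dropped,
# and the epi_df metadata is restored afterwards.
cleaned <- clear_lastminute_nas(
  epi_data,
  outcome = "hhs",
  extra_sources = "" # "" means no extra predictor columns
)
```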

#' Get exclusions from a JSON file for a given date
#'
#' @param date A date
#' @param exclusions_json A JSON file with exclusions in the format:
#'
#' {"exclusions": {"2024-03-24": "ak,hi"}}
#'
#' @export
get_exclusions <- function(
date,
exclusions_json = here::here("scripts", "geo_exclusions.json")) {
if (!file.exists(exclusions_json)) {
return("")
}

s <- jsonlite::read_json(exclusions_json)$exclusions[[as.character(date)]]
if (!is.null(s)) {
return(strsplit(s, ",")[[1]])
}
return("")
}
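
As a usage sketch, `get_exclusions` could feed a geo filter roughly like this; the JSON shown is the example from the roxygen block above, while the filtering step is an assumption about how the pipeline might consume the result:

```r
# scripts/geo_exclusions.json (example from the docs):
# {"exclusions": {"2024-03-24": "ak,hi"}}

excluded_geos <- get_exclusions(as.Date("2024-03-24"))
# -> c("ak", "hi"); returns "" if the file or the date entry is missing

# e.g. drop those geos before forecasting (assuming an epi_df with geo_value):
# epi_data <- dplyr::filter(epi_data, !geo_value %in% excluded_geos)
```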
49 changes: 19 additions & 30 deletions README.md
@@ -1,6 +1,6 @@
# Exploration Tooling

This repo is meant to be a place to explore different forecasting methods and tools for both COVID and flu.
This repo is for exploring forecasting methods and tools for both COVID and Flu.
The repo is structured as a [targets](https://docs.ropensci.org/targets/) project, which means that it is easy to run things in parallel and to cache results.
The repo is also structured as an R package, which means that it is easy to share code between different targets.

@@ -12,7 +12,6 @@ Define run parameters:
# Save to your `.Renviron` file:
EPIDATR_USE_CACHE=true
# not strictly necessary, but you probably want a long cache time, since this is for the historical data
EPIDATR_CACHE_DIR=~/.epidatr-cache
EPIDATR_CACHE_MAX_AGE_DAYS=42
DEBUG_MODE=false
USE_SHINY=false
@@ -21,22 +20,20 @@ EXTERNAL_SCORES_PATH=legacy-exploration-scorecards.qs
AWS_S3_PREFIX=exploration
```

- `EPIDATR_USE_CACHE` controls whether `epidatr` functions use the cache.
- `DEBUG_MODE` controls whether `targets::tar_make` is run with the `callr_function=NULL`, which allows for debugging. This only works if parallelization has been turned off in `scripts/targets-common.R` by setting the default controller to serial on line 51.
- `USE_SHINY` controls whether we start a Shiny server after producing the targets.
- `TAR_PROJECT` controls which `targets` project is run by `run.R`. Likely either `covid_hosp_explore` or `flu_hosp_explore`
- `EXTERNAL_SCORES_PATH` controls where external scores are loaded from. If not set, external scores are not used.
- `AWS_S3_PREFIX` controls the prefix to use in the AWS S3 bucket (a prefix is a pseudo-directory in a bucket).
- `EPIDATR_USE_CACHE` controls whether `epidatr` functions use the cache.
- `DEBUG_MODE` controls whether `targets::tar_make` is run with the `callr_function=NULL`, which allows for debugging. This only works if parallelization has been turned off in `scripts/targets-common.R` by setting the default controller to serial on line 51.
- `USE_SHINY` controls whether we start a Shiny server after producing the targets.
- `TAR_PROJECT` controls which `targets` project is run by `run.R`. Likely either `covid_hosp_explore` or `flu_hosp_explore`
- `EXTERNAL_SCORES_PATH` controls where external scores are loaded from. If not set, external scores are not used.
- `AWS_S3_PREFIX` controls the prefix to use in the AWS S3 bucket (a prefix is a pseudo-directory in a bucket).

Run the pipeline using:

```sh
# Install renv and R dependencies.
# Install renv and R dependencies
make install

# Pull pre-scored forecasts from the AWS bucket
make download
# or
make pull

# Run only the dashboard, to display results run on other machines
@@ -47,32 +44,23 @@ make run
# or in the background
make run-nohup

# Upload/push complete or partial results to the AWS bucket
make upload
# or
# Push complete or partial results to the AWS bucket
make push
```

- `EPIDATR_USE_CACHE` controls whether `epidatr` functions use the cache.
- `DEBUG_MODE` controls whether `targets::tar_make` is run with the `callr_function=NULL`, which allows for `browser()`. It also disables parallelization. If you are developing, it is recommended to set this to true. If you are just running, it is recommended to set it to false.
- `USE_SHINY` controls whether we start a Shiny server after producing the targets.
- `TAR_PROJECT` controls which `targets` project is run by `run.R`.
- `EXTERNAL_SCORES_PATH` controls where external scores are loaded from. If not set, external scores are not used.
- `AWS_S3_PREFIX` controls the prefix to use in the AWS S3 bucket (a prefix is a pseudo-directory in a bucket).

## Development

### Directory Layout

- `run.R` and `Makefile`: the main entrypoint for all pipelines
- `R/`: R package code to be reused
- `scripts/`: plotting, code, and misc.
- `tests/`: package tests
- `covid_hosp_explore/` and `covid_hosp_explore.R`: a `targets` project for exploring covid hospitalization forecasters
- `flu_hosp_explore/` and `flu_hosp_explore.R`: a `targets` project for exploring flu hospitalization forecasters
- `covid_hosp_prod/` and `covid_hosp_prod.R`: a `targets` project for predicting covid hospitalizations
- `flu_hosp_prod/` and `flu_hosp_prod.R`: a `targets` project for predicting flu hospitalizations
- `forecaster_testing/` and `forecaster_testing.R`: a `targets` project for testing forecasters
- `Makefile`: the main entrypoint for all pipelines
- `R/`: R package code to be reused
- `scripts/`: plotting, code, and misc.
- `tests/`: package tests
- `covid_hosp_explore/` and `scripts/covid_hosp_explore.R`: a `targets` project for exploring covid hospitalization forecasters
- `flu_hosp_explore/` and `scripts/flu_hosp_explore.R`: a `targets` project for exploring flu hospitalization forecasters
- `covid_hosp_prod/` and `scripts/covid_hosp_prod.R`: a `targets` project for predicting covid hospitalizations
- `flu_hosp_prod/` and `scripts/flu_hosp_prod.R`: a `targets` project for predicting flu hospitalizations
- `forecaster_testing/` and `scripts/forecaster_testing.R`: a `targets` project for testing forecasters

### Parallelization Gotchas

@@ -84,6 +72,7 @@ It is safest to develop with parallelism disabled.
Targets in parallel mode has two problems when it comes to debugging: 1) it ignores browsers, so you can't step through functions and 2) reloading any changes requires both `renv::install(".")` and restarting R.

To debug a target named `yourTarget`:

1. set `DEBUG_MODE=true`
2. insert a browser in the relevant function
3. run an R session, and call `tar_make(yourTarget)`
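
A minimal sketch of that debugging loop; the function name below is hypothetical and `yourTarget` is the placeholder from the steps above:

```r
# 1. DEBUG_MODE=true in .Renviron, so targets runs with callr_function = NULL
#    and the serial controller, which lets browser() take effect.

# 2. insert a browser() in the function that the target calls:
forecaster_step <- function(epi_data, ahead) {
  browser() # execution pauses here when the target is built
  # ... original body ...
}

# 3. from an interactive R session at the repo root:
targets::tar_make(yourTarget)
```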
6 changes: 6 additions & 0 deletions covid_hosp_prod/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
*
!.gitignore
!meta
!*.R
meta/*
# !meta/meta
21 changes: 21 additions & 0 deletions man/get_exclusions.Rd

Some generated files are not rendered by default.

17 changes: 0 additions & 17 deletions man/locf_latency.Rd

This file was deleted.

2 changes: 1 addition & 1 deletion man/run_workflow_and_format.Rd

Some generated files are not rendered by default.
