
Commit 14d99f1

Merge pull request #113 from cmu-delphi/ds/covid-prod
feat: add covid hospitalizations production pipeline
2 parents: 1fdf98a + 6a5d240

19 files changed: +368 additions, −382 deletions

.gitignore

Lines changed: 1 addition & 0 deletions
@@ -9,3 +9,4 @@ scripts/**.html
 nohup.out
 run.Rout
 tmp.R
+reports/

DESCRIPTION

Lines changed: 1 addition & 0 deletions
@@ -21,6 +21,7 @@ Imports:
     epipredict,
     epiprocess,
     here,
+    jsonlite,
     lubridate,
     magrittr,
     parsnip (>= 1.0.0),

Makefile

Lines changed: 2 additions & 6 deletions
@@ -14,16 +14,12 @@ run-nohup:
 sync:
 	Rscript scripts/sync.R
 
-download:
+pull:
 	Rscript scripts/sync.R download
 
-pull: download
-
-upload:
+push:
 	Rscript scripts/sync.R upload
 
-push: upload
-
 dashboard:
 	Rscript scripts/dashboard.R
 
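The renamed `pull` and `push` targets still shell out to `scripts/sync.R` with a positional argument. Below is a minimal, hypothetical sketch of that dispatch pattern; `sync.R`'s real contents are not part of this diff, and the `message()` calls stand in for the actual S3 transfer code.

```r
# Hypothetical sketch of the dispatch inside scripts/sync.R (not the actual file).
args <- commandArgs(trailingOnly = TRUE)
action <- if (length(args) == 0) "sync" else args[[1]]

switch(action,
  download = message("would pull pre-scored forecasts from the AWS bucket"), # `make pull`
  upload   = message("would push local results to the AWS bucket"),          # `make push`
  sync     = message("would do both"),                                       # `make sync`
  stop("unknown action: ", action)
)
```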

NAMESPACE

Lines changed: 1 addition & 0 deletions
@@ -16,6 +16,7 @@ export(extend_ahead)
 export(flatline_fc)
 export(forecaster_lookup)
 export(format_storage)
+export(get_exclusions)
 export(id_ahead_ensemble_grid)
 export(interval_coverage)
 export(make_data_targets)

R/epipredict_utilities.R

Lines changed: 1 addition & 1 deletion
@@ -65,7 +65,7 @@ arx_postprocess <- function(postproc,
   return(postproc)
 }
 
-#' helper function to run a epipredict model and reformat to hub format
+#' run_workflow_and_format
 #' @description
 #' helper function to run a epipredict model and reformat to hub format
 #' @param preproc the preprocessing steps

R/latency_adjusting.R

Lines changed: 0 additions & 9 deletions
@@ -24,12 +24,3 @@ extend_ahead <- function(epi_data, ahead) {
   }
   return(list(epi_data, effective_ahead))
 }
-
-#' last observation carried forward
-#' @description
-#' instead of modifying `ahead`, interpolate `epi_data` to contain last
-#' observation carried forward
-#' @param epi_data the dataset
-#' @param ahead how many units (depending on the dataset, normally days or weeks) to predict ahead of the `forecast_date`
-locf_latency <- function(epi_data, ahead) {
-}
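The deleted `locf_latency()` was an unimplemented stub; its roxygen block describes interpolating `epi_data` with last-observation-carried-forward instead of extending `ahead`. A hypothetical sketch of that idea, not project code; `value_col`, `geo_value`, and `time_value` follow the usual `epi_df` layout:

```r
# Hypothetical sketch only; the stub was removed without ever being implemented.
library(dplyr)
library(tidyr)

locf_latency_sketch <- function(epi_data, value_col) {
  epi_data %>%
    group_by(geo_value) %>%
    arrange(time_value, .by_group = TRUE) %>%
    # carry the last reported value forward over the latency gap
    fill({{ value_col }}, .direction = "down") %>%
    ungroup()
}
```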

R/targets_utils.R

Lines changed: 1 addition & 0 deletions
@@ -155,6 +155,7 @@ make_shared_grids <- function() {
     )
   )
 }
+
 #' Make list of common ensembles for forecasting experiments across projects
 #' @export
 make_shared_ensembles <- function() {

R/utils.R

Lines changed: 77 additions & 56 deletions
@@ -9,33 +9,6 @@ covidhub_probs <- function(type = c("standard", "inc_case")) {
   )
 }
 
-
-#' add a unique id based on the column contents
-#' @description
-#' create a string of `n_adj` that is a hash of the parameters
-#' and append the `ahead` at the end.
-#' @param df the df to add a column to. everything should be convertable to a string
-#' @param n_adj the number of adjectives to use; default of 2.
-#' @importFrom cli hash_animal
-#' @export
-add_id <- function(df, n_adj = 2) {
-  no_ahead <- df %>%
-    select(-ahead)
-  stringified <- no_ahead %>%
-    select(order(colnames(no_ahead))) %>%
-    rowwise() %>%
-    mutate(id = paste(across(everything()), sep = "", collapse = ""), .keep = "none") %>%
-    mutate(id = hash_animal(id, n_adj = n_adj)$words) %>%
-    mutate(id = paste(id[1:n_adj], sep = "", collapse = "."))
-  df %<>%
-    ungroup() %>%
-    mutate(parent_id = stringified$id) %>%
-    rowwise() %>%
-    mutate(id = paste(parent_id, ahead, sep = ".", collapse = " ")) %>%
-    ungroup()
-  return(df)
-}
-
 #' look up forecasters by name
 #' @description
 #' given a (partial) forecaster name, look up all forecasters in the given project which contain part of that name.

@@ -105,6 +78,31 @@ ensemble_missing_forecasters_details <- function(ensemble_grid = NULL, param_gri
   return(unique_missing)
 }
 
+#' add a unique id based on the column contents
+#' @description
+#' create a string of `n_adj` that is a hash of the parameters
+#' and append the `ahead` at the end.
+#' @param df the df to add a column to. everything should be convertable to a string
+#' @param n_adj the number of adjectives to use; default of 2.
+#' @importFrom cli hash_animal
+#' @export
+add_id <- function(df, n_adj = 2) {
+  no_ahead <- df %>%
+    select(-ahead)
+  stringified <- no_ahead %>%
+    select(order(colnames(no_ahead))) %>%
+    rowwise() %>%
+    mutate(id = paste(across(everything()), sep = "", collapse = ""), .keep = "none") %>%
+    mutate(id = hash_animal(id, n_adj = n_adj)$words) %>%
+    mutate(id = paste(id[1:n_adj], sep = "", collapse = "."))
+  df %<>%
+    ungroup() %>%
+    mutate(parent_id = stringified$id) %>%
+    rowwise() %>%
+    mutate(id = paste(parent_id, ahead, sep = ".", collapse = " ")) %>%
+    ungroup()
+  return(df)
+}
 
 #' generate an id from a simple list of parameters
 #' @param param_list the list of parameters. must include `ahead` if `ahead = NULL`
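`add_id()` is moved, not modified: it hashes every non-`ahead` column into a human-readable `parent_id` via `cli::hash_animal()` and appends the `ahead` to form `id`. A hedged usage sketch; the hash words shown are illustrative, not real output:

```r
library(dplyr)
library(tibble)

params <- tibble(
  forecaster  = "scaled_pop",
  pop_scaling = TRUE,
  ahead       = c(1, 7, 14)
)

add_id(params, n_adj = 2)
#> # A tibble: 3 x 5   (hash words below are illustrative)
#>   forecaster pop_scaling ahead parent_id       id
#>   scaled_pop TRUE            1 dapper.dormouse dapper.dormouse.1
#>   scaled_pop TRUE            7 dapper.dormouse dapper.dormouse.7
#>   scaled_pop TRUE           14 dapper.dormouse dapper.dormouse.14
```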
@@ -153,28 +151,6 @@ id_ahead_ensemble_grid <- function(ensemble_grid, aheads, n_adj = 2) {
   return(ensemble_grid)
 }
 
-
-#' temporary patch that pulls `NA`'s out of an epi_df
-#' @description
-#' just delete rows that have NA's in them. eventually epipredict should directly handle this so we don't have to
-#' @param epi_data the epi_df to be fixed
-#' @param outcome the column name containing the target variable
-#' @param extra_sources any other columns used as predictors
-#' @importFrom tidyr drop_na
-#' @importFrom epiprocess as_epi_df
-#' @export
-clear_lastminute_nas <- function(epi_data, outcome, extra_sources) {
-  meta_data <- attr(epi_data, "metadata")
-  if (extra_sources == c("")) {
-    extra_sources <- character(0L)
-  }
-  epi_data %<>%
-    drop_na(c(!!outcome, !!!extra_sources)) %>%
-    as_epi_df()
-  attr(epi_data, "metadata") <- meta_data
-  return(epi_data)
-}
-
 #' convert a list of forecasters
 #' @description
 #' the required format for targets is a little jank; this takes a human legible tibble and makes it targets legible.

@@ -197,6 +173,17 @@ make_target_param_grid <- function(param_grid) {
     param_names = list_names
   )
 }
+
+#' helper function for `make_target_param_grid`
+#' @keywords internal
+lists_of_real_values <- function(param_grid) {
+  full_lists <- transpose(param_grid %>% select(-forecaster, -id))
+  filter_nonvalues <- function(x) {
+    Filter(function(a) !all(is.null(a)) && !all(is.na(a)), x)
+  }
+  map(full_lists, filter_nonvalues)
+}
+
 #' convert a list of forecasters
 #' @description
 #' the required format for targets is a little jank; this takes a human legible tibble and makes it targets legible.
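The relocated `lists_of_real_values()` helper transposes the parameter grid into one list per row and drops entries that are entirely `NULL` or `NA`. A small illustration, assuming the package namespace is loaded (it is an internal helper); the column names here are made up for the example:

```r
library(tibble)

grid <- tibble(
  forecaster   = "scaled_pop",
  id           = "whimsical.heron.1",
  ahead        = 1,
  lags         = list(c(0, 7, 14)),
  extra_source = NA
)

lists_of_real_values(grid)
#> [[1]]
#> [[1]]$ahead
#> [1] 1
#>
#> [[1]]$lags
#> [1]  0  7 14
```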
@@ -215,19 +202,53 @@ make_target_ensemble_grid <- function(param_grid, ONE_AHEAD_FORECASTER_NAME = "f
     mutate(forecaster_ids = list(syms(paste(ONE_AHEAD_FORECASTER_NAME, forecaster_ids, sep = "_"))))
   return(param_grid)
 }
+
 #' function to map
 #' @keywords internal
 #' @param sym_names a list of the parameter names that should be turned into symbols
 sym_subset <- function(param_list, sym_names = list("average_type")) {
   imap(param_list, \(x, y) if (y %in% sym_names) sym(x) else x)
 }
 
-#' helper function for `make_target_param_grid`
-#' @keywords internal
-lists_of_real_values <- function(param_grid) {
-  full_lists <- transpose(param_grid %>% select(-forecaster, -id))
-  filter_nonvalues <- function(x) {
-    Filter(function(a) !all(is.null(a)) && !all(is.na(a)), x)
+#' temporary patch that pulls `NA`'s out of an epi_df
+#' @description
+#' just delete rows that have NA's in them. eventually epipredict should directly handle this so we don't have to
+#' @param epi_data the epi_df to be fixed
+#' @param outcome the column name containing the target variable
+#' @param extra_sources any other columns used as predictors
+#' @importFrom tidyr drop_na
+#' @importFrom epiprocess as_epi_df
+#' @export
+clear_lastminute_nas <- function(epi_data, outcome, extra_sources) {
+  meta_data <- attr(epi_data, "metadata")
+  if (extra_sources == c("")) {
+    extra_sources <- character(0L)
   }
-  map(full_lists, filter_nonvalues)
+  epi_data %<>%
+    drop_na(c(!!outcome, !!!extra_sources)) %>%
+    as_epi_df()
+  attr(epi_data, "metadata") <- meta_data
+  return(epi_data)
+}
+
+#' Get exclusions from a JSON file for a given date
+#'
+#' @param date A date
+#' @param exclusions_json A JSON file with exclusions in the format:
+#'
+#'   {"exclusions": {"2024-03-24": "ak,hi"}}
+#'
+#' @export
+get_exclusions <- function(
+    date,
+    exclusions_json = here::here("scripts", "geo_exclusions.json")) {
+  if (!file.exists(exclusions_json)) {
+    return("")
+  }
+
+  s <- jsonlite::read_json(exclusions_json)$exclusions[[as.character(date)]]
+  if (!is.null(s)) {
+    return(strsplit(s, ",")[[1]])
+  }
+  return("")
 }
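A usage sketch for the new `get_exclusions()`, assuming a `scripts/geo_exclusions.json` shaped like the example in its roxygen block:

```r
# Assuming scripts/geo_exclusions.json contains (as in the roxygen example):
#   {"exclusions": {"2024-03-24": "ak,hi"}}

get_exclusions(as.Date("2024-03-24"))
#> [1] "ak" "hi"

get_exclusions(as.Date("2024-03-31")) # date with no exclusions
#> [1] ""
```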

README.md

Lines changed: 19 additions & 30 deletions
@@ -1,6 +1,6 @@
 # Exploration Tooling
 
-This repo is meant to be a place to explore different forecasting methods and tools for both COVID and flu.
+This repo is for exploring forecasting methods and tools for both COVID and Flu.
 The repo is structured as a [targets](https://docs.ropensci.org/targets/) project, which means that it is easy to run things in parallel and to cache results.
 The repo is also structured as an R package, which means that it is easy to share code between different targets.
 

@@ -12,7 +12,6 @@ Define run parameters:
 # Save to your `.Renviron` file:
 EPIDATR_USE_CACHE=true
 # not strictly necessary, but you probably want a long cache time, since this is for the historical data
-EPIDATR_CACHE_DIR=~/.epidatr-cache
 EPIDATR_CACHE_MAX_AGE_DAYS=42
 DEBUG_MODE=false
 USE_SHINY=false

@@ -21,22 +20,20 @@ EXTERNAL_SCORES_PATH=legacy-exploration-scorecards.qs
 AWS_S3_PREFIX=exploration
 ```
 
-- `EPIDATR_USE_CACHE` controls whether `epidatr` functions use the cache.
-- `DEBUG_MODE` controls whether `targets::tar_make` is run with the `callr_function=NULL`, which allows for debugging. This only works if parallelization has been turned off in `scripts/targets-common.R` by setting the default controller to serial on line 51.
-- `USE_SHINY` controls whether we start a Shiny server after producing the targets.
-- `TAR_PROJECT` controls which `targets` project is run by `run.R`. Likely either `covid_hosp_explore` or `flu_hosp_explore`
-- `EXTERNAL_SCORES_PATH` controls where external scores are loaded from. If not set, external scores are not used.
-- `AWS_S3_PREFIX` controls the prefix to use in the AWS S3 bucket (a prefix is a pseudo-directory in a bucket).
+- `EPIDATR_USE_CACHE` controls whether `epidatr` functions use the cache.
+- `DEBUG_MODE` controls whether `targets::tar_make` is run with the `callr_function=NULL`, which allows for debugging. This only works if parallelization has been turned off in `scripts/targets-common.R` by setting the default controller to serial on line 51.
+- `USE_SHINY` controls whether we start a Shiny server after producing the targets.
+- `TAR_PROJECT` controls which `targets` project is run by `run.R`. Likely either `covid_hosp_explore` or `flu_hosp_explore`
+- `EXTERNAL_SCORES_PATH` controls where external scores are loaded from. If not set, external scores are not used.
+- `AWS_S3_PREFIX` controls the prefix to use in the AWS S3 bucket (a prefix is a pseudo-directory in a bucket).
 
 Run the pipeline using:
 
 ```sh
-# Install renv and R dependencies.
+# Install renv and R dependencies
 make install
 
 # Pull pre-scored forecasts from the AWS bucket
-make download
-# or
 make pull
 
 # Run only the dashboard, to display results run on other machines

@@ -47,32 +44,23 @@ make run
 # or in the background
 make run-nohup
 
-# Upload/push complete or partial results to the AWS bucket
-make upload
-# or
+# Push complete or partial results to the AWS bucket
 make push
 ```
 
-- `EPIDATR_USE_CACHE` controls whether `epidatr` functions use the cache.
-- `DEBUG_MODE` controls whether `targets::tar_make` is run with the `callr_function=NULL`, which allows for `browser()`. It also disables parallelization. If you are developing, it is recommended to set this to true. If you are just running, it is recommended to set it to false.
-- `USE_SHINY` controls whether we start a Shiny server after producing the targets.
-- `TAR_PROJECT` controls which `targets` project is run by `run.R`.
-- `EXTERNAL_SCORES_PATH` controls where external scores are loaded from. If not set, external scores are not used.
-- `AWS_S3_PREFIX` controls the prefix to use in the AWS S3 bucket (a prefix is a pseudo-directory in a bucket).
-
 ## Development
 
 ### Directory Layout
 
-- `run.R` and `Makefile`: the main entrypoint for all pipelines
-- `R/`: R package code to be reused
-- `scripts/`: plotting, code, and misc.
-- `tests/`: package tests
-- `covid_hosp_explore/` and `covid_hosp_explore.R`: a `targets` project for exploring covid hospitalization forecasters
-- `flu_hosp_explore/` and `flu_hosp_explore.R`: a `targets` project for exploring flu hospitalization forecasters
-- `covid_hosp_prod/` and `covid_hosp_prod.R`: a `targets` project for predicting covid hospitalizations
-- `flu_hosp_prod/` and `flu_hosp_prod.R`: a `targets` project for predicting flu hospitalizations
-- `forecaster_testing/` and `forecaster_testing.R`: a `targets` project for testing forecasters
+- `Makefile`: the main entrypoint for all pipelines
+- `R/`: R package code to be reused
+- `scripts/`: plotting, code, and misc.
+- `tests/`: package tests
+- `covid_hosp_explore/` and `scripts/covid_hosp_explore.R`: a `targets` project for exploring covid hospitalization forecasters
+- `flu_hosp_explore/` and `scripts/flu_hosp_explore.R`: a `targets` project for exploring flu hospitalization forecasters
+- `covid_hosp_prod/` and `scripts/covid_hosp_prod.R`: a `targets` project for predicting covid hospitalizations
+- `flu_hosp_prod/` and `scripts/flu_hosp_prod.R`: a `targets` project for predicting flu hospitalizations
+- `forecaster_testing/` and `scripts/forecaster_testing.R`: a `targets` project for testing forecasters
 
 ### Parallelization Gotchas
 

@@ -84,6 +72,7 @@ It is safest to develop with parallelism disabled.
 Targets in parallel mode has two problems when it comes to debugging: 1) it ignores browsers, so you can't step through functions and 2) reloading any changes requires both `renv::install(".")` and restarting R.
 
 To debug a target named `yourTarget`:
+
 1. set `DEBUG_MODE=true`
 2. insert a browser in the relevant function
 3. run an R session, and call `tar_make(yourTarget)`
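A minimal sketch of that debugging loop in an interactive session; `my_forecaster` and `yourTarget` are placeholders, and passing `callr_function = NULL` directly mimics what `DEBUG_MODE=true` arranges via `run.R`:

```r
# 1. turn on debug mode for this session (or set DEBUG_MODE=true in .Renviron)
Sys.setenv(DEBUG_MODE = "true")

# 2. drop a browser() into the relevant function, e.g.
my_forecaster <- function(epi_data, ahead) {
  browser() # execution pauses here when the target builds
  # ... rest of the forecaster
}

# 3. build just that target in the current session so the browser() is hit
targets::tar_make(yourTarget, callr_function = NULL)
```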

covid_hosp_prod/.gitignore

Lines changed: 6 additions & 0 deletions
@@ -0,0 +1,6 @@
+*
+!.gitignore
+!meta
+!*.R
+meta/*
+# !meta/meta

man/get_exclusions.Rd

Lines changed: 21 additions & 0 deletions
Some generated files are not rendered by default.

man/locf_latency.Rd

Lines changed: 0 additions & 17 deletions
This file was deleted.

man/run_workflow_and_format.Rd

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default.
