diff --git a/.gitignore b/.gitignore index 0c15fc15..9f015c35 100644 --- a/.gitignore +++ b/.gitignore @@ -9,3 +9,4 @@ scripts/**.html nohup.out run.Rout tmp.R +reports/ \ No newline at end of file diff --git a/DESCRIPTION b/DESCRIPTION index e57ff371..d1542580 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -21,6 +21,7 @@ Imports: epipredict, epiprocess, here, + jsonlite, lubridate, magrittr, parsnip (>= 1.0.0), diff --git a/Makefile b/Makefile index 4e8d5698..8d3fc167 100644 --- a/Makefile +++ b/Makefile @@ -14,16 +14,12 @@ run-nohup: sync: Rscript scripts/sync.R -download: +pull: Rscript scripts/sync.R download -pull: download - -upload: +push: Rscript scripts/sync.R upload -push: upload - dashboard: Rscript scripts/dashboard.R diff --git a/NAMESPACE b/NAMESPACE index 5c80102d..3fa8cd75 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -16,6 +16,7 @@ export(extend_ahead) export(flatline_fc) export(forecaster_lookup) export(format_storage) +export(get_exclusions) export(id_ahead_ensemble_grid) export(interval_coverage) export(make_data_targets) diff --git a/R/epipredict_utilities.R b/R/epipredict_utilities.R index 083ce35f..a058ea94 100644 --- a/R/epipredict_utilities.R +++ b/R/epipredict_utilities.R @@ -65,7 +65,7 @@ arx_postprocess <- function(postproc, return(postproc) } -#' helper function to run a epipredict model and reformat to hub format +#' run_workflow_and_format #' @description #' helper function to run a epipredict model and reformat to hub format #' @param preproc the preprocessing steps diff --git a/R/latency_adjusting.R b/R/latency_adjusting.R index b9bd97d6..b0dd73bb 100644 --- a/R/latency_adjusting.R +++ b/R/latency_adjusting.R @@ -24,12 +24,3 @@ extend_ahead <- function(epi_data, ahead) { } return(list(epi_data, effective_ahead)) } - -#' last observation carried forward -#' @description -#' instead of modifying `ahead`, interpolate `epi_data` to contain last -#' observation carried forward -#' @param epi_data the dataset -#' @param ahead how many units (depending on the dataset, normally days or weeks) to predict ahead of the `forecast_date` -locf_latency <- function(epi_data, ahead) { -} diff --git a/R/targets_utils.R b/R/targets_utils.R index 3c99fbc8..b0a53bcb 100644 --- a/R/targets_utils.R +++ b/R/targets_utils.R @@ -155,6 +155,7 @@ make_shared_grids <- function() { ) ) } + #' Make list of common ensembles for forecasting experiments across projects #' @export make_shared_ensembles <- function() { diff --git a/R/utils.R b/R/utils.R index 5eb22175..ba819240 100644 --- a/R/utils.R +++ b/R/utils.R @@ -9,33 +9,6 @@ covidhub_probs <- function(type = c("standard", "inc_case")) { ) } - -#' add a unique id based on the column contents -#' @description -#' create a string of `n_adj` that is a hash of the parameters -#' and append the `ahead` at the end. -#' @param df the df to add a column to. everything should be convertable to a string -#' @param n_adj the number of adjectives to use; default of 2. -#' @importFrom cli hash_animal -#' @export -add_id <- function(df, n_adj = 2) { - no_ahead <- df %>% - select(-ahead) - stringified <- no_ahead %>% - select(order(colnames(no_ahead))) %>% - rowwise() %>% - mutate(id = paste(across(everything()), sep = "", collapse = ""), .keep = "none") %>% - mutate(id = hash_animal(id, n_adj = n_adj)$words) %>% - mutate(id = paste(id[1:n_adj], sep = "", collapse = ".")) - df %<>% - ungroup() %>% - mutate(parent_id = stringified$id) %>% - rowwise() %>% - mutate(id = paste(parent_id, ahead, sep = ".", collapse = " ")) %>% - ungroup() - return(df) -} - #' look up forecasters by name #' @description #' given a (partial) forecaster name, look up all forecasters in the given project which contain part of that name. @@ -105,6 +78,31 @@ ensemble_missing_forecasters_details <- function(ensemble_grid = NULL, param_gri return(unique_missing) } +#' add a unique id based on the column contents +#' @description +#' create a string of `n_adj` that is a hash of the parameters +#' and append the `ahead` at the end. +#' @param df the df to add a column to. everything should be convertable to a string +#' @param n_adj the number of adjectives to use; default of 2. +#' @importFrom cli hash_animal +#' @export +add_id <- function(df, n_adj = 2) { + no_ahead <- df %>% + select(-ahead) + stringified <- no_ahead %>% + select(order(colnames(no_ahead))) %>% + rowwise() %>% + mutate(id = paste(across(everything()), sep = "", collapse = ""), .keep = "none") %>% + mutate(id = hash_animal(id, n_adj = n_adj)$words) %>% + mutate(id = paste(id[1:n_adj], sep = "", collapse = ".")) + df %<>% + ungroup() %>% + mutate(parent_id = stringified$id) %>% + rowwise() %>% + mutate(id = paste(parent_id, ahead, sep = ".", collapse = " ")) %>% + ungroup() + return(df) +} #' generate an id from a simple list of parameters #' @param param_list the list of parameters. must include `ahead` if `ahead = NULL` @@ -153,28 +151,6 @@ id_ahead_ensemble_grid <- function(ensemble_grid, aheads, n_adj = 2) { return(ensemble_grid) } - -#' temporary patch that pulls `NA`'s out of an epi_df -#' @description -#' just delete rows that have NA's in them. eventually epipredict should directly handle this so we don't have to -#' @param epi_data the epi_df to be fixed -#' @param outcome the column name containing the target variable -#' @param extra_sources any other columns used as predictors -#' @importFrom tidyr drop_na -#' @importFrom epiprocess as_epi_df -#' @export -clear_lastminute_nas <- function(epi_data, outcome, extra_sources) { - meta_data <- attr(epi_data, "metadata") - if (extra_sources == c("")) { - extra_sources <- character(0L) - } - epi_data %<>% - drop_na(c(!!outcome, !!!extra_sources)) %>% - as_epi_df() - attr(epi_data, "metadata") <- meta_data - return(epi_data) -} - #' convert a list of forecasters #' @description #' the required format for targets is a little jank; this takes a human legible tibble and makes it targets legible. @@ -197,6 +173,17 @@ make_target_param_grid <- function(param_grid) { param_names = list_names ) } + +#' helper function for `make_target_param_grid` +#' @keywords internal +lists_of_real_values <- function(param_grid) { + full_lists <- transpose(param_grid %>% select(-forecaster, -id)) + filter_nonvalues <- function(x) { + Filter(function(a) !all(is.null(a)) && !all(is.na(a)), x) + } + map(full_lists, filter_nonvalues) +} + #' convert a list of forecasters #' @description #' the required format for targets is a little jank; this takes a human legible tibble and makes it targets legible. @@ -215,6 +202,7 @@ make_target_ensemble_grid <- function(param_grid, ONE_AHEAD_FORECASTER_NAME = "f mutate(forecaster_ids = list(syms(paste(ONE_AHEAD_FORECASTER_NAME, forecaster_ids, sep = "_")))) return(param_grid) } + #' function to map #' @keywords internal #' @param sym_names a list of the parameter names that should be turned into symbols @@ -222,12 +210,45 @@ sym_subset <- function(param_list, sym_names = list("average_type")) { imap(param_list, \(x, y) if (y %in% sym_names) sym(x) else x) } -#' helper function for `make_target_param_grid` -#' @keywords internal -lists_of_real_values <- function(param_grid) { - full_lists <- transpose(param_grid %>% select(-forecaster, -id)) - filter_nonvalues <- function(x) { - Filter(function(a) !all(is.null(a)) && !all(is.na(a)), x) +#' temporary patch that pulls `NA`'s out of an epi_df +#' @description +#' just delete rows that have NA's in them. eventually epipredict should directly handle this so we don't have to +#' @param epi_data the epi_df to be fixed +#' @param outcome the column name containing the target variable +#' @param extra_sources any other columns used as predictors +#' @importFrom tidyr drop_na +#' @importFrom epiprocess as_epi_df +#' @export +clear_lastminute_nas <- function(epi_data, outcome, extra_sources) { + meta_data <- attr(epi_data, "metadata") + if (extra_sources == c("")) { + extra_sources <- character(0L) } - map(full_lists, filter_nonvalues) + epi_data %<>% + drop_na(c(!!outcome, !!!extra_sources)) %>% + as_epi_df() + attr(epi_data, "metadata") <- meta_data + return(epi_data) +} + +#' Get exclusions from a JSON file for a given date +#' +#' @param date A date +#' @param exclusions_json A JSON file with exclusions in the format: +#' +#' {"exclusions": {"2024-03-24": "ak,hi"}} +#' +#' @export +get_exclusions <- function( + date, + exclusions_json = here::here("scripts", "geo_exclusions.json")) { + if (!file.exists(exclusions_json)) { + return("") + } + + s <- jsonlite::read_json(exclusions_json)$exclusions[[as.character(date)]] + if (!is.null(s)) { + return(strsplit(s, ",")[[1]]) + } + return("") } diff --git a/README.md b/README.md index 0ab6af67..39bac1a4 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # Exploration Tooling -This repo is meant to be a place to explore different forecasting methods and tools for both COVID and flu. +This repo is for exploring forecasting methods and tools for both COVID and Flu. The repo is structured as a [targets](https://docs.ropensci.org/targets/) project, which means that it is easy to run things in parallel and to cache results. The repo is also structured as an R package, which means that it is easy to share code between different targets. @@ -12,7 +12,6 @@ Define run parameters: # Save to your `.Renviron` file: EPIDATR_USE_CACHE=true # not strictly necessary, but you probably want a long cache time, since this is for the historical data -EPIDATR_CACHE_DIR=~/.epidatr-cache EPIDATR_CACHE_MAX_AGE_DAYS=42 DEBUG_MODE=false USE_SHINY=false @@ -21,22 +20,20 @@ EXTERNAL_SCORES_PATH=legacy-exploration-scorecards.qs AWS_S3_PREFIX=exploration ``` -- `EPIDATR_USE_CACHE` controls whether `epidatr` functions use the cache. -- `DEBUG_MODE` controls whether `targets::tar_make` is run with the `callr_function=NULL`, which allows for debugging. This only works if parallelization has been turned off in `scripts/targets-common.R` by setting the default controller to serial on line 51. -- `USE_SHINY` controls whether we start a Shiny server after producing the targets. -- `TAR_PROJECT` controls which `targets` project is run by `run.R`. Likely either `covid_hosp_explore` or `flu_hosp_explore` -- `EXTERNAL_SCORES_PATH` controls where external scores are loaded from. If not set, external scores are not used. -- `AWS_S3_PREFIX` controls the prefix to use in the AWS S3 bucket (a prefix is a pseudo-directory in a bucket). +- `EPIDATR_USE_CACHE` controls whether `epidatr` functions use the cache. +- `DEBUG_MODE` controls whether `targets::tar_make` is run with the `callr_function=NULL`, which allows for debugging. This only works if parallelization has been turned off in `scripts/targets-common.R` by setting the default controller to serial on line 51. +- `USE_SHINY` controls whether we start a Shiny server after producing the targets. +- `TAR_PROJECT` controls which `targets` project is run by `run.R`. Likely either `covid_hosp_explore` or `flu_hosp_explore` +- `EXTERNAL_SCORES_PATH` controls where external scores are loaded from. If not set, external scores are not used. +- `AWS_S3_PREFIX` controls the prefix to use in the AWS S3 bucket (a prefix is a pseudo-directory in a bucket). Run the pipeline using: ```sh -# Install renv and R dependencies. +# Install renv and R dependencies make install # Pull pre-scored forecasts from the AWS bucket -make download -# or make pull # Run only the dashboard, to display results run on other machines @@ -47,32 +44,23 @@ make run # or in the background make run-nohup -# Upload/push complete or partial results to the AWS bucket -make upload -# or +# Push complete or partial results to the AWS bucket make push ``` -- `EPIDATR_USE_CACHE` controls whether `epidatr` functions use the cache. -- `DEBUG_MODE` controls whether `targets::tar_make` is run with the `callr_function=NULL`, which allows for `browser()`. It also disables parallelization. If you are developing, it is recommended to set this to true. If you are just running, it is recommended to set it to false. -- `USE_SHINY` controls whether we start a Shiny server after producing the targets. -- `TAR_PROJECT` controls which `targets` project is run by `run.R`. -- `EXTERNAL_SCORES_PATH` controls where external scores are loaded from. If not set, external scores are not used. -- `AWS_S3_PREFIX` controls the prefix to use in the AWS S3 bucket (a prefix is a pseudo-directory in a bucket). - ## Development ### Directory Layout -- `run.R` and `Makefile`: the main entrypoint for all pipelines -- `R/`: R package code to be reused -- `scripts/`: plotting, code, and misc. -- `tests/`: package tests -- `covid_hosp_explore/` and `covid_hosp_explore.R`: a `targets` project for exploring covid hospitalization forecasters -- `flu_hosp_explore/` and `flu_hosp_explore.R`: a `targets` project for exploring flu hospitalization forecasters -- `covid_hosp_prod/` and `covid_hosp_prod.R`: a `targets` project for predicting covid hospitalizations -- `flu_hosp_prod/` and `flu_hosp_prod.R`: a `targets` project for predicting flu hospitalizations -- `forecaster_testing/` and `forecaster_testing.R`: a `targets` project for testing forecasters +- `Makefile`: the main entrypoint for all pipelines +- `R/`: R package code to be reused +- `scripts/`: plotting, code, and misc. +- `tests/`: package tests +- `covid_hosp_explore/` and `scripts/covid_hosp_explore.R`: a `targets` project for exploring covid hospitalization forecasters +- `flu_hosp_explore/` and `scripts/flu_hosp_explore.R`: a `targets` project for exploring flu hospitalization forecasters +- `covid_hosp_prod/` and `scripts/covid_hosp_prod.R`: a `targets` project for predicting covid hospitalizations +- `flu_hosp_prod/` and `scripts/flu_hosp_prod.R`: a `targets` project for predicting flu hospitalizations +- `forecaster_testing/` and `scripts/forecaster_testing.R`: a `targets` project for testing forecasters ### Parallelization Gotchas @@ -84,6 +72,7 @@ It is safest to develop with parallelism disabled. Targets in parallel mode has two problems when it comes to debugging: 1) it ignores browsers, so you can't step through functions and 2) reloading any changes requires both `renv::install(".")` and restarting R. To debug a target named `yourTarget`: + 1. set `DEBUG_MODE=true` 2. insert a browser in the relevant function 3. run an R session, and call `tar_make(yourTarget)` diff --git a/covid_hosp_prod/.gitignore b/covid_hosp_prod/.gitignore new file mode 100644 index 00000000..4f8b6a01 --- /dev/null +++ b/covid_hosp_prod/.gitignore @@ -0,0 +1,6 @@ +* +!.gitignore +!meta +!*.R +meta/* +# !meta/meta diff --git a/man/get_exclusions.Rd b/man/get_exclusions.Rd new file mode 100644 index 00000000..08b7f4c7 --- /dev/null +++ b/man/get_exclusions.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils.R +\name{get_exclusions} +\alias{get_exclusions} +\title{Get exclusions from a JSON file for a given date} +\usage{ +get_exclusions( + date, + exclusions_json = here::here("scripts", "geo_exclusions.json") +) +} +\arguments{ +\item{date}{A date} + +\item{exclusions_json}{A JSON file with exclusions in the format: + +{"exclusions": {"2024-03-24": "ak,hi"}}} +} +\description{ +Get exclusions from a JSON file for a given date +} diff --git a/man/locf_latency.Rd b/man/locf_latency.Rd deleted file mode 100644 index b76ad554..00000000 --- a/man/locf_latency.Rd +++ /dev/null @@ -1,17 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/latency_adjusting.R -\name{locf_latency} -\alias{locf_latency} -\title{last observation carried forward} -\usage{ -locf_latency(epi_data, ahead) -} -\arguments{ -\item{epi_data}{the dataset} - -\item{ahead}{how many units (depending on the dataset, normally days or weeks) to predict ahead of the \code{forecast_date}} -} -\description{ -instead of modifying \code{ahead}, interpolate \code{epi_data} to contain last -observation carried forward -} diff --git a/man/run_workflow_and_format.Rd b/man/run_workflow_and_format.Rd index e1017e54..338f7c37 100644 --- a/man/run_workflow_and_format.Rd +++ b/man/run_workflow_and_format.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/epipredict_utilities.R \name{run_workflow_and_format} \alias{run_workflow_and_format} -\title{helper function to run a epipredict model and reformat to hub format} +\title{run_workflow_and_format} \usage{ run_workflow_and_format(preproc, postproc, trainer, epi_data) } diff --git a/scripts/covid_hosp_explore.R b/scripts/covid_hosp_explore.R index b7ac5ca9..d6da456e 100644 --- a/scripts/covid_hosp_explore.R +++ b/scripts/covid_hosp_explore.R @@ -23,9 +23,8 @@ make_unique_grids <- function() { forecaster = "smoothed_scaled", trainer = c("quantreg"), ahead = c(1:7, 14, 21, 28), - # lags = list( - # smoothed, sd, smoothed, sd + # list(smoothed, sd) list(c(0, 3, 5, 7, 14), c(0)), list(c(0, 7, 14, 21, 28), c(0)), list(c(0, 2, 4, 7, 14, 21, 28), c(0)) diff --git a/scripts/covid_hosp_prod.R b/scripts/covid_hosp_prod.R index 46409041..b5443477 100644 --- a/scripts/covid_hosp_prod.R +++ b/scripts/covid_hosp_prod.R @@ -1 +1,96 @@ -# TODO +# The COVID Hospitalization Production Forecasting Pipeline. +# +# Ran into some issues with targets: +# https://github.com/ropensci/targets/discussions/666#discussioncomment-9050772 +# + +source("scripts/targets-common.R") + +insufficient_data_geos <- c("as", "gu", "mp", "vi") +forecast_generation_date <- as.character(seq.Date(as.Date("2024-04-22"), Sys.Date(), by = "1 week")) +# forecast_generation_date <- as.character(seq.Date(as.Date("2024-01-01"), Sys.Date(), by = "1 week")) +bad_forecast_exclusions <- Vectorize(epieval::get_exclusions)(forecast_generation_date) +forecaster_fns <- list( + function(...) { + smoothed_scaled( + ..., + outcome = "hhs", + pop_scaling = TRUE, + trainer = epipredict::quantile_reg(), + lags = list(c(0, 7, 14, 21, 28), c(0)) + ) + } +) + +rlang::list2( + tar_target( + aheads, + command = { + 1:28 + } + ), + tar_target( + forecasters, + command = { + seq_along(forecaster_fns) + } + ), + tar_map( + values = tidyr::expand_grid( + tibble( + forecast_generation_date = forecast_generation_date, + bad_forecast_exclusions = bad_forecast_exclusions + ) + ), + names = "forecast_generation_date", + tar_target( + hhs_latest_data, + command = { + epidatr::pub_covidcast( + source = "hhs", + signals = "confirmed_admissions_covid_1d", + geo_type = "state", + time_type = "day", + geo_values = "*", + time_values = epidatr::epirange(from = "2020-01-01", to = forecast_generation_date), + as_of = forecast_generation_date, + fetch_args = epidatr::fetch_args_list(return_empty = TRUE, timeout_seconds = 400) + ) %>% + select(geo_value, time_value, value, issue) %>% + rename("hhs" := value) %>% + rename(version = issue) %>% + filter(!geo_value %in% insufficient_data_geos) + } + ), + tar_target( + forecast, + command = { + hhs_latest_data %>% + as_epi_df() %>% + forecaster_fns[[forecasters]](ahead = aheads) %>% + mutate( + forecaster = sprintf("epipredict_%s", forecasters), + geo_value = as.factor(geo_value) + ) + }, + pattern = cross(aheads, forecasters) + ), + tar_target( + notebook, + command = { + if (!dir.exists(here::here("reports"))) dir.create(here::here("reports")) + rmarkdown::render( + "scripts/covid_hosp_prod.Rmd", + output_file = here::here( + "reports", + sprintf("covid_hosp_prod_%s.html", forecast_generation_date) + ), + params = list( + forecast = forecast, + bad_forecast_exclusions = bad_forecast_exclusions + ) + ) + } + ) + ) +) diff --git a/scripts/covid_hosp_prod.Rmd b/scripts/covid_hosp_prod.Rmd new file mode 100644 index 00000000..29d506e4 --- /dev/null +++ b/scripts/covid_hosp_prod.Rmd @@ -0,0 +1,65 @@ +--- +title: COVID Forecaster Predictions +author: COVID Forecast Team +date: "Rendered: `r format(Sys.time(), '%d %B %Y')`" +output: + html_document: + toc: True + # self_contained: False + # lib_dir: libs +params: + forecast_generation_date: !r Sys.Date() + forecast: !r "" + bad_forecast_exclusions: !r "" +--- + +```{css, echo=FALSE} +body { + display: block; + max-width: 1280px !important; + margin-left: auto; + margin-right: auto; +} + +body .main-container { + max-width: 1280px !important; + width: 1280px !important; +} +``` + +```{r setup, include=FALSE} +library(dplyr) +library(magrittr) +source(here::here("scripts", "plotting.R")) +``` + +## epipredict states + +```{r, fig.height = 60, fig.width = 12, echo=FALSE} +plot_forecasts( + params$forecast %>% filter(geo_value != "us"), + params$forecast_generation_date, + params$bad_forecast_exclusions, + "state" +) %>% + suppressMessages() %>% + suppressWarnings() +``` + +## epipredict national + +```{r, fig.width = 12, echo=FALSE} +plot_forecasts( + params$forecast %>% + summarise( + value = sum(value), + .by = c("forecaster", "forecast_date", "target_end_date", "quantile") + ) %>% + mutate(geo_value = "us"), + params$forecast_generation_date, + params$bad_forecast_exclusions, + "nation" +) %>% + suppressMessages() %>% + suppressWarnings() +``` diff --git a/scripts/geo_exclusions.json b/scripts/geo_exclusions.json new file mode 100644 index 00000000..23a2df61 --- /dev/null +++ b/scripts/geo_exclusions.json @@ -0,0 +1,5 @@ +{ + "exclusions": + { + } +} \ No newline at end of file diff --git a/scripts/plotting.R b/scripts/plotting.R index 0391dbba..776231d1 100644 --- a/scripts/plotting.R +++ b/scripts/plotting.R @@ -1,221 +1,86 @@ -# Copied and modified on 2023-09-29 from: -# cmu-delphi/flu-hosp-forecast/code/plotting.R +# Version: 2024-04-23 + +library(assertthat) +library(dplyr) +library(epidatr) +library(ggplot2) +library(magrittr) +library(tidyr) + + +plot_forecasts <- function(predictions_cards, forecast_date, exclude_geos, geo_type) { + assert_that(nrow(predictions_cards) > 0) + assert_that(geo_type %in% c("state", "nation")) + + signal_data <- pub_covidcast( + source = "hhs", + signals = "confirmed_admissions_covid_1d", + geo_type = geo_type, + time_type = "day", + geo_values = "*", + time_values = epirange("2023-12-01", forecast_date) + ) %>% + filter(!.data$geo_value %in% exclude_geos) %>% + select(.data$geo_value, .data$time_value, .data$value) %>% + rename(target_end_date = .data$time_value) %>% + mutate(data_source = "hhs", forecaster = "hhs hosp truth") -get_quantiles_df <- function(predictions_cards, intervals = c(.5, .9), ...) { - predictions_cards <- predictions_cards %>% - dplyr::select( - geo_value, - quantile, - value, - forecast_date, - target_end_date, - forecaster - ) - - lower_bounds <- predictions_cards %>% - select(quantile) %>% - filter(.data$quantile < 0.5) %>% - unique() %>% - pull() - - quantiles_to_plot <- as.integer(sort( - round(500L * (1 + intervals %o% c(-1L, 1L))) - )) - - quantiles_df <- predictions_cards %>% - filter(as.integer(round(.data$quantile * 1000)) %in% c(quantiles_to_plot)) %>% - mutate( - endpoint_type = if_else(.data$quantile < 0.5, "lower", "upper"), - alp = if_else(.data$endpoint_type == "lower", - format(2 * .data$quantile, digits = 3, nsmall = 3), - format(2 * (1 - .data$quantile), digits = 3, nsmall = 3) - ), - interval = forcats::fct_rev( - paste0((1 - as.numeric(.data$alp)) * 100, "%") - ) - ) %>% - select(-.data$quantile, -.data$alp) %>% - pivot_wider(names_from = "endpoint_type", values_from = "value") - - return(quantiles_df) -} - -get_points_df <- function(predictions_cards) { - points_df <- predictions_cards %>% - filter( - as.integer(round(.data$quantile * 1000)) == 500L | - is.na(.data$quantile) - ) - if (any(is.na(points_df$quantile))) { - points_df %<>% - pivot_wider(names_from = "quantile", values_from = "value") %>% - mutate(value = if_else(!is.na(.data$`NA`), .data$`NA`, .data$`0.5`)) %>% - select(-.data$`0.5`, -.data$`NA`) - } else { - points_df %<>% - select(-.data$quantile) - } - - return(points_df) -} - -plot_quantiles <- function(g, quantiles_df) { - n_quantiles <- nlevels(quantiles_df$interval) - l_quantiles <- levels(quantiles_df$interval) - - alp <- c(.4, .2, .1) - for (qq in n_quantiles:1) { + # Setup plot + g <- ggplot(signal_data, mapping = aes( + x = .data$target_end_date, + color = .data$forecaster, + fill = .data$forecaster + )) + + geom_line(mapping = aes(y = .data$value)) + + # Plot (symmetric) quantiles + quantiles <- c(0.8, 0.95) + alpha <- c(0.4, 0.2) + for (i in seq_along(quantiles)) { + q <- quantiles[i] + a <- alpha[i] g <- g + geom_ribbon( - data = quantiles_df %>% - filter(.data$interval == l_quantiles[qq]), + data = predictions_cards %>% + filter(near(.data$quantile, q) | near(.data$quantile, 1 - q)) %>% + mutate( + quantile = ifelse(near(.data$quantile, q), "upper", "lower") %>% + as.factor() + ) %>% + pivot_wider(names_from = "quantile", values_from = "value"), mapping = aes( ymin = .data$lower, ymax = .data$upper, - group = interaction(.data$forecast_date, .data$forecaster) + group = interaction(.data$forecast_date, .data$forecaster), + color = NULL ), - alpha = alp[qq] + alpha = a ) } - return(g) -} - -plot_points <- function(g, points_df) { - g <- g + geom_point( - data = points_df, - mapping = aes( - y = .data$value, - group = interaction(.data$forecast_date, .data$forecaster) - ) - ) - - return(g) -} - -plot_state_forecasters <- function( - predictions_cards, - exclude_geos = c(), - start_day = NULL, - ncol = 5) { - if (nrow(predictions_cards) == 0) { - return(NULL) - } - - predictions_cards %<>% - filter(!geo_value %in% exclude_geos) - - td1 <- tar_read("hhs_latest_data_2022") %>% - filter(!geo_value %in% exclude_geos) %>% - mutate(value = .data$value) %>% - rename(target_end_date = time_value) - td1$data_source <- "hhs" - td2 <- tar_read("chng_latest_data_2022") %>% - filter(!geo_value %in% exclude_geos) %>% - rename(target_end_date = time_value) - td2$data_source <- "chng" - td1_max <- td1 %>% - group_by(geo_value) %>% - summarize(max_value = max(value)) - td2_max <- td2 %>% - group_by(geo_value) %>% - summarize(max_value = max(value)) - td2_max <- td2_max %>% - left_join(td1_max, by = "geo_value", suffix = c(".2", ".1")) %>% - mutate(max_ratio = max_value.1 / max_value.2) - td2 %<>% - left_join(td2_max, by = "geo_value") %>% - mutate(scaled_value = value * max_ratio) - td1 %<>% mutate(forecaster = "hhs hosp truth") - td2 %<>% mutate(forecaster = "chng smoothed_adj_outpatient_flu current, scaled") - - # Setup plot - g <- ggplot(td1, mapping = aes( - x = .data$target_end_date, - color = .data$forecaster, - fill = .data$forecaster - )) - - points_df <- get_points_df(predictions_cards) - g <- plot_points(g, points_df) - - quantiles_df <- get_quantiles_df(predictions_cards) - g <- plot_quantiles(g, quantiles_df) - - # Plot truth data by geo + # Plot points g <- g + - geom_line(mapping = aes(y = .data$value)) + - geom_line(data = td2, mapping = aes( - x = .data$target_end_date, - y = .data$scaled_value - )) + - facet_wrap(~ .data$geo_value, scales = "free_y", ncol = ncol, drop = TRUE) + - theme(legend.position = "top", legend.text = element_text(size = 7)) - - return(g) -} + geom_point( + data = predictions_cards %>% + filter(near(.data$quantile, 0.5)), + mapping = aes( + y = .data$value, + group = interaction(.data$forecast_date, .data$forecaster) + ), + size = 0.125 + ) -plot_nation_forecasters <- function( - predictions_cards, - exclude_geos = c(), - start_day = NULL, - ncol = 5) { - if (nrow(predictions_cards) == 0) { - return(NULL) + if (geo_type == "state") { + # Add lines, facet, and theme + g <- g + + facet_wrap(~ .data$geo_value, scales = "free_y", ncol = 2, drop = TRUE) + + theme(legend.position = "top", legend.text = element_text(size = 7)) + } else if (geo_type == "nation") { + # Add lines and theme + g + + labs(fill = "Reported Signal") + + theme(legend.position = "top", legend.text = element_text(size = 7)) } - predictions_cards %<>% - filter(!geo_value %in% exclude_geos) %>% - group_by(forecaster, forecast_date, quantile, target_end_date) %>% - summarize(value = sum(value)) %>% - ungroup() %>% - mutate(geo_value = "us") - - td1 <- tar_read("hhs_latest_data_2022") %>% - filter(!geo_value %in% exclude_geos) %>% - group_by(time_value) %>% - summarize(value = sum(value)) %>% - ungroup() %>% - mutate(value = 7L * .data$value) %>% - rename(target_end_date = time_value) %>% - mutate(geo_value = "us") - td1$data_source <- "hhs" - td2 <- tar_read("chng_latest_data_2022") %>% - filter(!geo_value %in% exclude_geos) %>% - group_by(time_value) %>% - summarize(value = sum(value)) %>% - ungroup() %>% - rename(target_end_date = time_value) %>% - mutate(geo_value = "us") - td2$data_source <- "chng" - td1.max <- td1 %>% - summarize(max_value = max(value)) %>% - pull(max_value) - td2.max <- td2 %>% - summarize(max_value = max(value)) %>% - pull(max_value) - td2 %<>% - mutate(scaled_value = value * td1.max / td2.max) - - # Setup plot - g <- ggplot(td1, mapping = aes(x = .data$target_end_date)) - - quantiles_df <- get_quantiles_df(predictions_cards) - g <- plot_quantiles(g, quantiles_df) - - points_df <- get_points_df(predictions_cards) - g <- plot_points(g, points_df) - - # Plot truth data by geo - g <- g + - geom_line(mapping = aes(y = .data$value, color = "confirmed admissions")) + - geom_line(data = td2, mapping = aes( - x = .data$target_end_date, - y = .data$scaled_value, - color = "chng smoothed_adj_outpatient_flu, scaled" - )) + - labs(fill = "Reported Signal") + - theme(legend.position = "top", legend.text = element_text(size = 7)) - return(g) } diff --git a/scripts/report.Rmd b/scripts/report.Rmd deleted file mode 100644 index 334cdfc9..00000000 --- a/scripts/report.Rmd +++ /dev/null @@ -1,54 +0,0 @@ ---- -title: "Flu Forecasts" -author: "CMU-TimeSeries" -date: "`r format(Sys.time(), '%B %d, %Y')`" -output: html_document -params: - exclude_geos: !r character(0L) ---- - -```{r setup, include=FALSE, echo=FALSE} -# Copied and modified on 2023-09-29 from: -# cmu-delphi/flu-hosp-forecast/code/plot_prediction_cards.R -library(dplyr) -library(magrittr) -library(targets) - -source("scripts/plotting.R") - -flu_forecasts <- bind_rows( - tar_read("forecast_cheap.managerial.1"), - tar_read("forecast_clearcut.antiromantic.1"), - tar_read("forecast_honorary.salmonoid.5"), - .id = "forecaster" -) %>% filter(forecaster == "cheap managerial 1") -``` - -## Trajectory plots - -- Fan displays 50/80/95% confidence intervals - -```{r, fig.height = 80, fig.width = 15, echo=FALSE} -# setup the plot and join corrections to the truth -plot_state_forecasters( - flu_forecasts %>% filter(geo_value != "us"), - exclude_geos = params$exclude_geos, - start_day = "2022-01-01", - ncol = 2 -) %>% - suppressMessages() %>% - suppressWarnings() %>% - plotly::ggplotly() -``` - -```{r, fig.height = 10, fig.width = 15, echo=FALSE} -# setup the plot and join corrections to the truth -plot_nation_forecasters( - flu_forecasts, - exclude_geos = params$exclude_geos, - start_day = "2022-01-01" -) %>% - suppressMessages() %>% - suppressWarnings() %>% - plotly::ggplotly() -```