wip: production pipeline

dshemetov · dshemetov · commit be5084c43b48 · 2024-04-09T17:57:37.000-07:00
diff --git a/R/plotting.R b/R/plotting.R
@@ -0,0 +1,184 @@
+library(dplyr)
+library(evalcast)
+library(ggplot2)
+library(magrittr)
+library(tidyr)
+
+
+#' Download a truth signal and reshape it for plotting
+#'
+#' @param exclude_geos Vector of `geo_value`s to drop from the result.
+#' @param ... Arguments passed unchanged to `download_signal()`.
+#' @return A tibble with the columns `geo_value`, `target_end_date` (the
+#'   downloaded `time_value`, renamed) and `value`.
+get_truth_data <- function(exclude_geos, ...) {
+  download_args <- list(...)
+  truth_data <- do.call(download_signal, download_args) %>%
+    filter(!(.data$geo_value %in% exclude_geos)) %>%
+    # Columns are named with strings here: `.data$col` inside select() and
+    # rename() is deprecated by tidyselect.
+    dplyr::select("geo_value", "time_value", "value") %>%
+    dplyr::rename(target_end_date = "time_value")
+  tibble(truth_data)
+}
+
+#' Build the lower/upper endpoints of the prediction intervals to plot
+#'
+#' @param predictions_cards Data frame with at least the columns `geo_value`,
+#'   `quantile`, `value`, `forecaster`, `forecast_date` and `target_end_date`.
+#' @param intervals Central interval coverages to keep, e.g. `0.9` keeps the
+#'   0.05 and 0.95 quantiles.
+#' @param ... Unused; kept so existing callers keep working.
+#' @return One row per geo / forecaster / forecast date / target date /
+#'   interval, with the endpoints in the columns `lower` and `upper` and the
+#'   interval label (e.g. "90%") in the factor column `interval`.
+get_quantiles_df <- function(predictions_cards, intervals = c(.5, .9), ...) {
+  predictions_cards <- predictions_cards %>%
+    dplyr::select(
+      "geo_value", "quantile", "value", "forecaster", "forecast_date",
+      "target_end_date"
+    )
+
+  # Interval endpoints expressed as integer thousandths (0.05 -> 50), so the
+  # match against the data below does not rely on exact float equality.
+  quantiles_to_plot <- as.integer(sort(
+    round(500L * (1 + intervals %o% c(-1L, 1L)))
+  ))
+
+  quantiles_df <- predictions_cards %>%
+    filter(as.integer(round(.data$quantile * 1000)) %in% quantiles_to_plot) %>%
+    mutate(
+      endpoint_type = if_else(.data$quantile < 0.5, "lower", "upper"),
+      # alp is the two-sided tail mass of the interval this endpoint belongs
+      # to, formatted as text so both endpoints produce the same label.
+      alp = if_else(.data$endpoint_type == "lower",
+        format(2 * .data$quantile, digits = 3, nsmall = 3),
+        format(2 * (1 - .data$quantile), digits = 3, nsmall = 3)
+      ),
+      interval = forcats::fct_rev(
+        paste0((1 - as.numeric(.data$alp)) * 100, "%")
+      )
+    ) %>%
+    select(-all_of(c("quantile", "alp"))) %>%
+    pivot_wider(names_from = "endpoint_type", values_from = "value")
+
+  quantiles_df
+}
+
+#' Extract one point forecast per prediction
+#'
+#' Keeps the rows whose `quantile` is 0.5 or `NA`. When `NA`-quantile rows are
+#' present, their value is preferred and the 0.5 quantile is used only as a
+#' fallback.
+#'
+#' @param predictions_cards Data frame with at least the columns `quantile`
+#'   and `value`.
+#' @return `predictions_cards` reduced to the point forecasts, without the
+#'   `quantile` column.
+get_points_df <- function(predictions_cards) {
+  points_df <- predictions_cards %>%
+    filter(as.integer(round(.data$quantile * 1000)) == 500L |
+      is.na(.data$quantile))
+
+  if (!any(is.na(points_df$quantile))) {
+    return(points_df %>% select(-all_of("quantile")))
+  }
+
+  # Spread the two kinds of point forecast into the columns `NA` and `0.5`.
+  points_df <- points_df %>%
+    pivot_wider(names_from = "quantile", values_from = "value")
+
+  if ("0.5" %in% names(points_df)) {
+    points_df <- points_df %>%
+      mutate(value = if_else(!is.na(.data$`NA`), .data$`NA`, .data$`0.5`)) %>%
+      select(-all_of(c("0.5", "NA")))
+  } else {
+    # Only NA-quantile rows were present, so there is no `0.5` column to fall
+    # back on; referencing it would raise an error.
+    points_df <- points_df %>%
+      rename(value = "NA")
+  }
+
+  points_df
+}
+
+#' Add one ribbon layer per prediction interval to a plot
+#'
+#' @param g A ggplot object to add the layers to.
+#' @param quantiles_df Data frame with a factor column `interval` and the
+#'   columns `lower`, `upper`, `forecast_date` and `forecaster`.
+#' @return `g` with one `geom_ribbon()` layer per level of `interval`; `g`
+#'   unchanged when `interval` has no levels.
+plot_quantiles <- function(g, quantiles_df) {
+  n_quantiles <- nlevels(quantiles_df$interval)
+  l_quantiles <- levels(quantiles_df$interval)
+
+  # Alpha per interval level, looked up by level index. Only three values are
+  # defined, so a fourth level would get NA.
+  alp <- c(.4, .2, .1)
+  # rev(seq_len()) instead of n_quantiles:1, which would iterate over 0 and 1
+  # when there are no interval levels.
+  for (qq in rev(seq_len(n_quantiles))) {
+    g <- g +
+      geom_ribbon(
+        data = quantiles_df %>%
+          filter(.data$interval == l_quantiles[qq]),
+        mapping = aes(
+          ymin = .data$lower,
+          ymax = .data$upper,
+          group = interaction(.data$forecast_date, .data$forecaster),
+          color = NULL
+        ),
+        alpha = alp[qq]
+      )
+  }
+
+  g
+}
+
+#' Add the point forecasts to a plot as small dots
+#'
+#' @param g A ggplot object to add the layer to.
+#' @param points_df Data frame with the columns `value`, `forecast_date` and
+#'   `forecaster`.
+#' @return `g` with one extra `geom_point()` layer, grouped by forecast date
+#'   and forecaster.
+plot_points <- function(g, points_df) {
+  point_mapping <- aes(
+    y = .data$value,
+    group = interaction(.data$forecast_date, .data$forecaster)
+  )
+  point_layer <- geom_point(
+    data = points_df,
+    mapping = point_mapping,
+    size = 0.125
+  )
+
+  g + point_layer
+}
+
+#' Plot state-level forecasts against HHS and JHU truth data
+#'
+#' The JHU case series is rescaled per geo so that its maximum matches the
+#' maximum of the HHS admissions series for the same geo.
+#'
+#' @param predictions_cards Data frame of forecasts; needs the columns read by
+#'   `get_points_df()` and `get_quantiles_df()`.
+#' @param exclude_geos `geo_value`s to drop from the truth data.
+#' @param start_day,offline_signal_dir Passed to `download_signal()` through
+#'   `get_truth_data()`.
+#' @param ncol Number of facet columns.
+#' @return A ggplot object faceted by `geo_value`, or `NULL` when
+#'   `predictions_cards` has no rows.
+plot_state_forecasters <- function(predictions_cards, exclude_geos = c(), start_day = NULL, ncol = 5, offline_signal_dir = NULL) {
+  if (nrow(predictions_cards) == 0) {
+    return(NULL)
+  }
+
+  # Maximum that skips missing values, so a single NA in the truth data does
+  # not blank out the scaling for a whole geo. Returns NA rather than -Inf
+  # when no value is available.
+  max_or_na <- function(x) {
+    if (all(is.na(x))) NA_real_ else as.numeric(max(x, na.rm = TRUE))
+  }
+
+  hosp_truth <- get_truth_data(
+    exclude_geos = exclude_geos,
+    data_source = "hhs",
+    signal = "confirmed_admissions_covid_1d",
+    start_day = start_day,
+    geo_type = "state",
+    offline_signal_dir = offline_signal_dir
+  )
+  hosp_truth$data_source <- "hhs"
+  case_truth <- get_truth_data(
+    exclude_geos = exclude_geos,
+    data_source = "jhu-csse",
+    signal = "confirmed_7dav_incidence_num",
+    start_day = start_day,
+    geo_type = "state",
+    offline_signal_dir = offline_signal_dir
+  )
+  case_truth$data_source <- "jhu"
+
+  hosp_max <- hosp_truth %>%
+    group_by(geo_value) %>%
+    summarize(max_value = max_or_na(value))
+  case_max <- case_truth %>%
+    group_by(geo_value) %>%
+    summarize(max_value = max_or_na(value))
+  # After the join, max_value.2 is the case maximum and max_value.1 the
+  # admissions maximum of the same geo.
+  case_max <- case_max %>%
+    left_join(hosp_max, by = "geo_value", suffix = c(".2", ".1")) %>%
+    mutate(max_ratio = max_value.1 / max_value.2)
+  case_truth <- case_truth %>%
+    left_join(case_max, by = "geo_value") %>%
+    mutate(scaled_value = value * max_ratio)
+  hosp_truth <- hosp_truth %>% mutate(forecaster = "hhs hosp truth")
+  case_truth <- case_truth %>% mutate(forecaster = "jhu cases truth")
+
+  # Setup plot
+  g <- ggplot(hosp_truth, mapping = aes(x = .data$target_end_date, color = .data$forecaster, fill = .data$forecaster))
+
+  points_df <- get_points_df(predictions_cards)
+  g <- plot_points(g, points_df)
+
+  quantiles_df <- get_quantiles_df(predictions_cards)
+  g <- plot_quantiles(g, quantiles_df)
+
+  # Plot truth data by geo
+  g <- g +
+    geom_line(mapping = aes(y = .data$value)) +
+    geom_line(data = case_truth, mapping = aes(x = .data$target_end_date, y = .data$scaled_value)) +
+    facet_wrap(~ .data$geo_value, scales = "free_y", ncol = ncol, drop = TRUE) +
+    theme(legend.position = "top", legend.text = element_text(size = 7))
+
+  g
+}
+
+#' Plot national forecasts against HHS and JHU truth data
+#'
+#' The JHU case series is rescaled so that its maximum matches the maximum of
+#' the HHS admissions series.
+#'
+#' @param predictions_cards Data frame of forecasts; needs the columns read by
+#'   `get_points_df()` and `get_quantiles_df()`.
+#' @param exclude_geos `geo_value`s to drop from the truth data.
+#' @param start_day,offline_signal_dir Passed to `download_signal()` through
+#'   `get_truth_data()`.
+#' @param ncol Not used by this function; kept so the signature matches
+#'   `plot_state_forecasters()`.
+#' @return A ggplot object, or `NULL` when `predictions_cards` has no rows.
+plot_nation_forecasters <- function(predictions_cards, exclude_geos = c(), start_day = NULL, ncol = 5, offline_signal_dir = NULL) {
+  if (nrow(predictions_cards) == 0) {
+    return(NULL)
+  }
+
+  # Maximum that skips missing values, so a single NA in the truth data does
+  # not turn the whole rescaled series into NA. Returns NA rather than -Inf
+  # when no value is available.
+  max_or_na <- function(x) {
+    if (all(is.na(x))) NA_real_ else as.numeric(max(x, na.rm = TRUE))
+  }
+
+  hosp_truth <- get_truth_data(
+    exclude_geos = exclude_geos,
+    data_source = "hhs",
+    signal = "confirmed_admissions_covid_1d",
+    start_day = start_day,
+    geo_type = "nation",
+    offline_signal_dir = offline_signal_dir
+  )
+  hosp_truth$data_source <- "hhs"
+  case_truth <- get_truth_data(
+    exclude_geos = exclude_geos,
+    data_source = "jhu-csse",
+    signal = "confirmed_7dav_incidence_num",
+    start_day = start_day,
+    geo_type = "nation",
+    offline_signal_dir = offline_signal_dir
+  )
+  case_truth$data_source <- "jhu"
+
+  hosp_peak <- max_or_na(hosp_truth$value)
+  case_peak <- max_or_na(case_truth$value)
+  case_truth <- case_truth %>%
+    mutate(scaled_value = value * hosp_peak / case_peak)
+
+  # Setup plot
+  g <- ggplot(hosp_truth, mapping = aes(x = .data$target_end_date))
+
+  quantiles_df <- get_quantiles_df(predictions_cards)
+  g <- plot_quantiles(g, quantiles_df)
+
+  points_df <- get_points_df(predictions_cards)
+  g <- plot_points(g, points_df)
+
+  # Plot the national truth series
+  # NOTE(review): only `color` is mapped in this function, so
+  # labs(fill = ...) looks like a no-op -- confirm whether the title was meant
+  # for the colour legend.
+  g <- g +
+    geom_line(mapping = aes(y = .data$value, color = "confirmed admissions")) +
+    geom_line(data = case_truth, mapping = aes(x = .data$target_end_date, y = .data$scaled_value, color = "7day case sum")) +
+    labs(fill = "Reported Signal") +
+    theme(legend.position = "top", legend.text = element_text(size = 7))
+
+  g
+}
diff --git a/covid_hosp_prod/.gitignore b/covid_hosp_prod/.gitignore
@@ -0,0 +1,6 @@
+*
+!.gitignore
+!meta
+!*.R
+meta/*
+# !meta/meta
diff --git a/scripts/covid_hosp_prod.R b/scripts/covid_hosp_prod.R
@@ -1 +1,92 @@
-# TODO
+# The COVID Hospitalization Production Forecasting Pipeline.
+#
+# Ran into some issues with targets:
+#   https://github.com/ropensci/targets/discussions/666#discussioncomment-9050772
+
+source("scripts/targets-common.R")
+
+
+#' Look up the geo exclusions recorded for one forecast date
+#'
+#' @param date A date; it is matched against the JSON keys through
+#'   `as.character(date)`.
+#' @param exclusions_json A JSON file with exclusions in the format:
+#'
+#'    {"exclusions": {"2024-03-24": "ak,hi"}}
+#' @return The entry stored under `date`, or `""` when the file has no entry
+#'   for that date.
+get_exclusions <- function(date, exclusions_json = here::here("scripts", "geo_exclusions.json")) {
+  all_exclusions <- jsonlite::read_json(exclusions_json)$exclusions
+  entry <- all_exclusions[[as.character(date)]]
+  if (is.null(entry)) {
+    ""
+  } else {
+    entry
+  }
+}
+
+# Weekly forecast dates from 2024-01-01 up to today, as "YYYY-MM-DD" strings.
+# NOTE(review): 2024-01-01 is a Monday, so every date generated here is a
+# Monday, while the only key in scripts/geo_exclusions.json ("2024-03-24") is
+# a Sunday and can never match -- confirm which side should change.
+forecast_generation_date <- as.character(seq.Date(as.Date("2024-01-01"), Sys.Date(), by = "1 week"))
+# vapply() instead of Vectorize(): the result is always a character vector
+# (named by date) with one string per date, and an entry that is not a single
+# string fails here instead of silently turning the result into a list.
+geo_exclusions <- vapply(forecast_generation_date, get_exclusions, character(1))
+
+# One row per forecast date, used as the `values` grid of tar_map() below.
+tib1 <- tidyr::expand_grid(
+  tibble(
+    forecast_generation_date = forecast_generation_date,
+    geo_exclusions = geo_exclusions
+  )
+)
+
+rlang::list2( # the pipeline: this list of targets is the script's last expression
+  tar_target(
+    aheads,
+    command = {
+      c(1:7) # forecast horizons 1 through 7; the forecast target maps over these
+    }
+  ),
+  tar_map( # static branching: the targets below are created once per row of tib1
+    values = tib1,
+    names = "forecast_generation_date", # target names get the forecast date as suffix
+    tar_target(
+      hhs_latest_data,
+      command = {
+        epidatr::pub_covidcast(
+          source = "hhs",
+          signals = "confirmed_admissions_covid_1d",
+          geo_type = "state",
+          time_type = "day",
+          geo_values = "*",
+          time_values = epidatr::epirange(from = "2020-01-01", to = forecast_generation_date),
+          as_of = forecast_generation_date, # request the data as of the forecast date
+          fetch_args = epidatr::fetch_args_list(return_empty = TRUE, timeout_seconds = 400)
+        ) %>%
+          select(geo_value, time_value, value, issue) %>%
+          rename("hhs" := value) %>% # outcome column is named "hhs", matching outcome = "hhs" below
+          rename(version = issue)
+      }
+    ),
+    tar_target(
+      forecast,
+      command = {
+        hhs_latest_data %>%
+          as_epi_df() %>%
+          smoothed_scaled(outcome = "hhs", ahead = aheads)
+      },
+      pattern = map(aheads) # dynamic branching: one branch per element of aheads
+    ),
+    tar_target(
+      forecast_with_exclusions,
+      command = {
+        forecast %>% filter(!geo_value %in% strsplit(geo_exclusions, ",")[[1]]) # geo_exclusions is a comma-separated string; "" excludes nothing
+      }
+    ),
+    tar_target(
+      notebook,
+      command = {
+        rmarkdown::render(
+          "scripts/covid_hosp_prod.Rmd",
+          output_file = here::here(
+            "reports",
+            sprintf("covid_hosp_prod_%s.html", forecast_generation_date) # one report file per forecast date
+          ),
+          params = list(
+            forecast = forecast # NOTE(review): unfiltered forecast; no target in this list consumes forecast_with_exclusions, and forecast_generation_date is not passed -- confirm intent
+          )
+        )
+      }
+    )
+  )
+)
diff --git a/scripts/covid_hosp_prod.Rmd b/scripts/covid_hosp_prod.Rmd
@@ -0,0 +1,44 @@
+---
+title: COVID Forecaster Predictions
+author: COVID Forecast Team
+date: "Rendered: `r format(Sys.time(), '%d %B %Y')`"
+output:
+  html_document:
+    toc: True
+    # self_contained: False
+    # lib_dir: libs
+params:
+  forecast_generation_date: !r Sys.Date()
+  forecast: ""
+---
+
+```{css, echo=FALSE}
+body {
+  display: block;
+  max-width: 1280px !important;
+  margin-left: auto;
+  margin-right: auto;
+}
+
+body .main-container {
+  max-width: 1280px !important;
+  width: 1280px !important;
+}
+```
+
+```{r setup, include=FALSE}
+library(dplyr)
+library(evalcast)
+library(here)
+library(magrittr)
+library(rlang)
+library(targets)
+library(tidyr)
+source(here("R", "plotting.R")) # load the plotting helpers from R/plotting.R
+
+forecast <- params$forecast # the `forecast` report parameter; "" unless the caller supplies one
+```
+
+```{r}
+print(forecast) # print the forecast object into the report as-is
+```
diff --git a/scripts/geo_exclusions.json b/scripts/geo_exclusions.json
@@ -0,0 +1,6 @@
+{
+    "exclusions":
+    {
+        "2024-03-24": "ak,ca"
+    }
+}

-Original file line number
+Diff line change
@@ @@ -0,0 +1,6 @@ @@
 +*
 +!.gitignore
 +!meta
 +!*.R
 +meta/*
 +# !meta/meta