feat: add covid hosp prod pipeline

dshemetov · dshemetov · commit 4481a3bf1d8c · 2024-04-10T17:44:44.000-07:00
diff --git a/.gitignore b/.gitignore
@@ -9,3 +9,4 @@ scripts/**.html
 nohup.out
 run.Rout
 tmp.R
+reports/
diff --git a/R/plotting.R b/R/plotting.R
@@ -0,0 +1,204 @@
+library(dplyr)
+library(ggplot2)
+library(magrittr)
+library(tidyr)
+
+
+#' Build a wide data frame of prediction-interval bounds for plotting
+#'
+#' Keeps only the quantile rows that are endpoints of the requested central
+#' intervals and reshapes them so each interval has a `lower` and an `upper`
+#' column.
+#'
+#' @param predictions_cards Data frame with columns `geo_value`, `quantile`,
+#'   `value`, `forecaster`, `forecast_date` and `target_end_date`.
+#' @param intervals Central interval coverages to keep; e.g. `0.9` keeps the
+#'   0.05 and 0.95 quantiles.
+#' @param ... Unused; accepted so existing callers keep working.
+#'
+#' @return A data frame with an `interval` factor (labels such as "90%") and
+#'   `lower` / `upper` columns holding the interval endpoints.
+get_quantiles_df <- function(predictions_cards, intervals = c(.5, .9), ...) {
+  predictions_cards <- predictions_cards %>%
+    dplyr::select(
+      geo_value,
+      quantile,
+      value,
+      forecaster,
+      forecast_date,
+      target_end_date
+    )
+
+  # Interval endpoints expressed as integer per-mille (e.g. 50, 250, 750, 950)
+  # so quantile levels can be matched without comparing floats.
+  quantiles_to_plot <- as.integer(sort(
+    round(500L * (1 + intervals %o% c(-1L, 1L)))
+  ))
+
+  predictions_cards %>%
+    filter(as.integer(round(.data$quantile * 1000)) %in% quantiles_to_plot) %>%
+    mutate(
+      endpoint_type = if_else(.data$quantile < 0.5, "lower", "upper"),
+      # alp is the two-sided miscoverage (1 - coverage) as a formatted string
+      alp = if_else(.data$endpoint_type == "lower",
+        format(2 * .data$quantile, digits = 3, nsmall = 3),
+        format(2 * (1 - .data$quantile), digits = 3, nsmall = 3)
+      ),
+      interval = forcats::fct_rev(
+        paste0((1 - as.numeric(.data$alp)) * 100, "%")
+      )
+    ) %>%
+    # Bare names here: `.data$` inside select() is deprecated by tidyselect
+    select(-quantile, -alp) %>%
+    pivot_wider(names_from = "endpoint_type", values_from = "value")
+}
+
+#' Extract one point forecast per prediction row group
+#'
+#' Keeps rows whose quantile is 0.5 (the median) or `NA` (an explicit point
+#' forecast). When explicit point forecasts are present they take precedence
+#' over the median for the same key.
+#'
+#' @param predictions_cards Data frame with at least `quantile` and `value`
+#'   columns.
+#'
+#' @return The filtered data frame without the `quantile` column and with a
+#'   single `value` column holding the point forecast.
+get_points_df <- function(predictions_cards) {
+  points_df <- predictions_cards %>%
+    filter(as.integer(round(.data$quantile * 1000)) == 500L |
+      is.na(.data$quantile))
+
+  if (!anyNA(points_df$quantile)) {
+    return(points_df %>% select(-quantile))
+  }
+
+  # pivot_wider() names the new columns after the quantile values, i.e. "NA"
+  # for explicit point forecasts and "0.5" for medians.
+  wide_df <- points_df %>%
+    pivot_wider(names_from = "quantile", values_from = "value")
+
+  # With point forecasts but no median rows there is no "0.5" column; fall
+  # back to the point forecasts alone instead of failing on a missing column.
+  if (!("0.5" %in% names(wide_df))) {
+    wide_df[["0.5"]] <- wide_df[["NA"]]
+  }
+
+  wide_df %>%
+    mutate(value = coalesce(.data$`NA`, .data$`0.5`)) %>%
+    # all_of() instead of the deprecated `.data$` inside select()
+    select(-all_of(c("0.5", "NA")))
+}
+
+#' Add one shaded ribbon layer per prediction interval to a plot
+#'
+#' @param g A ggplot object whose base mapping supplies the `x` aesthetic.
+#' @param quantiles_df Data frame with an `interval` factor and `lower`,
+#'   `upper`, `forecast_date` and `forecaster` columns, as produced by
+#'   `get_quantiles_df()`.
+#'
+#' @return `g` with one `geom_ribbon()` layer added per level of `interval`;
+#'   `g` unchanged when `interval` has no levels.
+plot_quantiles <- function(g, quantiles_df) {
+  interval_levels <- levels(quantiles_df$interval)
+  ribbon_alphas <- c(.4, .2, .1)
+
+  # rev(seq_along()) is empty when there are no levels, whereas `n:1` would
+  # iterate over c(0, 1) and add layers for non-existent intervals.
+  for (qq in rev(seq_along(interval_levels))) {
+    g <- g +
+      geom_ribbon(
+        data = quantiles_df %>%
+          filter(.data$interval == interval_levels[qq]),
+        mapping = aes(
+          ymin = .data$lower,
+          ymax = .data$upper,
+          group = interaction(.data$forecast_date, .data$forecaster),
+          color = NULL
+        ),
+        # Intervals past the third reuse the last alpha rather than indexing
+        # off the end of the vector and getting NA.
+        alpha = ribbon_alphas[min(qq, length(ribbon_alphas))]
+      )
+  }
+
+  g
+}
+
+#' Overlay point forecasts on an existing plot
+#'
+#' @param g A ggplot object whose base mapping supplies the `x` aesthetic.
+#' @param points_df Data frame with `value`, `forecast_date` and `forecaster`
+#'   columns.
+#'
+#' @return `g` with a `geom_point()` layer added, grouped by forecast date and
+#'   forecaster.
+plot_points <- function(g, points_df) {
+  point_layer <- geom_point(
+    data = points_df,
+    mapping = aes(
+      y = .data$value,
+      group = interaction(.data$forecast_date, .data$forecaster)
+    ),
+    size = 0.125
+  )
+
+  g + point_layer
+}
+
+#' Plot state-level forecasts against HHS admissions and rescaled JHU cases
+#'
+#' Fetches the HHS `confirmed_admissions_covid_1d` and JHU-CSSE
+#' `confirmed_7dav_incidence_num` state signals through epidatr, rescales the
+#' case series per state so its maximum matches the admissions maximum, and
+#' draws both with the forecast points and interval ribbons, one facet per
+#' state.
+#'
+#' @param predictions_cards Data frame of forecasts accepted by
+#'   `get_points_df()` and `get_quantiles_df()`.
+#' @param exclude_geos Character vector of `geo_value`s to drop from the truth
+#'   data.
+#' @param start_day First day of truth data to fetch; passed to
+#'   `epidatr::epirange()`.
+#' @param ncol Number of facet columns.
+#'
+#' @return A ggplot object, or `NULL` when `predictions_cards` has no rows.
+plot_state_forecasters <- function(predictions_cards, exclude_geos = c(), start_day = NULL, ncol = 5) {
+  if (nrow(predictions_cards) == 0) {
+    return(NULL)
+  }
+
+  # NOTE(review): `start_day` defaults to NULL but goes straight into
+  # epidatr::epirange(); callers presumably always supply it -- confirm.
+
+  # Both truth signals are fetched and reshaped identically.
+  fetch_truth <- function(data_source_name, signal_name, source_label) {
+    epidatr::pub_covidcast(
+      source = data_source_name,
+      signals = signal_name,
+      geo_type = "state",
+      time_type = "day",
+      geo_values = "*",
+      time_values = epidatr::epirange(start_day, Sys.Date())
+    ) %>%
+      filter(!(.data$geo_value %in% exclude_geos)) %>%
+      # Strings instead of `.data$`, which is deprecated inside select()/rename()
+      dplyr::select("geo_value", "time_value", "value") %>%
+      dplyr::rename(target_end_date = "time_value") %>%
+      mutate(data_source = source_label)
+  }
+
+  # Maximum over observed values; a single missing report must not turn the
+  # whole state's scaling factor into NA.
+  max_observed <- function(x) {
+    if (all(is.na(x))) NA_real_ else max(x, na.rm = TRUE)
+  }
+
+  hosp_truth <- fetch_truth("hhs", "confirmed_admissions_covid_1d", "hhs")
+  case_truth <- fetch_truth("jhu-csse", "confirmed_7dav_incidence_num", "jhu")
+
+  hosp_max <- hosp_truth %>%
+    group_by(geo_value) %>%
+    summarize(max_value = max_observed(.data$value))
+  case_max <- case_truth %>%
+    group_by(geo_value) %>%
+    summarize(max_value = max_observed(.data$value))
+
+  # Per-state factor that puts cases on the admissions scale
+  scale_factors <- case_max %>%
+    left_join(hosp_max, by = "geo_value", suffix = c(".2", ".1")) %>%
+    mutate(max_ratio = max_value.1 / max_value.2)
+
+  case_truth <- case_truth %>%
+    left_join(scale_factors, by = "geo_value") %>%
+    mutate(
+      scaled_value = value * max_ratio,
+      forecaster = "jhu cases truth"
+    )
+  hosp_truth <- hosp_truth %>% mutate(forecaster = "hhs hosp truth")
+
+  # Setup plot
+  g <- ggplot(
+    hosp_truth,
+    mapping = aes(x = .data$target_end_date, color = .data$forecaster, fill = .data$forecaster)
+  )
+  g <- plot_points(g, get_points_df(predictions_cards))
+  g <- plot_quantiles(g, get_quantiles_df(predictions_cards))
+
+  g +
+    geom_line(mapping = aes(y = .data$value)) +
+    geom_line(data = case_truth, mapping = aes(x = .data$target_end_date, y = .data$scaled_value)) +
+    facet_wrap(~ .data$geo_value, scales = "free_y", ncol = ncol, drop = TRUE) +
+    theme(legend.position = "top", legend.text = element_text(size = 7))
+}
+
+#' Plot national forecasts against HHS admissions and rescaled JHU cases
+#'
+#' Fetches the HHS `confirmed_admissions_covid_1d` and JHU-CSSE
+#' `confirmed_7dav_incidence_num` nation-level signals through epidatr,
+#' rescales the case series so its maximum matches the admissions maximum, and
+#' draws both with the forecast interval ribbons and points.
+#'
+#' @param predictions_cards Data frame of forecasts accepted by
+#'   `get_points_df()` and `get_quantiles_df()`.
+#' @param exclude_geos Character vector of `geo_value`s to drop from the truth
+#'   data.
+#' @param start_day First day of truth data to fetch; passed to
+#'   `epidatr::epirange()`.
+#' @param ncol Unused here (the plot is not faceted); kept so the signature
+#'   matches `plot_state_forecasters()`.
+#'
+#' @return A ggplot object, or `NULL` when `predictions_cards` has no rows.
+plot_nation_forecasters <- function(predictions_cards, exclude_geos = c(), start_day = NULL, ncol = 5) {
+  if (nrow(predictions_cards) == 0) {
+    return(NULL)
+  }
+
+  # NOTE(review): `start_day` defaults to NULL but goes straight into
+  # epidatr::epirange(); callers presumably always supply it -- confirm.
+
+  # Both truth signals are fetched and reshaped identically.
+  fetch_truth <- function(data_source_name, signal_name, source_label) {
+    epidatr::pub_covidcast(
+      source = data_source_name,
+      signals = signal_name,
+      geo_type = "nation",
+      time_type = "day",
+      geo_values = "*",
+      time_values = epidatr::epirange(start_day, Sys.Date())
+    ) %>%
+      filter(!(.data$geo_value %in% exclude_geos)) %>%
+      # Strings instead of `.data$`, which is deprecated inside select()/rename()
+      dplyr::select("time_value", "value") %>%
+      dplyr::rename(target_end_date = "time_value") %>%
+      mutate(data_source = source_label)
+  }
+
+  # Maximum over observed values; one missing report must not turn the scaling
+  # factor (and with it the whole case series) into NA.
+  max_observed <- function(x) {
+    if (all(is.na(x))) NA_real_ else max(x, na.rm = TRUE)
+  }
+
+  hosp_truth <- fetch_truth("hhs", "confirmed_admissions_covid_1d", "hhs")
+  case_truth <- fetch_truth("jhu-csse", "confirmed_7dav_incidence_num", "jhu")
+
+  hosp_max <- max_observed(hosp_truth$value)
+  case_max <- max_observed(case_truth$value)
+  case_truth <- case_truth %>% mutate(scaled_value = value * hosp_max / case_max)
+
+  # Setup plot
+  g <- ggplot(hosp_truth, mapping = aes(x = .data$target_end_date))
+  g <- plot_quantiles(g, get_quantiles_df(predictions_cards))
+  g <- plot_points(g, get_points_df(predictions_cards))
+
+  g +
+    geom_line(mapping = aes(y = .data$value, color = "confirmed admissions")) +
+    geom_line(
+      data = case_truth,
+      mapping = aes(x = .data$target_end_date, y = .data$scaled_value, color = "7day case sum")
+    ) +
+    # The only legend comes from the `color` mapping, so the title must be set
+    # on `color`; `fill` is kept for backward compatibility.
+    labs(color = "Reported Signal", fill = "Reported Signal") +
+    theme(legend.position = "top", legend.text = element_text(size = 7))
+}
diff --git a/covid_hosp_prod/.gitignore b/covid_hosp_prod/.gitignore
@@ -0,0 +1,6 @@
+*
+!.gitignore
+!meta
+!*.R
+meta/*
+# !meta/meta
diff --git a/scripts/covid_hosp_prod.R b/scripts/covid_hosp_prod.R
@@ -1 +1,90 @@
-# TODO
+# The COVID Hospitalization Production Forecasting Pipeline.
+#
+# Ran into some issues with targets:
+#   https://github.com/ropensci/targets/discussions/666#discussioncomment-9050772
+
+source("scripts/targets-common.R")
+
+
+#' Look up the geo exclusions recorded for a date
+#'
+#' @param date A date (or date string); converted with `as.character()` and
+#'   used as the lookup key.
+#' @param exclusions_json Path to a JSON file with exclusions in the format:
+#'
+#'    {"exclusions": {"2024-03-24": "ak,hi"}}
+#'
+#' @return The value stored under `date` in the file's `exclusions` object, or
+#'   `""` when there is no entry for that date.
+get_exclusions <- function(date, exclusions_json = here::here("scripts", "geo_exclusions.json")) {
+  all_exclusions <- jsonlite::read_json(exclusions_json)$exclusions
+  date_entry <- all_exclusions[[as.character(date)]]
+
+  if (is.null(date_entry)) {
+    return("")
+  }
+  date_entry
+}
+
+# Weekly forecast dates from 2024-01-01 up to today, as "YYYY-MM-DD" strings.
+forecast_generation_date <- as.character(seq.Date(as.Date("2024-01-01"), Sys.Date(), by = "1 week"))
+# One comma-separated exclusion string per forecast date, named by date.
+# vapply() is type-stable: a malformed (non-string) JSON entry fails here
+# instead of silently turning the result into a list.
+geo_exclusions <- vapply(forecast_generation_date, get_exclusions, character(1))
+
+# Target list for the pipeline: a shared `aheads` target plus one group of
+# targets per forecast generation date, created by tar_map().
+rlang::list2(
+  # Forecast horizons; the `forecast` target below branches over these values.
+  tar_target(
+    aheads,
+    command = {
+      c(1:7)
+    }
+  ),
+  # The targets below are created once per row of `values`, with the row's
+  # `forecast_generation_date` and `geo_exclusions` available in the commands;
+  # `names` selects the column used to name the generated targets.
+  tar_map(
+    values = tidyr::expand_grid(
+      tibble(
+        forecast_generation_date = forecast_generation_date,
+        geo_exclusions = geo_exclusions
+      )
+    ),
+    names = "forecast_generation_date",
+    # State-level HHS admissions from 2020-01-01 through the forecast date,
+    # requested `as_of` the forecast date; `value` is renamed to `hhs` and
+    # `issue` to `version`.
+    tar_target(
+      hhs_latest_data,
+      command = {
+        epidatr::pub_covidcast(
+          source = "hhs",
+          signals = "confirmed_admissions_covid_1d",
+          geo_type = "state",
+          time_type = "day",
+          geo_values = "*",
+          time_values = epidatr::epirange(from = "2020-01-01", to = forecast_generation_date),
+          as_of = forecast_generation_date,
+          fetch_args = epidatr::fetch_args_list(return_empty = TRUE, timeout_seconds = 400)
+        ) %>%
+          select(geo_value, time_value, value, issue) %>%
+          rename("hhs" := value) %>%
+          rename(version = issue)
+      }
+    ),
+    # One branch per element of `aheads` (pattern = map(aheads)).
+    # `smoothed_scaled()` and `as_epi_df()` are not defined in this script;
+    # presumably they come in via scripts/targets-common.R -- confirm.
+    tar_target(
+      forecast,
+      command = {
+        hhs_latest_data %>%
+          as_epi_df() %>%
+          smoothed_scaled(outcome = "hhs", ahead = aheads)
+      },
+      pattern = map(aheads)
+    ),
+    # Drops the geo_values listed in this date's comma-separated
+    # `geo_exclusions` string.
+    tar_target(
+      forecast_with_exclusions,
+      command = {
+        forecast %>% filter(!geo_value %in% strsplit(geo_exclusions, ",")[[1]])
+      }
+    ),
+    # Renders the report to reports/covid_hosp_prod_<date>.html.
+    # NOTE(review): the report receives `forecast`, not
+    # `forecast_with_exclusions`, so the exclusions above do not reach the
+    # notebook and nothing else consumes that target -- confirm this is intended.
+    tar_target(
+      notebook,
+      command = {
+        rmarkdown::render(
+          "scripts/covid_hosp_prod.Rmd",
+          output_file = here::here(
+            "reports",
+            sprintf("covid_hosp_prod_%s.html", forecast_generation_date)
+          ),
+          params = list(
+            forecast = forecast
+          )
+        )
+      }
+    )
+  )
+)
diff --git a/scripts/covid_hosp_prod.Rmd b/scripts/covid_hosp_prod.Rmd
diff --git a/scripts/geo_exclusions.json b/scripts/geo_exclusions.json

-Original file line number
+Diff line change
@@ @@ -0,0 +1,6 @@ @@
 +*
 +!.gitignore
 +!meta
 +!*.R
 +meta/*
 +# !meta/meta