wip: production pipeline

dshemetov · dshemetov · commit 84909526fd3c · 2024-04-09T18:12:04.000-07:00
diff --git a/R/plotting.R b/R/plotting.R
@@ -1,26 +1,18 @@
 library(dplyr)
-library(evalcast)
 library(ggplot2)
 library(magrittr)
 library(tidyr)
 
 
-get_truth_data <- function(exclude_geos, ...) {
-  download_args <- list(...)
-  truth_data <- do.call(download_signal, download_args)
-  truth_data %<>% filter(!(.data$geo_value %in% exclude_geos))
-  truth_data <- truth_data %>%
-    dplyr::select(.data$geo_value, .data$time_value, .data$value) %>%
-    dplyr::rename(target_end_date = .data$time_value)
-  return(truth_data %>% tibble())
-}
-
 get_quantiles_df <- function(predictions_cards, intervals = c(.5, .9), ...) {
   predictions_cards <- predictions_cards %>%
     dplyr::select(
-      .data$geo_value, .data$quantile,
-      .data$value, .data$forecaster, .data$forecast_date,
-      .data$target_end_date
+      geo_value,
+      quantile,
+      value,
+      forecaster,
+      forecast_date,
+      target_end_date
     )
 
   lower_bounds <- predictions_cards %>%
@@ -103,15 +95,36 @@ plot_points <- function(g, points_df) {
   return(g)
 }
 
-plot_state_forecasters <- function(predictions_cards, exclude_geos = c(), start_day = NULL, ncol = 5, offline_signal_dir = NULL) {
+plot_state_forecasters <- function(predictions_cards, exclude_geos = c(), start_day = NULL, ncol = 5) {
   if (nrow(predictions_cards) == 0) {
     return(NULL)
   }
 
-  td1 <- get_truth_data(exclude_geos = exclude_geos, data_source = "hhs", signal = "confirmed_admissions_covid_1d", start_day = start_day, geo_type = "state", offline_signal_dir = offline_signal_dir)
-  td1$data_source <- "hhs"
-  td2 <- get_truth_data(exclude_geos = exclude_geos, data_source = "jhu-csse", signal = "confirmed_7dav_incidence_num", start_day = start_day, geo_type = "state", offline_signal_dir = offline_signal_dir)
-  td2$data_source <- "jhu"
+  td1 <- epidatr::pub_covidcast(
+    source = "hhs",
+    signals = "confirmed_admissions_covid_1d",
+    geo_type = "state",
+    time_type = "day",
+    geo_values = "*",
+    time_values = epidatr::epirange(start_day, Sys.Date())
+  ) %>%
+    filter(!(.data$geo_value %in% exclude_geos)) %>%
+    dplyr::select(.data$geo_value, .data$time_value, .data$value) %>%
+    dplyr::rename(target_end_date = .data$time_value) %>%
+    mutate(data_source = "hhs")
+  td2 <- epidatr::pub_covidcast(
+    source = "jhu-csse",
+    signals = "confirmed_7dav_incidence_num",
+    geo_type = "state",
+    time_type = "day",
+    geo_values = "*",
+    time_values = epidatr::epirange(start_day, Sys.Date())
+  ) %>%
+    filter(!(.data$geo_value %in% exclude_geos)) %>%
+    dplyr::select(.data$geo_value, .data$time_value, .data$value) %>%
+    dplyr::rename(target_end_date = .data$time_value) %>%
+    mutate(data_source = "jhu")
+
   td1.max <- td1 %>%
     group_by(geo_value) %>%
     summarize(max_value = max(value))
@@ -129,14 +142,8 @@ plot_state_forecasters <- function(predictions_cards, exclude_geos = c(), start_
 
   # Setup plot
   g <- ggplot(td1, mapping = aes(x = .data$target_end_date, color = .data$forecaster, fill = .data$forecaster))
-
-  points_df <- get_points_df(predictions_cards)
-  g <- plot_points(g, points_df)
-
-  quantiles_df <- get_quantiles_df(predictions_cards)
-  g <- plot_quantiles(g, quantiles_df)
-
-  # Plot truth data by geo
+  g <- plot_points(g, get_points_df(predictions_cards))
+  g <- plot_quantiles(g, get_quantiles_df(predictions_cards))
   g <- g +
     geom_line(mapping = aes(y = .data$value)) +
     geom_line(data = td2, mapping = aes(x = .data$target_end_date, y = .data$scaled_value)) +
@@ -146,34 +153,47 @@ plot_state_forecasters <- function(predictions_cards, exclude_geos = c(), start_
   return(g)
 }
 
-plot_nation_forecasters <- function(predictions_cards, exclude_geos = c(), start_day = NULL, ncol = 5, offline_signal_dir = NULL) {
+plot_nation_forecasters <- function(predictions_cards, exclude_geos = c(), start_day = NULL, ncol = 5) {
   if (nrow(predictions_cards) == 0) {
     return(NULL)
   }
 
-  td1 <- get_truth_data(exclude_geos = exclude_geos, data_source = "hhs", signal = "confirmed_admissions_covid_1d", start_day = start_day, geo_type = "nation", offline_signal_dir = offline_signal_dir)
-  td1$data_source <- "hhs"
-  td2 <- get_truth_data(exclude_geos = exclude_geos, data_source = "jhu-csse", signal = "confirmed_7dav_incidence_num", start_day = start_day, geo_type = "nation", offline_signal_dir = offline_signal_dir)
-  td2$data_source <- "jhu"
+  td1 <- epidatr::pub_covidcast(
+    source = "hhs",
+    signals = "confirmed_admissions_covid_1d",
+    geo_type = "nation",
+    time_type = "day",
+    geo_values = "*",
+    time_values = epidatr::epirange(start_day, Sys.Date())
+  ) %>%
+    filter(!(.data$geo_value %in% exclude_geos)) %>%
+    dplyr::select(.data$time_value, .data$value) %>%
+    dplyr::rename(target_end_date = .data$time_value) %>%
+    mutate(data_source = "hhs")
+  td2 <- epidatr::pub_covidcast(
+    source = "jhu-csse",
+    signals = "confirmed_7dav_incidence_num",
+    geo_type = "nation",
+    time_type = "day",
+    geo_values = "*",
+    time_values = epidatr::epirange(start_day, Sys.Date())
+  ) %>%
+    filter(!(.data$geo_value %in% exclude_geos)) %>%
+    dplyr::select(.data$time_value, .data$value) %>%
+    dplyr::rename(target_end_date = .data$time_value) %>%
+    mutate(data_source = "jhu")
   td1.max <- td1 %>%
     summarize(max_value = max(value)) %>%
     pull(max_value)
   td2.max <- td2 %>%
     summarize(max_value = max(value)) %>%
     pull(max_value)
-  td2 <- td2 %>%
-    mutate(scaled_value = value * td1.max / td2.max)
+  td2 <- td2 %>% mutate(scaled_value = value * td1.max / td2.max)
 
   # Setup plot
   g <- ggplot(td1, mapping = aes(x = .data$target_end_date))
-
-  quantiles_df <- get_quantiles_df(predictions_cards)
-  g <- plot_quantiles(g, quantiles_df)
-
-  points_df <- get_points_df(predictions_cards)
-  g <- plot_points(g, points_df)
-
-  # Plot truth data by geo
+  g <- plot_quantiles(g, get_quantiles_df(predictions_cards))
+  g <- plot_points(g, get_points_df(predictions_cards))
   g <- g +
     geom_line(mapping = aes(y = .data$value, color = "confirmed admissions")) +
     geom_line(data = td2, mapping = aes(x = .data$target_end_date, y = .data$scaled_value, color = "7day case sum")) +
diff --git a/scripts/covid_hosp_prod.Rmd b/scripts/covid_hosp_prod.Rmd
@@ -28,11 +28,9 @@ body .main-container {
 
 ```{r setup, include=FALSE}
 library(dplyr)
-library(evalcast)
 library(here)
 library(magrittr)
 library(rlang)
-library(targets)
 library(tidyr)
 source(here("R", "plotting.R"))