cmu-delphi · dajmcdon · Jun 6, 2022 · May 31, 2022 · May 31, 2022
@@ -9,3 +9,4 @@
 ^docs$
 ^pkgdown$
 ^musings$
+^data-raw$
@@ -42,3 +42,6 @@ Config/testthat/edition: 3
 Encoding: UTF-8
 Roxygen: list(markdown = TRUE)
 RoxygenNote: 7.2.0
+Depends: 
+    R (>= 2.10)
+LazyData: true
@@ -20,6 +20,7 @@ export(df_mat_mul)
 export(epi_keys)
 export(epi_recipe)
 export(get_precision)
+export(get_test_data)
 export(grab_names)
 export(knn_iteraive_ar_args_list)
 export(knn_iteraive_ar_forecaster)

@@ -0,0 +1,35 @@
+#' Subset of JHU daily state cases and deaths
+#'
+#' This data source of confirmed COVID-19 cases and deaths
+#' is based on reports made available by the Center for
+#' Systems Science and Engineering at Johns Hopkins University.
+#' This example data ranges from Dec 31, 2020 to Dec 31, 2021,
+#' and includes all states.
+#'
+#' @format A tibble with 20,496 rows and 4 variables:
+#' \describe{
+#'   \item{geo_value}{the geographic value associated with each row
+#'       of measurements.}
+#'   \item{time_value}{the time value associated with each row of measurements.}
+#'   \item{case_rate}{7-day average signal of number of new
+#'       confirmed COVID-19 cases per 100,000 population, daily}
+#'   \item{death_rate}{7-day average signal of number of new confirmed
+#'       deaths due to COVID-19 per 100,000 population, daily}
+#' }
+#' @source This object contains a modified part of the
+#'   \href{https://github.com/CSSEGISandData/COVID-19}{COVID-19 Data Repository by the Center for Systems Science and Engineering (CSSE) at Johns Hopkins University}
+#'   as \href{https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html}{republished in the COVIDcast Epidata API}.
+#'   This data set is licensed under the terms of the
+#'   \href{https://creativecommons.org/licenses/by/4.0/}{Creative Commons Attribution 4.0 International license}
+#'   by the Johns Hopkins University on behalf of its Center for Systems Science
+#'   in Engineering. Copyright Johns Hopkins University 2020.
+#'
+#' Modifications:
+#' * \href{https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html}{From the COVIDcast Epidata API}:
+#'   These signals are taken directly from the JHU CSSE
+#'   \href{https://github.com/CSSEGISandData/COVID-19}{COVID-19 GitHub repository}
+#'   without changes. The 7-day average signals are computed by Delphi by
+#'   calculating moving averages of the preceding 7 days, so the signal for
+#'   June 7 is the average of the underlying data for June 1 through 7,
+#'   inclusive.
+"case_death_rate_subset"
@@ -0,0 +1,56 @@
+#' Get test data for prediction based on longest lag period
+#'
+#' Based on the longest lag period in the recipe,
+#' `get_test_data()` creates a tibble in [epiprocess::epi_df]
+#' format with columns `geo_value`, `time_value`
+#' and other variables in the original dataset,
+#' which will be used to create test data.
+#'
+#' @param recipe A recipe object. The step will be added to the
+#'  sequence of operations for this recipe.
+#' @param x A data frame, tibble, or epi_df data set.
+#'
+#' @return A tibble with columns `geo_value`, `time_value`
+#' and other variables in the original dataset.
+#' @examples
+#' # create recipe
+#'  rec <- epi_recipe(case_death_rate_subset) %>%
+#'   step_epi_ahead(death_rate, ahead = 7) %>%
+#'   step_epi_lag(death_rate, lag = c(0, 7, 14)) %>%
+#'   step_epi_lag(case_rate, lag = c(0, 7, 14))
+#'  get_test_data(recipe = rec, x = case_death_rate_subset)
+#' @export
+
+get_test_data <- function(recipe, x){
+  # TO-DO: SOME CHECKS OF THE DATASET
+  if (any(!(c('geo_value','time_value') %in% colnames(x)))) {
+    rlang::abort("`geo_value`, `time_value` does not exist in data")
+  }
+  ## CHECK if it is epi_df?
+
+
+  # initialize vector to hold max lags for each variable
+  max_lags <- c()
+  for(i in c(1:length(recipe$steps))){
+    if("lag" %in% names(recipe$steps[[i]])){
+      max_lags <- append(max_lags, max(recipe$steps[[i]]$lag))
+    }
+  }
+
+  # CHECK: Return NA if insufficient training data
+  if (dplyr::n_distinct(x$time_value)< max(max_lags)) {
+    stop("insufficient training data")
+  }
+
+  test_data <- x %>%
+    dplyr::filter(
+      dplyr::if_any(
+        .cols = recipe$term_info$variable[which(recipe$var_info$role == 'raw')],
+        .fns = ~ !is.na(.x)
+      )
+    ) %>%
+    dplyr::group_by(geo_value) %>%
+    dplyr::slice_tail(n = max(max_lags) + 1)
+
+  return(test_data)
+}
@@ -0,0 +1,32 @@
+library(tidyverse)
+library(covidcast)
+library(delphi.epidata)
+library(epiprocess)
+
+x <- covidcast(
+  data_source = "jhu-csse",
+  signals = "confirmed_7dav_incidence_prop",
+  time_type = "day",
+  geo_type = "state",
+  time_values = epirange(20201231, 20211231),
+  geo_values = "*"
+) %>%
+  fetch_tbl() %>%
+  select(geo_value, time_value, case_rate = value)
+
+y <- covidcast(
+  data_source = "jhu-csse",
+  signals = "deaths_7dav_incidence_prop",
+  time_type = "day",
+  geo_type = "state",
+  time_values = epirange(20201231, 20211231),
+  geo_values = "*"
+) %>%
+  fetch_tbl() %>%
+  select(geo_value, time_value, death_rate = value)
+
+case_death_rate_subset <- x %>%
+  full_join(y, by = c("geo_value", "time_value")) %>%
+  as_epi_df()
+
+usethis::use_data(case_death_rate_subset, overwrite = TRUE)
@@ -0,0 +1,37 @@
+test_that("return expected number of rows", {
+  r <- epi_recipe(case_death_rate_subset) %>%
+    step_epi_ahead(death_rate, ahead = 7) %>%
+    step_epi_lag(death_rate, lag = c(0, 7, 14, 21, 28)) %>%
+    step_epi_lag(case_rate, lag = c(0, 7, 14)) %>%
+    step_naomit(all_predictors()) %>%
+    step_naomit(all_outcomes(), skip = TRUE)
+
+  test <- get_test_data(recipe = r, x = case_death_rate_subset)
+
+  expect_equal(nrow(test),
+               dplyr::n_distinct(case_death_rate_subset$geo_value)* 29)
+})
+
+
+test_that("expect insufficient training data error", {
+  r <- epi_recipe(case_death_rate_subset) %>%
+    step_epi_ahead(death_rate, ahead = 7) %>%
+    step_epi_lag(death_rate, lag = c(0, 367)) %>%
+    step_naomit(all_predictors()) %>%
+    step_naomit(all_outcomes(), skip = TRUE)
+
+  expect_error(get_test_data(recipe = r, x = case_death_rate_subset))
+})
+
+test_that("expect error that geo_value or time_value does not exist", {
+  r <-  epi_recipe(case_death_rate_subset) %>%
+    step_epi_ahead(death_rate, ahead = 7) %>%
+    step_epi_lag(death_rate, lag = c(0, 7, 14)) %>%
+    step_epi_lag(case_rate, lag = c(0, 7, 14)) %>%
+    step_naomit(all_predictors()) %>%
+    step_naomit(all_outcomes(), skip = TRUE)
+
+  wrong_epi_df <- case_death_rate_subset %>% dplyr::select(-geo_value)
+
+  expect_error(get_test_data(recipe = r, x = wrong_epi_df))
+})