diff --git a/.Rbuildignore b/.Rbuildignore index 7007d2e22..aeec9a2cb 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -9,3 +9,4 @@ ^docs$ ^pkgdown$ ^musings$ +^data-raw$ diff --git a/DESCRIPTION b/DESCRIPTION index f51ff8a26..e4c939dd4 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -42,3 +42,6 @@ Config/testthat/edition: 3 Encoding: UTF-8 Roxygen: list(markdown = TRUE) RoxygenNote: 7.2.0 +Depends: + R (>= 2.10) +LazyData: true diff --git a/NAMESPACE b/NAMESPACE index b632f8922..e3d6d3365 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -20,6 +20,7 @@ export(df_mat_mul) export(epi_keys) export(epi_recipe) export(get_precision) +export(get_test_data) export(grab_names) export(knn_iteraive_ar_args_list) export(knn_iteraive_ar_forecaster) diff --git a/R/data.R b/R/data.R new file mode 100644 index 000000000..9c93506a3 --- /dev/null +++ b/R/data.R @@ -0,0 +1,35 @@ +#' Subset of JHU daily state cases and deaths +#' +#' This data source of confirmed COVID-19 cases and deaths +#' is based on reports made available by the Center for +#' Systems Science and Engineering at Johns Hopkins University. +#' This example data ranges from Dec 31, 2020 to Dec 31, 2021, +#' and includes all states. +#' +#' @format A tibble with 20,496 rows and 4 variables: +#' \describe{ +#' \item{geo_value}{the geographic value associated with each row +#' of measurements.} +#' \item{time_value}{the time value associated with each row of measurements.} +#' \item{case_rate}{7-day average signal of number of new +#' confirmed COVID-19 cases per 100,000 population, daily} +#' \item{death_rate}{7-day average signal of number of new confirmed +#' deaths due to COVID-19 per 100,000 population, daily} +#' } +#' @source This object contains a modified part of the +#' \href{https://github.com/CSSEGISandData/COVID-19}{COVID-19 Data Repository by the Center for Systems Science and Engineering (CSSE) at Johns Hopkins University} +#' as \href{https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html}{republished in the COVIDcast Epidata API}. +#' This data set is licensed under the terms of the +#' \href{https://creativecommons.org/licenses/by/4.0/}{Creative Commons Attribution 4.0 International license} +#' by the Johns Hopkins University on behalf of its Center for Systems Science +#' in Engineering. Copyright Johns Hopkins University 2020. +#' +#' Modifications: +#' * \href{https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html}{From the COVIDcast Epidata API}: +#' These signals are taken directly from the JHU CSSE +#' \href{https://github.com/CSSEGISandData/COVID-19}{COVID-19 GitHub repository} +#' without changes. The 7-day average signals are computed by Delphi by +#' calculating moving averages of the preceding 7 days, so the signal for +#' June 7 is the average of the underlying data for June 1 through 7, +#' inclusive. +"case_death_rate_subset" diff --git a/R/get_test_data.R b/R/get_test_data.R new file mode 100644 index 000000000..69ec92d1f --- /dev/null +++ b/R/get_test_data.R @@ -0,0 +1,56 @@ +#' Get test data for prediction based on longest lag period +#' +#' Based on the longest lag period in the recipe, +#' `get_test_data()` creates a tibble in [epiprocess::epi_df] +#' format with columns `geo_value`, `time_value` +#' and other variables in the original dataset, +#' which will be used to create test data. +#' +#' @param recipe A recipe object. The step will be added to the +#' sequence of operations for this recipe. +#' @param x A data frame, tibble, or epi_df data set. +#' +#' @return A tibble with columns `geo_value`, `time_value` +#' and other variables in the original dataset. +#' @examples +#' # create recipe +#' rec <- epi_recipe(case_death_rate_subset) %>% +#' step_epi_ahead(death_rate, ahead = 7) %>% +#' step_epi_lag(death_rate, lag = c(0, 7, 14)) %>% +#' step_epi_lag(case_rate, lag = c(0, 7, 14)) +#' get_test_data(recipe = rec, x = case_death_rate_subset) +#' @export + +get_test_data <- function(recipe, x){ + # TO-DO: SOME CHECKS OF THE DATASET + if (any(!(c('geo_value','time_value') %in% colnames(x)))) { + rlang::abort("`geo_value`, `time_value` does not exist in data") + } + ## CHECK if it is epi_df? + + + # initialize vector to hold max lags for each variable + max_lags <- c() + for(i in c(1:length(recipe$steps))){ + if("lag" %in% names(recipe$steps[[i]])){ + max_lags <- append(max_lags, max(recipe$steps[[i]]$lag)) + } + } + + # CHECK: Return NA if insufficient training data + if (dplyr::n_distinct(x$time_value)< max(max_lags)) { + stop("insufficient training data") + } + + test_data <- x %>% + dplyr::filter( + dplyr::if_any( + .cols = recipe$term_info$variable[which(recipe$var_info$role == 'raw')], + .fns = ~ !is.na(.x) + ) + ) %>% + dplyr::group_by(geo_value) %>% + dplyr::slice_tail(n = max(max_lags) + 1) + + return(test_data) +} diff --git a/data-raw/case_death_rate_subset.R b/data-raw/case_death_rate_subset.R new file mode 100644 index 000000000..11f552f4d --- /dev/null +++ b/data-raw/case_death_rate_subset.R @@ -0,0 +1,32 @@ +library(tidyverse) +library(covidcast) +library(delphi.epidata) +library(epiprocess) + +x <- covidcast( + data_source = "jhu-csse", + signals = "confirmed_7dav_incidence_prop", + time_type = "day", + geo_type = "state", + time_values = epirange(20201231, 20211231), + geo_values = "*" +) %>% + fetch_tbl() %>% + select(geo_value, time_value, case_rate = value) + +y <- covidcast( + data_source = "jhu-csse", + signals = "deaths_7dav_incidence_prop", + time_type = "day", + geo_type = "state", + time_values = epirange(20201231, 20211231), + geo_values = "*" +) %>% + fetch_tbl() %>% + select(geo_value, time_value, death_rate = value) + +case_death_rate_subset <- x %>% + full_join(y, by = c("geo_value", "time_value")) %>% + as_epi_df() + +usethis::use_data(case_death_rate_subset, overwrite = TRUE) diff --git a/data/case_death_rate_subset.rda b/data/case_death_rate_subset.rda new file mode 100644 index 000000000..2e5ced29e Binary files /dev/null and b/data/case_death_rate_subset.rda differ diff --git a/man/case_death_rate_subset.Rd b/man/case_death_rate_subset.Rd new file mode 100644 index 000000000..119c8ee26 --- /dev/null +++ b/man/case_death_rate_subset.Rd @@ -0,0 +1,49 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/data.R +\docType{data} +\name{case_death_rate_subset} +\alias{case_death_rate_subset} +\title{Subset of JHU daily state cases and deaths} +\format{ +A tibble with 20,496 rows and 4 variables: +\describe{ +\item{geo_value}{the geographic value associated with each row +of measurements.} +\item{time_value}{the time value associated with each row of measurements.} +\item{case_rate}{7-day average signal of number of new +confirmed COVID-19 cases per 100,000 population, daily} +\item{death_rate}{7-day average signal of number of new confirmed +deaths due to COVID-19 per 100,000 population, daily} +} +} +\source{ +This object contains a modified part of the +\href{https://github.com/CSSEGISandData/COVID-19}{COVID-19 Data Repository by the Center for Systems Science and Engineering (CSSE) at Johns Hopkins University} +as \href{https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html}{republished in the COVIDcast Epidata API}. +This data set is licensed under the terms of the +\href{https://creativecommons.org/licenses/by/4.0/}{Creative Commons Attribution 4.0 International license} +by the Johns Hopkins University on behalf of its Center for Systems Science +in Engineering. Copyright Johns Hopkins University 2020. + +Modifications: +\itemize{ +\item \href{https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html}{From the COVIDcast Epidata API}: +These signals are taken directly from the JHU CSSE +\href{https://github.com/CSSEGISandData/COVID-19}{COVID-19 GitHub repository} +without changes. The 7-day average signals are computed by Delphi by +calculating moving averages of the preceding 7 days, so the signal for +June 7 is the average of the underlying data for June 1 through 7, +inclusive. +} +} +\usage{ +case_death_rate_subset +} +\description{ +This data source of confirmed COVID-19 cases and deaths +is based on reports made available by the Center for +Systems Science and Engineering at Johns Hopkins University. +This example data ranges from Dec 31, 2020 to Dec 31, 2021, +and includes all states. +} +\keyword{datasets} diff --git a/man/get_test_data.Rd b/man/get_test_data.Rd new file mode 100644 index 000000000..b71bad8f6 --- /dev/null +++ b/man/get_test_data.Rd @@ -0,0 +1,33 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/get_test_data.R +\name{get_test_data} +\alias{get_test_data} +\title{Get test data for prediction based on longest lag period} +\usage{ +get_test_data(recipe, x) +} +\arguments{ +\item{recipe}{A recipe object. The step will be added to the +sequence of operations for this recipe.} + +\item{x}{A data frame, tibble, or epi_df data set.} +} +\value{ +A tibble with columns \code{geo_value}, \code{time_value} +and other variables in the original dataset. +} +\description{ +Based on the longest lag period in the recipe, +\code{get_test_data()} creates a tibble in \link[epiprocess:epi_df]{epiprocess::epi_df} +format with columns \code{geo_value}, \code{time_value} +and other variables in the original dataset, +which will be used to create test data. +} +\examples{ +# create recipe + rec <- epi_recipe(case_death_rate_subset) \%>\% + step_epi_ahead(death_rate, ahead = 7) \%>\% + step_epi_lag(death_rate, lag = c(0, 7, 14)) \%>\% + step_epi_lag(case_rate, lag = c(0, 7, 14)) + get_test_data(recipe = rec, x = case_death_rate_subset) +} diff --git a/tests/testthat/test-get_test_data.R b/tests/testthat/test-get_test_data.R new file mode 100644 index 000000000..e063523a1 --- /dev/null +++ b/tests/testthat/test-get_test_data.R @@ -0,0 +1,37 @@ +test_that("return expected number of rows", { + r <- epi_recipe(case_death_rate_subset) %>% + step_epi_ahead(death_rate, ahead = 7) %>% + step_epi_lag(death_rate, lag = c(0, 7, 14, 21, 28)) %>% + step_epi_lag(case_rate, lag = c(0, 7, 14)) %>% + step_naomit(all_predictors()) %>% + step_naomit(all_outcomes(), skip = TRUE) + + test <- get_test_data(recipe = r, x = case_death_rate_subset) + + expect_equal(nrow(test), + dplyr::n_distinct(case_death_rate_subset$geo_value)* 29) +}) + + +test_that("expect insufficient training data error", { + r <- epi_recipe(case_death_rate_subset) %>% + step_epi_ahead(death_rate, ahead = 7) %>% + step_epi_lag(death_rate, lag = c(0, 367)) %>% + step_naomit(all_predictors()) %>% + step_naomit(all_outcomes(), skip = TRUE) + + expect_error(get_test_data(recipe = r, x = case_death_rate_subset)) +}) + +test_that("expect error that geo_value or time_value does not exist", { + r <- epi_recipe(case_death_rate_subset) %>% + step_epi_ahead(death_rate, ahead = 7) %>% + step_epi_lag(death_rate, lag = c(0, 7, 14)) %>% + step_epi_lag(case_rate, lag = c(0, 7, 14)) %>% + step_naomit(all_predictors()) %>% + step_naomit(all_outcomes(), skip = TRUE) + + wrong_epi_df <- case_death_rate_subset %>% dplyr::select(-geo_value) + + expect_error(get_test_data(recipe = r, x = wrong_epi_df)) +})