Skip to content

get test data function created; internal dataset added #32

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Jun 6, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .Rbuildignore
Original file line number Diff line number Diff line change
Expand Up @@ -9,3 +9,4 @@
^docs$
^pkgdown$
^musings$
^data-raw$
3 changes: 3 additions & 0 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -42,3 +42,6 @@ Config/testthat/edition: 3
Encoding: UTF-8
Roxygen: list(markdown = TRUE)
RoxygenNote: 7.2.0
Depends:
R (>= 2.10)
LazyData: true
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ export(df_mat_mul)
export(epi_keys)
export(epi_recipe)
export(get_precision)
export(get_test_data)
export(grab_names)
export(knn_iteraive_ar_args_list)
export(knn_iteraive_ar_forecaster)
Expand Down
35 changes: 35 additions & 0 deletions R/data.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
#' Subset of JHU daily state cases and deaths
#'
#' This data source of confirmed COVID-19 cases and deaths
#' is based on reports made available by the Center for
#' Systems Science and Engineering at Johns Hopkins University.
#' This example data ranges from Dec 31, 2020 to Dec 31, 2021,
#' and includes all states.
#'
#' @format A tibble with 20,496 rows and 4 variables:
#' \describe{
#' \item{geo_value}{the geographic value associated with each row
#' of measurements.}
#' \item{time_value}{the time value associated with each row of measurements.}
#' \item{case_rate}{7-day average signal of number of new
#' confirmed COVID-19 cases per 100,000 population, daily}
#' \item{death_rate}{7-day average signal of number of new confirmed
#' deaths due to COVID-19 per 100,000 population, daily}
#' }
#' @source This object contains a modified part of the
#' \href{https://github.com/CSSEGISandData/COVID-19}{COVID-19 Data Repository by the Center for Systems Science and Engineering (CSSE) at Johns Hopkins University}
#' as \href{https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html}{republished in the COVIDcast Epidata API}.
#' This data set is licensed under the terms of the
#' \href{https://creativecommons.org/licenses/by/4.0/}{Creative Commons Attribution 4.0 International license}
#' by the Johns Hopkins University on behalf of its Center for Systems Science
#' in Engineering. Copyright Johns Hopkins University 2020.
#'
#' Modifications:
#' * \href{https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html}{From the COVIDcast Epidata API}:
#' These signals are taken directly from the JHU CSSE
#' \href{https://github.com/CSSEGISandData/COVID-19}{COVID-19 GitHub repository}
#' without changes. The 7-day average signals are computed by Delphi by
#' calculating moving averages of the preceding 7 days, so the signal for
#' June 7 is the average of the underlying data for June 1 through 7,
#' inclusive.
"case_death_rate_subset"
56 changes: 56 additions & 0 deletions R/get_test_data.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
#' Get test data for prediction based on longest lag period
#'
#' Based on the longest lag period in the recipe,
#' `get_test_data()` creates a tibble in [epiprocess::epi_df]
#' format with columns `geo_value`, `time_value`
#' and other variables in the original dataset,
#' which will be used to create test data.
#'
#' @param recipe A recipe object. The step will be added to the
#' sequence of operations for this recipe.
#' @param x A data frame, tibble, or epi_df data set.
#'
#' @return A tibble with columns `geo_value`, `time_value`
#' and other variables in the original dataset.
#' @examples
#' # create recipe
#' rec <- epi_recipe(case_death_rate_subset) %>%
#' step_epi_ahead(death_rate, ahead = 7) %>%
#' step_epi_lag(death_rate, lag = c(0, 7, 14)) %>%
#' step_epi_lag(case_rate, lag = c(0, 7, 14))
#' get_test_data(recipe = rec, x = case_death_rate_subset)
#' @export

get_test_data <- function(recipe, x){
# TO-DO: SOME CHECKS OF THE DATASET
if (any(!(c('geo_value','time_value') %in% colnames(x)))) {
rlang::abort("`geo_value`, `time_value` does not exist in data")
}
## CHECK if it is epi_df?


# initialize vector to hold max lags for each variable
max_lags <- c()
for(i in c(1:length(recipe$steps))){
if("lag" %in% names(recipe$steps[[i]])){
max_lags <- append(max_lags, max(recipe$steps[[i]]$lag))
}
}

# CHECK: Return NA if insufficient training data
if (dplyr::n_distinct(x$time_value)< max(max_lags)) {
stop("insufficient training data")
}

test_data <- x %>%
dplyr::filter(
dplyr::if_any(
.cols = recipe$term_info$variable[which(recipe$var_info$role == 'raw')],
.fns = ~ !is.na(.x)
)
) %>%
dplyr::group_by(geo_value) %>%
dplyr::slice_tail(n = max(max_lags) + 1)

return(test_data)
}
32 changes: 32 additions & 0 deletions data-raw/case_death_rate_subset.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
library(tidyverse)
library(covidcast)
library(delphi.epidata)
library(epiprocess)

x <- covidcast(
data_source = "jhu-csse",
signals = "confirmed_7dav_incidence_prop",
time_type = "day",
geo_type = "state",
time_values = epirange(20201231, 20211231),
geo_values = "*"
) %>%
fetch_tbl() %>%
select(geo_value, time_value, case_rate = value)

y <- covidcast(
data_source = "jhu-csse",
signals = "deaths_7dav_incidence_prop",
time_type = "day",
geo_type = "state",
time_values = epirange(20201231, 20211231),
geo_values = "*"
) %>%
fetch_tbl() %>%
select(geo_value, time_value, death_rate = value)

case_death_rate_subset <- x %>%
full_join(y, by = c("geo_value", "time_value")) %>%
as_epi_df()

usethis::use_data(case_death_rate_subset, overwrite = TRUE)
Binary file added data/case_death_rate_subset.rda
Binary file not shown.
49 changes: 49 additions & 0 deletions man/case_death_rate_subset.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

33 changes: 33 additions & 0 deletions man/get_test_data.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

37 changes: 37 additions & 0 deletions tests/testthat/test-get_test_data.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
test_that("return expected number of rows", {
r <- epi_recipe(case_death_rate_subset) %>%
step_epi_ahead(death_rate, ahead = 7) %>%
step_epi_lag(death_rate, lag = c(0, 7, 14, 21, 28)) %>%
step_epi_lag(case_rate, lag = c(0, 7, 14)) %>%
step_naomit(all_predictors()) %>%
step_naomit(all_outcomes(), skip = TRUE)

test <- get_test_data(recipe = r, x = case_death_rate_subset)

expect_equal(nrow(test),
dplyr::n_distinct(case_death_rate_subset$geo_value)* 29)
})


test_that("expect insufficient training data error", {
r <- epi_recipe(case_death_rate_subset) %>%
step_epi_ahead(death_rate, ahead = 7) %>%
step_epi_lag(death_rate, lag = c(0, 367)) %>%
step_naomit(all_predictors()) %>%
step_naomit(all_outcomes(), skip = TRUE)

expect_error(get_test_data(recipe = r, x = case_death_rate_subset))
})

test_that("expect error that geo_value or time_value does not exist", {
r <- epi_recipe(case_death_rate_subset) %>%
step_epi_ahead(death_rate, ahead = 7) %>%
step_epi_lag(death_rate, lag = c(0, 7, 14)) %>%
step_epi_lag(case_rate, lag = c(0, 7, 14)) %>%
step_naomit(all_predictors()) %>%
step_naomit(all_outcomes(), skip = TRUE)

wrong_epi_df <- case_death_rate_subset %>% dplyr::select(-geo_value)

expect_error(get_test_data(recipe = r, x = wrong_epi_df))
})