Skip to content

Commit 8b73cc6

Browse files
authored
Merge pull request #32 from ChloeYou/main
get test data function created; internal dataset added
2 parents 7b13e55 + 74aed7a commit 8b73cc6

10 files changed

+247
-0
lines changed

.Rbuildignore

+1
Original file line numberDiff line numberDiff line change
@@ -9,3 +9,4 @@
99
^docs$
1010
^pkgdown$
1111
^musings$
12+
^data-raw$

DESCRIPTION

+3
Original file line numberDiff line numberDiff line change
@@ -50,3 +50,6 @@ Config/testthat/edition: 3
5050
Encoding: UTF-8
5151
Roxygen: list(markdown = TRUE)
5252
RoxygenNote: 7.2.0
53+
Depends:
54+
R (>= 2.10)
55+
LazyData: true

NAMESPACE

+1
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ export(epi_keys)
2424
export(epi_recipe)
2525
export(epi_workflow)
2626
export(get_precision)
27+
export(get_test_data)
2728
export(grab_names)
2829
export(is_epi_recipe)
2930
export(is_epi_workflow)

R/data.R

+35
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
#' Subset of JHU daily state cases and deaths
2+
#'
3+
#' This data source of confirmed COVID-19 cases and deaths
4+
#' is based on reports made available by the Center for
5+
#' Systems Science and Engineering at Johns Hopkins University.
6+
#' This example data ranges from Dec 31, 2020 to Dec 31, 2021,
7+
#' and includes all states.
8+
#'
9+
#' @format A tibble with 20,496 rows and 4 variables:
10+
#' \describe{
11+
#' \item{geo_value}{the geographic value associated with each row
12+
#' of measurements.}
13+
#' \item{time_value}{the time value associated with each row of measurements.}
14+
#' \item{case_rate}{7-day average signal of number of new
15+
#' confirmed COVID-19 cases per 100,000 population, daily}
16+
#' \item{death_rate}{7-day average signal of number of new confirmed
17+
#' deaths due to COVID-19 per 100,000 population, daily}
18+
#' }
19+
#' @source This object contains a modified part of the
20+
#' \href{https://github.com/CSSEGISandData/COVID-19}{COVID-19 Data Repository by the Center for Systems Science and Engineering (CSSE) at Johns Hopkins University}
21+
#' as \href{https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html}{republished in the COVIDcast Epidata API}.
22+
#' This data set is licensed under the terms of the
23+
#' \href{https://creativecommons.org/licenses/by/4.0/}{Creative Commons Attribution 4.0 International license}
24+
#' by the Johns Hopkins University on behalf of its Center for Systems Science
25+
#' in Engineering. Copyright Johns Hopkins University 2020.
26+
#'
27+
#' Modifications:
28+
#' * \href{https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html}{From the COVIDcast Epidata API}:
29+
#' These signals are taken directly from the JHU CSSE
30+
#' \href{https://github.com/CSSEGISandData/COVID-19}{COVID-19 GitHub repository}
31+
#' without changes. The 7-day average signals are computed by Delphi by
32+
#' calculating moving averages of the preceding 7 days, so the signal for
33+
#' June 7 is the average of the underlying data for June 1 through 7,
34+
#' inclusive.
35+
"case_death_rate_subset"

R/get_test_data.R

+56
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
#' Get test data for prediction based on longest lag period
2+
#'
3+
#' Based on the longest lag period in the recipe,
4+
#' `get_test_data()` creates a tibble in [epiprocess::epi_df]
5+
#' format with columns `geo_value`, `time_value`
6+
#' and other variables in the original dataset,
7+
#' which will be used to create test data.
8+
#'
9+
#' @param recipe A recipe object. The step will be added to the
10+
#' sequence of operations for this recipe.
11+
#' @param x A data frame, tibble, or epi_df data set.
12+
#'
13+
#' @return A tibble with columns `geo_value`, `time_value`
14+
#' and other variables in the original dataset.
15+
#' @examples
16+
#' # create recipe
17+
#' rec <- epi_recipe(case_death_rate_subset) %>%
18+
#' step_epi_ahead(death_rate, ahead = 7) %>%
19+
#' step_epi_lag(death_rate, lag = c(0, 7, 14)) %>%
20+
#' step_epi_lag(case_rate, lag = c(0, 7, 14))
21+
#' get_test_data(recipe = rec, x = case_death_rate_subset)
22+
#' @export
23+
24+
get_test_data <- function(recipe, x){
25+
# TO-DO: SOME CHECKS OF THE DATASET
26+
if (any(!(c('geo_value','time_value') %in% colnames(x)))) {
27+
rlang::abort("`geo_value`, `time_value` does not exist in data")
28+
}
29+
## CHECK if it is epi_df?
30+
31+
32+
# initialize vector to hold max lags for each variable
33+
max_lags <- c()
34+
for(i in c(1:length(recipe$steps))){
35+
if("lag" %in% names(recipe$steps[[i]])){
36+
max_lags <- append(max_lags, max(recipe$steps[[i]]$lag))
37+
}
38+
}
39+
40+
# CHECK: Return NA if insufficient training data
41+
if (dplyr::n_distinct(x$time_value)< max(max_lags)) {
42+
stop("insufficient training data")
43+
}
44+
45+
test_data <- x %>%
46+
dplyr::filter(
47+
dplyr::if_any(
48+
.cols = recipe$term_info$variable[which(recipe$var_info$role == 'raw')],
49+
.fns = ~ !is.na(.x)
50+
)
51+
) %>%
52+
dplyr::group_by(geo_value) %>%
53+
dplyr::slice_tail(n = max(max_lags) + 1)
54+
55+
return(test_data)
56+
}

data-raw/case_death_rate_subset.R

+32
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
library(tidyverse)
2+
library(covidcast)
3+
library(delphi.epidata)
4+
library(epiprocess)
5+
6+
x <- covidcast(
7+
data_source = "jhu-csse",
8+
signals = "confirmed_7dav_incidence_prop",
9+
time_type = "day",
10+
geo_type = "state",
11+
time_values = epirange(20201231, 20211231),
12+
geo_values = "*"
13+
) %>%
14+
fetch_tbl() %>%
15+
select(geo_value, time_value, case_rate = value)
16+
17+
y <- covidcast(
18+
data_source = "jhu-csse",
19+
signals = "deaths_7dav_incidence_prop",
20+
time_type = "day",
21+
geo_type = "state",
22+
time_values = epirange(20201231, 20211231),
23+
geo_values = "*"
24+
) %>%
25+
fetch_tbl() %>%
26+
select(geo_value, time_value, death_rate = value)
27+
28+
case_death_rate_subset <- x %>%
29+
full_join(y, by = c("geo_value", "time_value")) %>%
30+
as_epi_df()
31+
32+
usethis::use_data(case_death_rate_subset, overwrite = TRUE)

data/case_death_rate_subset.rda

208 KB
Binary file not shown.

man/case_death_rate_subset.Rd

+49
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

man/get_test_data.Rd

+33
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

tests/testthat/test-get_test_data.R

+37
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
test_that("return expected number of rows", {
2+
r <- epi_recipe(case_death_rate_subset) %>%
3+
step_epi_ahead(death_rate, ahead = 7) %>%
4+
step_epi_lag(death_rate, lag = c(0, 7, 14, 21, 28)) %>%
5+
step_epi_lag(case_rate, lag = c(0, 7, 14)) %>%
6+
step_naomit(all_predictors()) %>%
7+
step_naomit(all_outcomes(), skip = TRUE)
8+
9+
test <- get_test_data(recipe = r, x = case_death_rate_subset)
10+
11+
expect_equal(nrow(test),
12+
dplyr::n_distinct(case_death_rate_subset$geo_value)* 29)
13+
})
14+
15+
16+
test_that("expect insufficient training data error", {
17+
r <- epi_recipe(case_death_rate_subset) %>%
18+
step_epi_ahead(death_rate, ahead = 7) %>%
19+
step_epi_lag(death_rate, lag = c(0, 367)) %>%
20+
step_naomit(all_predictors()) %>%
21+
step_naomit(all_outcomes(), skip = TRUE)
22+
23+
expect_error(get_test_data(recipe = r, x = case_death_rate_subset))
24+
})
25+
26+
test_that("expect error that geo_value or time_value does not exist", {
27+
r <- epi_recipe(case_death_rate_subset) %>%
28+
step_epi_ahead(death_rate, ahead = 7) %>%
29+
step_epi_lag(death_rate, lag = c(0, 7, 14)) %>%
30+
step_epi_lag(case_rate, lag = c(0, 7, 14)) %>%
31+
step_naomit(all_predictors()) %>%
32+
step_naomit(all_outcomes(), skip = TRUE)
33+
34+
wrong_epi_df <- case_death_rate_subset %>% dplyr::select(-geo_value)
35+
36+
expect_error(get_test_data(recipe = r, x = wrong_epi_df))
37+
})

0 commit comments

Comments
 (0)