cmu-delphi
diff --git a/‎R/reexports-epidatasets.R
+54 b/‎R/reexports-epidatasets.R
+54
diff --git a/‎inst/extdata/can_prov_cases.rds
-155 KB b/‎inst/extdata/can_prov_cases.rds
-155 KB
diff --git a/‎inst/extdata/canada-case-rates.R
-23 b/‎inst/extdata/canada-case-rates.R
-23
diff --git a/‎inst/extdata/epi_archive.rds
-479 KB b/‎inst/extdata/epi_archive.rds
-479 KB
diff --git a/‎vignettes/articles/case_death_rate_archive.rds
-1.25 MB b/‎vignettes/articles/case_death_rate_archive.rds
-1.25 MB
diff --git a/‎vignettes/articles/sliding.Rmd
+56-28 b/‎vignettes/articles/sliding.Rmd
+56-28
diff --git a/‎vignettes/articles/symptom-surveys.Rmd
+39-14 b/‎vignettes/articles/symptom-surveys.Rmd
+39-14
@@ -87,3 +87,57 @@ delayedAssign("counts_subset", epidatasets::counts_subset)
 #' data(ctis_covid_behaviours, package = "epipredict")
 #' @export
 delayedAssign("ctis_covid_behaviours", epidatasets::ctis_covid_behaviours)
+
+#' @inherit epidatasets::county_smoothed_cli_comparison description source references title
+#' @inheritSection epidatasets::county_smoothed_cli_comparison Data dictionary
+#' @examples
+#' # Since this is a re-exported dataset, it cannot be loaded using
+#' # the `data()` function. `data()` looks for a file of the same name
+#' # in the `data/` directory, which doesn't exist in this package.
+#'
+#' # works: refer to the dataset through the package namespace
+#' epipredict::county_smoothed_cli_comparison
+#'
+#' # works: attach the package, then refer to the dataset by name
+#' library(epipredict)
+#' county_smoothed_cli_comparison
+#'
+#' # fails: `data()` cannot find the re-exported dataset
+#' data(county_smoothed_cli_comparison, package = "epipredict")
+#' @export
+delayedAssign("county_smoothed_cli_comparison", epidatasets::county_smoothed_cli_comparison)
+
+#' @inherit epidatasets::case_death_rate_archive description source references title
+#' @inheritSection epidatasets::case_death_rate_archive Data dictionary
+#' @examples
+#' # Since this is a re-exported dataset, it cannot be loaded using
+#' # the `data()` function. `data()` looks for a file of the same name
+#' # in the `data/` directory, which doesn't exist in this package.
+#'
+#' # works: refer to the dataset through the package namespace
+#' epipredict::case_death_rate_archive
+#'
+#' # works: attach the package, then refer to the dataset by name
+#' library(epipredict)
+#' case_death_rate_archive
+#'
+#' # fails: `data()` cannot find the re-exported dataset
+#' data(case_death_rate_archive, package = "epipredict")
+#' @export
+delayedAssign("case_death_rate_archive", epidatasets::case_death_rate_archive)
+
+#' @inherit epidatasets::archive_cases_dv_subset_all_states description source references title
+#' @inheritSection epidatasets::archive_cases_dv_subset_all_states Data dictionary
+#' @examples
+#' # Since this is a re-exported dataset, it cannot be loaded using
+#' # the `data()` function. `data()` looks for a file of the same name
+#' # in the `data/` directory, which doesn't exist in this package.
+#'
+#' # works: refer to the dataset through the package namespace
+#' epipredict::archive_cases_dv_subset_all_states
+#'
+#' # works: attach the package, then refer to the dataset by name
+#' library(epipredict)
+#' archive_cases_dv_subset_all_states
+#'
+#' # fails: `data()` cannot find the re-exported dataset
+#' data(archive_cases_dv_subset_all_states, package = "epipredict")
+#' @export
+delayedAssign("archive_cases_dv_subset_all_states", epidatasets::archive_cases_dv_subset_all_states)
@@ -14,7 +14,6 @@ knitr::opts_chunk$set(
 
 ```{r pkgs}
 library(epipredict)
-library(epidatr)
 library(data.table)
 library(dplyr)
 library(tidyr)
@@ -60,25 +59,54 @@ claims and the number of new confirmed COVID-19 cases per 100,000 population
 
 <summary>Load a data archive</summary>
 
-We process as before, with the
-modification that we use `sync = locf` in `epix_merge()` so that the last
-version of each observation can be carried forward to extrapolate unavailable
-versions for the less up-to-date input archive.
+This dataset is processed as before, with the modification that we use
+`sync = "locf"` in `epix_merge()` so that the last version of each observation
+can be carried forward to extrapolate unavailable versions for the less
+up-to-date input archive.
 
 ```{r grab-epi-data}
 theme_set(theme_bw())
 
-y <- readRDS("all_states_covidcast_signals.rds")
+x <- archive_cases_dv_subset_all_states
+```
+
+The data can also be constructed from data in the Delphi API with the following code:
 
-y <- purrr::map(y, ~ select(.x, geo_value, time_value, version = issue, value))
+```{r generate-data, eval=FALSE}
+library(epidatr)
 
-x <- epix_merge(
-  y[[1]] %>% rename(percent_cli = value) %>% as_epi_archive(compactify = FALSE),
-  y[[2]] %>% rename(case_rate = value) %>% as_epi_archive(compactify = FALSE),
+dv_subset <- pub_covidcast(
+  source = "doctor-visits",
+  signals = "smoothed_adj_cli",
+  time_type = "day",
+  geo_type = "state",
+  time_values = epirange(20200601, 20211201),
+  geo_values = "*",
+  issues = epirange(20200601, 20211201)
+) %>%
+  select(geo_value, time_value, version = issue, percent_cli = value) %>%
+  # We're using compactify=FALSE here and below to avoid some testthat test
+  # failures on tests that were based on a non-compactified version.
+  as_epi_archive(compactify = FALSE)
+
+case_rate_subset <- pub_covidcast(
+  source = "jhu-csse",
+  signals = "confirmed_7dav_incidence_prop",
+  time_type = "day",
+  geo_type = "state",
+  time_values = epirange(20200601, 20211201),
+  geo_values = "*",
+  issues = epirange(20200601, 20211201)
+) %>%
+  select(geo_value, time_value, version = issue, case_rate_7d_av = value) %>%
+  as_epi_archive(compactify = FALSE)
+
+# Merge the doctor-visits and case-rate archives into a single archive.
+# Use `epiprocess::epix_merge` to avoid having to reimplement `sync`ing
+# behavior: with `sync = "locf"`, the last version of each observation is
+# carried forward for the less up-to-date input archive.
+archive_cases_dv_subset_all_states_dt <- epix_merge(
+  dv_subset, case_rate_subset,
   sync = "locf",
-  compactify = TRUE
-)
-rm(y)
+  compactify = TRUE)
 ```
 
 </details>
@@ -217,11 +245,7 @@ the American data, but here we compare the forecasts produced from using simple
 linear regression with those from using boosted regression trees.
 
 ```{r get-can-fc, warning = FALSE}
-# source("drafts/canada-case-rates.R)
-can <- readRDS(system.file(
-  "extdata", "can_prov_cases.rds",
-  package = "epipredict", mustWork = TRUE
-))
+can <- can_prov_cases
 
 can <- can %>%
   group_by(version, geo_value) %>%
@@ -325,9 +349,20 @@ combined data from all US states and territories) to train our model.
 
 <details>
 
-<summary>Download data using `{epidatr}`</summary>
-```{r load-data, eval=FALSE}
-# loading in the data
+<summary>Fetch data</summary>
+The data are included in this package (via the
+[`epidatasets` package](https://cmu-delphi.github.io/epidatasets/)), 
+and can be loaded with:
+
+```{r load-data, message = FALSE, warning = FALSE}
+x <- case_death_rate_archive
+```
+
+The data can also be constructed from data in the Delphi API with the following code:
+
+```{r generate-case-death-data, eval=FALSE}
+library(epidatr)
+
 states <- "*"
 
 confirmed_incidence_prop <- pub_covidcast(
@@ -384,13 +419,6 @@ x <- x %>%
     death_rate_7d_av = slide_value_death_rate_7d_av
   ) %>%
   as_epi_archive(compactify = TRUE)
-
-saveRDS(x$DT, file = "case_death_rate_archive.rds")
-```
-
-```{r load-stored-data}
-x <- readRDS("case_death_rate_archive.rds")
-x <- as_epi_archive(x)
 ```
 </details>
 
 
@@ -145,22 +145,40 @@ own forecaster under the `epipredict` framework, we could easily add steps to
 re-scale and transform the signals to our `epi_recipe`. This would make the code
 more succinct and self-contained.
 
+We will compare two CLI-in-community indicators from
+different sources. The data are included in this package (via the
+[`epidatasets` package](https://cmu-delphi.github.io/epidatasets/)), 
+and can be loaded with:
+
 ```{r, message = FALSE, warning = FALSE}
-library(epidatr)
 library(dplyr)
 library(purrr)
 library(epipredict)
 library(recipes)
 
+z <- county_smoothed_cli_comparison
+```
+
+The data can also be constructed from data in the Delphi API with the following code:
+
+```{r, message = FALSE, warning = FALSE, eval = FALSE}
+library(epidatr)
+
+d <- "2020-09-21"
+
 case_num <- 200
-as_of_date <- "2020-05-14"
-geo_values <- pub_covidcast(
+geos_date <- "2020-05-14"
+
+# Find counties that on 2020-05-14 had >= 200 cases reported.
+# For later datasets, we will only keep data for these geos.
+geo_values_initial <- pub_covidcast(
   source = "jhu-csse",
   signals = "confirmed_cumulative_num",
   geo_type = "county",
   time_type = "day",
   geo_values = "*",
-  time_values = epirange(20200514, 20200514)
+  time_values = epirange(geos_date, geos_date),
+  as_of = d
 ) %>%
   filter(value >= case_num) %>%
   pull(geo_value) %>%
@@ -177,9 +195,10 @@ goog_sm_cli <- pub_covidcast(
   geo_type = "county",
   time_type = "day",
   geo_values = "*",
-  time_values = epirange(start_day, end_day)
+  time_values = epirange(start_day, end_day),
+  as_of = d
 ) %>%
-  filter(geo_value %in% geo_values) %>%
+  filter(geo_value %in% geo_values_initial) %>%
   select(geo_value, time_value, value) %>%
   rename(goog = value)
 
@@ -189,9 +208,10 @@ fb_survey <- pub_covidcast(
   geo_type = "county",
   time_type = "day",
   geo_values = "*",
-  time_values = epirange(start_day, end_day)
+  time_values = epirange(start_day, end_day),
+  as_of = d
 ) %>%
-  filter(geo_value %in% geo_values) %>%
+  filter(geo_value %in% geo_values_initial) %>%
   select(geo_value, time_value, value) %>%
   rename(fb = value)
 
@@ -201,26 +221,31 @@ jhu_7dav_incid <- pub_covidcast(
   geo_type = "county",
   time_type = "day",
   geo_values = "*",
-  time_values = epirange(start_day, end_day)
+  time_values = epirange(start_day, end_day),
+  as_of = d
 ) %>%
-  filter(geo_value %in% geo_values) %>%
+  filter(geo_value %in% geo_values_initial) %>%
   select(geo_value, time_value, value) %>%
   rename(case = value)
 
-# Find "complete" counties, present in all three data signals at all times
+# Find "complete" counties, present in all three data signals, and also 
+# present in the `geo_values_initial` object.
 geo_values_complete <- intersect(
   intersect(goog_sm_cli$geo_value, fb_survey$geo_value),
   jhu_7dav_incid$geo_value
 )
 
-# Make one big matrix by joining these three data frames
-z <- full_join(full_join(goog_sm_cli, fb_survey, by = c("geo_value", "time_value")),
+# Join the three data frames together
+z <- full_join(
+  full_join(goog_sm_cli, fb_survey, by = c("geo_value", "time_value")),
   jhu_7dav_incid,
   by = c("geo_value", "time_value")
 ) %>%
   filter(geo_value %in% geo_values_complete) %>%
-  as_epi_df()
+  as_epi_df(as_of = d)
+```
 
+```{r, message = FALSE, warning = FALSE}
 Logit <- function(x, a = 0.01) log((x + a) / (1 - x + a))
 Sigmd <- function(y, a = 0.01) (exp(y) * (1 + a) - a) / (1 + exp(y))