Skip to content

Commit 4815e37

Browse files
committed
wip: move datasets to epidatasets and use in vignettes
1 parent 63699ac commit 4815e37

8 files changed

+195
-83
lines changed

R/reexports-epidatasets.R

+54
Original file line numberDiff line numberDiff line change
@@ -87,3 +87,57 @@ delayedAssign("counts_subset", epidatasets::counts_subset)
8787
#' data(ctis_covid_behaviours, package = "epipredict")
8888
#' @export
8989
delayedAssign("ctis_covid_behaviours", epidatasets::ctis_covid_behaviours)
90+
91+
#' @inherit epidatasets::county_smoothed_cli_comparison description source references title
92+
#' @inheritSection epidatasets::county_smoothed_cli_comparison Data dictionary
93+
#' @examples
94+
#' # Since this is a re-exported dataset, it cannot be loaded using
95+
#' # the `data()` function. `data()` looks for a file of the same name
96+
#' # in the `data/` directory, which doesn't exist in this package.
97+
#' # works
98+
#' epipredict::county_smoothed_cli_comparison
99+
#'
100+
#' # works
101+
#' library(epipredict)
102+
#' county_smoothed_cli_comparison
103+
#'
104+
#' # fails
105+
#' data(county_smoothed_cli_comparison, package = "epipredict")
106+
#' @export
107+
delayedAssign("county_smoothed_cli_comparison", epidatasets::county_smoothed_cli_comparison)
108+
109+
#' @inherit epidatasets::case_death_rate_archive description source references title
110+
#' @inheritSection epidatasets::case_death_rate_archive Data dictionary
111+
#' @examples
112+
#' # Since this is a re-exported dataset, it cannot be loaded using
113+
#' # the `data()` function. `data()` looks for a file of the same name
114+
#' # in the `data/` directory, which doesn't exist in this package.
115+
#' # works
116+
#' epipredict::case_death_rate_archive
117+
#'
118+
#' # works
119+
#' library(epipredict)
120+
#' case_death_rate_archive
121+
#'
122+
#' # fails
123+
#' data(case_death_rate_archive, package = "epipredict")
124+
#' @export
125+
delayedAssign("case_death_rate_archive", epidatasets::case_death_rate_archive)
126+
127+
#' @inherit epidatasets::archive_cases_dv_subset_all_states description source references title
128+
#' @inheritSection epidatasets::archive_cases_dv_subset_all_states Data dictionary
129+
#' @examples
130+
#' # Since this is a re-exported dataset, it cannot be loaded using
131+
#' # the `data()` function. `data()` looks for a file of the same name
132+
#' # in the `data/` directory, which doesn't exist in this package.
133+
#' # works
134+
#' epipredict::archive_cases_dv_subset_all_states
135+
#'
136+
#' # works
137+
#' library(epipredict)
138+
#' archive_cases_dv_subset_all_states
139+
#'
140+
#' # fails
141+
#' data(archive_cases_dv_subset_all_states, package = "epipredict")
142+
#' @export
143+
delayedAssign("archive_cases_dv_subset_all_states", epidatasets::archive_cases_dv_subset_all_states)

inst/extdata/can_prov_cases.rds

-155 KB
Binary file not shown.

inst/extdata/canada-case-rates.R

-23
This file was deleted.

inst/extdata/epi_archive.rds

-479 KB
Binary file not shown.
-1.25 MB
Binary file not shown.

vignettes/articles/sliding.Rmd

+56-28
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,6 @@ knitr::opts_chunk$set(
1414

1515
```{r pkgs}
1616
library(epipredict)
17-
library(epidatr)
1817
library(data.table)
1918
library(dplyr)
2019
library(tidyr)
@@ -60,25 +59,54 @@ claims and the number of new confirmed COVID-19 cases per 100,000 population
6059

6160
<summary>Load a data archive</summary>
6261

63-
We process as before, with the
64-
modification that we use `sync = locf` in `epix_merge()` so that the last
65-
version of each observation can be carried forward to extrapolate unavailable
66-
versions for the less up-to-date input archive.
62+
This dataset is processed as before, with the modification that we use `sync =
63+
"locf"` in `epix_merge()` so that the last version of each observation can be
64+
carried forward to extrapolate unavailable versions for the less up-to-date
65+
input archive.
6766

6867
```{r grab-epi-data}
6968
theme_set(theme_bw())
7069
71-
y <- readRDS("all_states_covidcast_signals.rds")
70+
x <- archive_cases_dv_subset_all_states
71+
```
72+
73+
The data can also be constructed from the Delphi API with the following code:
7274

73-
y <- purrr::map(y, ~ select(.x, geo_value, time_value, version = issue, value))
75+
```{r generate-data, eval=FALSE}
76+
library(epidatr)
7477
75-
x <- epix_merge(
76-
y[[1]] %>% rename(percent_cli = value) %>% as_epi_archive(compactify = FALSE),
77-
y[[2]] %>% rename(case_rate = value) %>% as_epi_archive(compactify = FALSE),
78+
dv_subset <- pub_covidcast(
79+
source = "doctor-visits",
80+
signals = "smoothed_adj_cli",
81+
time_type = "day",
82+
geo_type = "state",
83+
time_values = epirange(20200601, 20211201),
84+
geo_values = "*",
85+
issues = epirange(20200601, 20211201)
86+
) %>%
87+
select(geo_value, time_value, version = issue, percent_cli = value) %>%
88+
# We're using compactify=FALSE here and below to avoid some testthat test
89+
# failures on tests that were based on a non-compactified version.
90+
as_epi_archive(compactify = FALSE)
91+
92+
case_rate_subset <- pub_covidcast(
93+
source = "jhu-csse",
94+
signals = "confirmed_7dav_incidence_prop",
95+
time_type = "day",
96+
geo_type = "state",
97+
time_values = epirange(20200601, 20211201),
98+
geo_values = "*",
99+
issues = epirange(20200601, 20211201)
100+
) %>%
101+
select(geo_value, time_value, version = issue, case_rate_7d_av = value) %>%
102+
as_epi_archive(compactify = FALSE)
103+
104+
# Use `epiprocess::epix_merge` to avoid having to reimplement `sync`ing
105+
# behavior. After merging, convert DT component back to tibble.
106+
archive_cases_dv_subset_all_states_dt = epix_merge(
107+
dv_subset, case_rate_subset,
78108
sync = "locf",
79-
compactify = TRUE
80-
)
81-
rm(y)
109+
compactify = TRUE)
82110
```
83111

84112
</details>
@@ -217,11 +245,7 @@ the American data, but here we compare the forecasts produced from using simple
217245
linear regression with those from using boosted regression trees.
218246

219247
```{r get-can-fc, warning = FALSE}
220-
# source("drafts/canada-case-rates.R)
221-
can <- readRDS(system.file(
222-
"extdata", "can_prov_cases.rds",
223-
package = "epipredict", mustWork = TRUE
224-
))
248+
can <- can_prov_cases
225249
226250
can <- can %>%
227251
group_by(version, geo_value) %>%
@@ -325,9 +349,20 @@ combined data from all US states and territories) to train our model.
325349

326350
<details>
327351

328-
<summary>Download data using `{epidatr}`</summary>
329-
```{r load-data, eval=FALSE}
330-
# loading in the data
352+
<summary>Fetch data</summary>
353+
The data are included in this package (via the
354+
[`epidatasets` package](https://cmu-delphi.github.io/epidatasets/)),
355+
and can be loaded with:
356+
357+
```{r load-data, message = FALSE, warning = FALSE}
358+
x <- case_death_rate_archive
359+
```
360+
361+
The data can also be constructed from the Delphi API with the following code:
362+
363+
```{r generate-case-death-data, eval=FALSE}
364+
library(epidatr)
365+
331366
states <- "*"
332367
333368
confirmed_incidence_prop <- pub_covidcast(
@@ -384,13 +419,6 @@ x <- x %>%
384419
death_rate_7d_av = slide_value_death_rate_7d_av
385420
) %>%
386421
as_epi_archive(compactify = TRUE)
387-
388-
saveRDS(x$DT, file = "case_death_rate_archive.rds")
389-
```
390-
391-
```{r load-stored-data}
392-
x <- readRDS("case_death_rate_archive.rds")
393-
x <- as_epi_archive(x)
394422
```
395423
</details>
396424

vignettes/articles/symptom-surveys.Rmd

+39-14
Original file line numberDiff line numberDiff line change
@@ -145,22 +145,40 @@ own forecaster under the `epipredict` framework, we could easily add steps to
145145
re-scale and transform the signals to our `epi_recipe`. This would make the code
146146
more succinct and self-contained.
147147

148+
We will compare two CLI-in-community indicators from
149+
different sources. The data are included in this package (via the
150+
[`epidatasets` package](https://cmu-delphi.github.io/epidatasets/)),
151+
and can be loaded with:
152+
148153
```{r, message = FALSE, warning = FALSE}
149-
library(epidatr)
150154
library(dplyr)
151155
library(purrr)
152156
library(epipredict)
153157
library(recipes)
154158
159+
z <- county_smoothed_cli_comparison
160+
```
161+
162+
The data can also be constructed from the Delphi API with the following code:
163+
164+
```{r, message = FALSE, warning = FALSE, eval = FALSE}
165+
library(epidatr)
166+
167+
d <- "2020-09-21"
168+
155169
case_num <- 200
156-
as_of_date <- "2020-05-14"
157-
geo_values <- pub_covidcast(
170+
geos_date <- "2020-05-14"
171+
172+
# Find counties that on 2020-05-14 had >= 200 cases reported.
173+
# For later datasets, we will only keep data for these geos.
174+
geo_values_initial <- pub_covidcast(
158175
source = "jhu-csse",
159176
signals = "confirmed_cumulative_num",
160177
geo_type = "county",
161178
time_type = "day",
162179
geo_values = "*",
163-
time_values = epirange(20200514, 20200514)
180+
time_values = epirange(geos_date, geos_date),
181+
as_of = d
164182
) %>%
165183
filter(value >= case_num) %>%
166184
pull(geo_value) %>%
@@ -177,9 +195,10 @@ goog_sm_cli <- pub_covidcast(
177195
geo_type = "county",
178196
time_type = "day",
179197
geo_values = "*",
180-
time_values = epirange(start_day, end_day)
198+
time_values = epirange(start_day, end_day),
199+
as_of = d
181200
) %>%
182-
filter(geo_value %in% geo_values) %>%
201+
filter(geo_value %in% geo_values_initial) %>%
183202
select(geo_value, time_value, value) %>%
184203
rename(goog = value)
185204
@@ -189,9 +208,10 @@ fb_survey <- pub_covidcast(
189208
geo_type = "county",
190209
time_type = "day",
191210
geo_values = "*",
192-
time_values = epirange(start_day, end_day)
211+
time_values = epirange(start_day, end_day),
212+
as_of = d
193213
) %>%
194-
filter(geo_value %in% geo_values) %>%
214+
filter(geo_value %in% geo_values_initial) %>%
195215
select(geo_value, time_value, value) %>%
196216
rename(fb = value)
197217
@@ -201,26 +221,31 @@ jhu_7dav_incid <- pub_covidcast(
201221
geo_type = "county",
202222
time_type = "day",
203223
geo_values = "*",
204-
time_values = epirange(start_day, end_day)
224+
time_values = epirange(start_day, end_day),
225+
as_of = d
205226
) %>%
206-
filter(geo_value %in% geo_values) %>%
227+
filter(geo_value %in% geo_values_initial) %>%
207228
select(geo_value, time_value, value) %>%
208229
rename(case = value)
209230
210-
# Find "complete" counties, present in all three data signals at all times
231+
# Find "complete" counties, present in all three data signals, and also
232+
# present in the `geo_values_initial` object.
211233
geo_values_complete <- intersect(
212234
intersect(goog_sm_cli$geo_value, fb_survey$geo_value),
213235
jhu_7dav_incid$geo_value
214236
)
215237
216-
# Make one big matrix by joining these three data frames
217-
z <- full_join(full_join(goog_sm_cli, fb_survey, by = c("geo_value", "time_value")),
238+
# Join the three data frames together
239+
z <- full_join(
240+
full_join(goog_sm_cli, fb_survey, by = c("geo_value", "time_value")),
218241
jhu_7dav_incid,
219242
by = c("geo_value", "time_value")
220243
) %>%
221244
filter(geo_value %in% geo_values_complete) %>%
222-
as_epi_df()
245+
as_epi_df(as_of = d)
246+
```
223247

248+
```{r, message = FALSE, warning = FALSE}
224249
Logit <- function(x, a = 0.01) log((x + a) / (1 - x + a))
225250
Sigmd <- function(y, a = 0.01) (exp(y) * (1 + a) - a) / (1 + exp(y))
226251

0 commit comments

Comments
 (0)