Skip to content

Commit ca86f03

Browse files
authored
Merge pull request #3 from cmu-delphi/ndefries/make-data-gen-scripts-run
Load libraries in data-generating scripts
2 parents cc8f2a0 + 472dfd9 commit ca86f03

15 files changed

+277
-80
lines changed

DESCRIPTION

+4-3
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ Authors@R:
66
person(c("Daniel", "J."), "McDonald", , "[email protected]", role = c("cre", "aut"))
77
Description: This package contains data sets used to compile vignettes and
88
other documentation in Delphi R Packages. The goal is to avoid calls
9-
to the Delphi Epidata API, and deposit some examples here for easy
9+
to the Delphi Epidata API, and to deposit some examples here for easy
1010
offline use.
1111
License: MIT + file LICENSE
1212
Depends:
@@ -15,15 +15,16 @@ Suggests:
1515
covidcast,
1616
dplyr,
1717
epidatr,
18-
epipredict,
18+
epiprocess,
1919
here,
20+
httr,
21+
jsonlite,
2022
lubridate,
2123
magrittr,
2224
purrr,
2325
readr
2426
Remotes:
2527
cmu-delphi/epidatr,
26-
cmu-delphi/epipredict,
2728
cmu-delphi/epiprocess
2829
Encoding: UTF-8
2930
LazyData: true

R/epipredict-data.R

+23
Original file line numberDiff line numberDiff line change
@@ -80,3 +80,26 @@
8080
#' by the Johns Hopkins University on behalf of its Center for Systems Science in Engineering.
8181
#' Copyright Johns Hopkins University 2020.
8282
"counts_subset"
83+
84+
#' Canadian COVID-19 case rates
85+
#'
86+
#' Data set for all Canadian provinces and territories containing COVID-19
87+
#' case rates (COVID-19 cases per 100,000 people) derived from COVID-19 case
88+
#' counts as reported by the
89+
#' \href{https://opencovid.ca/}{COVID-19 Canada Open Data Working Group (CCODWG)}.
90+
#' Data is available both through the archived
91+
#' \href{https://github.com/ccodwg/Covid19Canada}{ccodwg/Covid19Canada GitHub repository}
92+
#' and the newer
93+
#' \href{https://github.com/ccodwg/CovidTimelineCanada}{ccodwg/CovidTimelineCanada GitHub repository},
94+
#' which also reports vaccine-related signals.
95+
#'
96+
#' This dataset contains versioned data covering the period from April 2020 to
97+
#' December 2021 and is used in the [epipredict] slide vignette.
98+
#'
99+
#' @source This object contains a modified part of the COVID-19 Canada Open
100+
#' Data Working Group's
101+
#' \href{https://github.com/ccodwg/Covid19Canada}{Covid19Canada data repository} (archived).
102+
#' This data set is licensed under the terms of the
103+
#' \href{https://creativecommons.org/licenses/by/4.0/}{Creative Commons Attribution 4.0 International license}
104+
#' by the COVID-19 Canada Open Data Working Group.
105+
"can_prov_cases"

data-raw/archive_cases_dv_subset.R renamed to data-raw/archive_cases_dv_subset_dt.R

+8-6
Original file line numberDiff line numberDiff line change
@@ -1,28 +1,30 @@
1-
dv_subset <- covidcast(
2-
data_source = "doctor-visits",
1+
library(dplyr)
2+
library(epidatr)
3+
library(epiprocess)
4+
5+
dv_subset <- pub_covidcast(
6+
source = "doctor-visits",
37
signals = "smoothed_adj_cli",
48
time_type = "day",
59
geo_type = "state",
610
time_values = epirange(20200601, 20211201),
711
geo_values = "ca,fl,ny,tx",
812
issues = epirange(20200601, 20211201)
913
) %>%
10-
fetch() %>%
1114
select(geo_value, time_value, version = issue, percent_cli = value) %>%
1215
# We're using compactify=FALSE here and below to avoid some testthat test
1316
# failures on tests that were based on a non-compactified version.
1417
as_epi_archive(compactify = FALSE)
1518

16-
case_rate_subset <- covidcast(
17-
data_source = "jhu-csse",
19+
case_rate_subset <- pub_covidcast(
20+
source = "jhu-csse",
1821
signals = "confirmed_7dav_incidence_prop",
1922
time_type = "day",
2023
geo_type = "state",
2124
time_values = epirange(20200601, 20211201),
2225
geo_values = "ca,fl,ny,tx",
2326
issues = epirange(20200601, 20211201)
2427
) %>%
25-
fetch() %>%
2628
select(geo_value, time_value, version = issue, case_rate_7d_av = value) %>%
2729
as_epi_archive(compactify = FALSE)
2830

data-raw/can_prov_cases.R

+148
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,148 @@
1+
## code to prepare `can_prov_cases` dataset goes here
2+
3+
library(dplyr)
4+
library(epiprocess)
5+
library(readr)
6+
library(purrr)
7+
library(httr)
8+
library(jsonlite)
9+
10+
11+
# Look for a GitHub API token.
12+
# Returns an empty string "" if env variable not found.
13+
gh_token <- Sys.getenv("GITHUB_PAT")
14+
if (gh_token == "") {
15+
# Try again with the secondary name.
16+
gh_token <- Sys.getenv("GITHUB_TOKEN")
17+
}
18+
if (gh_token == "") {
19+
warning("Token is not set or is not able to be fetched from the environment.",
20+
" Proceeding without authentication, but the requests may be blocked",
21+
" due to GitHub API rate limits.")
22+
}
23+
24+
# Construct a header to send with GET requests
25+
if (gh_token == "") {
26+
# Empty header
27+
auth_header <- httr::add_headers()
28+
} else {
29+
auth_header <- httr::add_headers(Authorization = paste("Bearer", gh_token))
30+
}
31+
32+
## Get list of new and modified files to download
33+
# The `path` field filters commits to only those that modify the listed file
34+
# From https://www.github.com/ccodwg/Covid19Canada
35+
BASE_URL <- "https://api.github.com/repos/ccodwg/Covid19Canada/commits?sha=%s&per_page=%s&path=timeseries_prov/cases_timeseries_prov.csv&until=%s&page=%s"
36+
ITEMS_PER_PAGE <- 100
37+
BRANCH <- "master"
38+
39+
40+
41+
# We want to fetch all commits made up to Mar 13 2022 (version the original
42+
# dataset was created from).
43+
#
44+
# Timestamp should be in ISO 8601 format. See
45+
# https://docs.github.com/en/rest/reference/commits#list-commits--parameters for
46+
# details.
47+
since_date <- strftime("2022-03-13", "%Y-%m-%dT%H:%M:%SZ", tz = "UTC")
48+
49+
page <- 0
50+
commit_pages <- list()
51+
52+
# Fetch list of commits from API, one page at a time. Each page contains up to
53+
# 100 commits. If a page contains 100 commits, assume that there are more
54+
# results and fetch the next page.
55+
while (page == 0 || nrow(commit_page) == 100) {
56+
page <- page + 1
57+
# Construct the URL
58+
commits_url <- sprintf(BASE_URL, BRANCH, ITEMS_PER_PAGE, since_date, page)
59+
60+
request <- GET(commits_url, auth_header)
61+
# Convert any HTTP errors to R errors automatically.
62+
stop_for_status(request)
63+
64+
# Convert results from nested JSON/list to dataframe. If no results returned,
65+
# `commit_page` will be an empty list.
66+
commit_page <- content(request, as = "text") %>%
67+
fromJSON(simplifyDataFrame = TRUE, flatten = TRUE) %>%
68+
# Trim message down a bit.
69+
mutate(message = substr(commit.message, 1, 40)) %>%
70+
select(sha, url = commit.url, message)
71+
72+
# No more results are being returned.
73+
if (identical(commit_page, list())) {
74+
break
75+
}
76+
77+
commit_pages[[page]] <- commit_page
78+
}
79+
80+
# Combine all requested pages of commits into one dataframe
81+
commit_pages <- bind_rows(commit_pages)
82+
83+
# Missing value `%s` to be filled in with a commit sha or branch name.
84+
BASE_DATA_URL <- "https://raw.githubusercontent.com/ccodwg/Covid19Canada/%s/timeseries_prov/cases_timeseries_prov.csv"
85+
86+
fc_time_values <- seq(as.Date("2021-02-01"), as.Date("2021-12-01"),
87+
by = "1 month")
88+
commit_pages <- mutate(
89+
commit_pages,
90+
data_url = sprintf(BASE_DATA_URL, sha),
91+
date = strsplit(message, " ") %>% map_chr(~ substr(.x[3], start=1, stop=10)) %>% as.Date()
92+
) %>%
93+
# select(data_url, date) %>%
94+
na.omit() %>%
95+
filter(date %in% fc_time_values)
96+
97+
# From https://github.com/mountainMath/BCCovidSnippets/blob/main/data/prov_pop.csv
98+
ca_pop_url <- "https://raw.githubusercontent.com/mountainMath/BCCovidSnippets/main/data/prov_pop.csv"
99+
ca_pop <- read_csv(
100+
ca_pop_url,
101+
col_types = cols(
102+
Province = col_character(),
103+
shortProvince = col_character(),
104+
Population = col_integer()
105+
)
106+
) %>%
107+
rename(province = Province, abbreviation = shortProvince, population = Population)
108+
abbrev_map <- setNames(ca_pop$province, ca_pop$abbreviation)
109+
110+
# Read in data and convert to `epi_df`s.
111+
can_prov_cases <- purrr::map2(commit_pages$data_url, commit_pages$date, function(url, date) {
112+
raw <- readr::read_csv(
113+
url,
114+
col_types = cols(
115+
province = col_character(),
116+
date_report = col_character(),
117+
cases = col_double(),
118+
cumulative_cases = col_double()
119+
)
120+
)
121+
122+
# Raw data uses a mix of full names and abbreviations. Switch to using only full names.
123+
raw$province <- case_when(
124+
raw$province == "NWT" ~ abbrev_map["NT"],
125+
raw$province == "PEI" ~ abbrev_map["PE"],
126+
raw$province %in% ca_pop$province ~ raw$province,
127+
raw$province %in% ca_pop$abbreviation ~ abbrev_map[raw$province],
128+
# Mark everything else as missing. Only applies to "Repatriated" region.
129+
TRUE ~ NA
130+
)
131+
132+
result <- raw %>%
133+
mutate(time_value = lubridate::dmy(date_report)) %>%
134+
left_join(ca_pop, by="province") %>%
135+
filter(!is.na(province), time_value > "2020-04-01") %>%
136+
mutate(geo_value = province,
137+
case_rate = cases / population * 1e5) %>%
138+
select(geo_value, time_value, case_rate) %>%
139+
as_epi_df(geo_type = "province", as_of = date)
140+
141+
return(result)
142+
})
143+
names(can_prov_cases) <- commit_pages$date
144+
can_prov_cases <- can_prov_cases %>% bind_rows(.id = "version") %>%
145+
mutate(version = lubridate::ymd(version)) %>%
146+
arrange(version)
147+
148+
usethis::use_data(can_prov_cases, overwrite = TRUE)

data-raw/cancovid.R

-25
This file was deleted.

data-raw/cases_deaths_subset.R

+12-12
Original file line numberDiff line numberDiff line change
@@ -1,48 +1,48 @@
1-
confirmed_7dav_incidence_prop <- covidcast(
2-
data_source = "jhu-csse",
1+
library(dplyr)
2+
library(epidatr)
3+
library(epiprocess)
4+
5+
confirmed_7dav_incidence_prop <- pub_covidcast(
6+
source = "jhu-csse",
37
signals = "confirmed_7dav_incidence_prop",
48
time_type = "day",
59
geo_type = "state",
610
time_values = epirange(20200301, 20211231),
711
geo_values = "ca,fl,ny,tx,ga,pa"
812
) %>%
9-
fetch() %>%
1013
select(geo_value, time_value, case_rate_7d_av = value) %>%
1114
arrange(geo_value, time_value)
1215

13-
deaths_7dav_incidence_prop <- covidcast(
14-
data_source = "jhu-csse",
16+
deaths_7dav_incidence_prop <- pub_covidcast(
17+
source = "jhu-csse",
1518
signals = "deaths_7dav_incidence_prop",
1619
time_type = "day",
1720
geo_type = "state",
1821
time_values = epirange(20200301, 20211231),
1922
geo_values = "ca,fl,ny,tx,ga,pa"
2023
) %>%
21-
fetch() %>%
2224
select(geo_value, time_value, death_rate_7d_av = value) %>%
2325
arrange(geo_value, time_value)
2426

25-
confirmed_incidence_num <- covidcast(
26-
data_source = "jhu-csse",
27+
confirmed_incidence_num <- pub_covidcast(
28+
source = "jhu-csse",
2729
signals = "confirmed_incidence_num",
2830
time_type = "day",
2931
geo_type = "state",
3032
time_values = epirange(20200301, 20211231),
3133
geo_values = "ca,fl,ny,tx,ga,pa"
3234
) %>%
33-
fetch() %>%
3435
select(geo_value, time_value, cases = value) %>%
3536
arrange(geo_value, time_value)
3637

37-
confirmed_7dav_incidence_num <- covidcast(
38-
data_source = "jhu-csse",
38+
confirmed_7dav_incidence_num <- pub_covidcast(
39+
source = "jhu-csse",
3940
signals = "confirmed_7dav_incidence_num",
4041
time_type = "day",
4142
geo_type = "state",
4243
time_values = epirange(20200301, 20211231),
4344
geo_values = "ca,fl,ny,tx,ga,pa"
4445
) %>%
45-
fetch() %>%
4646
select(geo_value, time_value, cases_7d_av = value) %>%
4747
arrange(geo_value, time_value)
4848

data-raw/counts_subset.R

+8-6
Original file line numberDiff line numberDiff line change
@@ -1,23 +1,25 @@
11
## code to prepare `counts_subset` dataset goes here
22

3-
x <- covidcast(
4-
data_source = "jhu-csse",
3+
library(dplyr)
4+
library(epidatr)
5+
library(epiprocess)
6+
7+
x <- pub_covidcast(
8+
source = "jhu-csse",
59
signals = "confirmed_incidence_num",
610
time_type = "day",
711
geo_type = "state",
812
time_values = epirange(20210604, 20211231),
913
geo_values = "ca,fl,tx,ny,nj") %>%
10-
fetch() %>%
1114
select(geo_value, time_value, cases = value)
1215

13-
y <- covidcast(
14-
data_source = "jhu-csse",
16+
y <- pub_covidcast(
17+
source = "jhu-csse",
1518
signals = "deaths_incidence_num",
1619
time_type = "day",
1720
geo_type = "state",
1821
time_values = epirange(20210604, 20211231),
1922
geo_values = "ca,fl,tx,ny,nj") %>%
20-
fetch() %>%
2123
select(geo_value, time_value, deaths = value)
2224

2325
counts_subset <- full_join(x, y, by = c("geo_value", "time_value")) %>%

data-raw/covid_case_death_rates.R

+10-8
Original file line numberDiff line numberDiff line change
@@ -1,27 +1,29 @@
1-
x <- covidcast(
2-
data_source = "jhu-csse",
1+
library(dplyr)
2+
library(epidatr)
3+
library(epiprocess)
4+
5+
x <- pub_covidcast(
6+
source = "jhu-csse",
37
signals = "confirmed_7dav_incidence_prop",
48
time_type = "day",
59
geo_type = "state",
610
time_values = epirange(20200301, 20211231),
711
geo_values = "*"
812
) %>%
9-
fetch() %>%
1013
select(geo_value, time_value, case_rate = value)
1114

12-
y <- covidcast(
13-
data_source = "jhu-csse",
15+
y <- pub_covidcast(
16+
source = "jhu-csse",
1417
signals = "deaths_7dav_incidence_prop",
1518
time_type = "day",
1619
geo_type = "state",
1720
time_values = epirange(20200301, 20211231),
1821
geo_values = "*"
1922
) %>%
20-
fetch() %>%
2123
select(geo_value, time_value, death_rate = value)
2224

23-
case_death_rate_subset <- x %>%
25+
covid_case_death_rates <- x %>%
2426
full_join(y, by = c("geo_value", "time_value")) %>%
2527
as_epi_df()
2628

27-
usethis::use_data(case_death_rate_subset, overwrite = TRUE)
29+
usethis::use_data(covid_case_death_rates, overwrite = TRUE)

0 commit comments

Comments
 (0)