Skip to content

Commit 99c30c6

Browse files
authored
Merge pull request #115 from cmu-delphi/ml-99-panel-data-vignette
Panel Data Vignette (Issue 99)
2 parents 08bb5ee + 55e8166 commit 99c30c6

10 files changed

+675
-10
lines changed

R/canned-epipred.R

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -73,11 +73,17 @@ print.canned_epipred <- function(x, name, ...) {
7373
)
7474
cli::cli_text("")
7575
cli::cli_text("Training data was an {.cls epi_df} with:")
76-
cli::cli_ul(c(
77-
"Geography: {.field {x$metadata$training$geo_type}},",
78-
"Time type: {.field {x$metadata$training$time_type}},",
79-
"Using data up-to-date as of: {.field {format(x$metadata$training$as_of)}}."
80-
))
76+
# Print the training-metadata bullets. `x` is the object passed to the
# enclosing print method. The bullets are emitted one at a time (instead of
# as a single character vector) so that the "Other keys" entry can be left
# out when the training metadata has no `other_keys`.
fn_meta <- function() {
  has_other_keys <- !is.null(x$metadata$training$other_keys)

  cli::cli_ul()
  cli::cli_li("Geography: {.field {x$metadata$training$geo_type}},")
  if (has_other_keys) {
    cli::cli_li("Other keys: {.field {x$metadata$training$other_keys}},")
  }
  cli::cli_li("Time type: {.field {x$metadata$training$time_type}},")
  cli::cli_li(
    "Using data up-to-date as of: {.field {format(x$metadata$training$as_of)}}."
  )
  cli::cli_end()
}
fn_meta()
8187
cli::cli_text("")
8288

8389
cli::cli_rule("Predictions")

R/data.R

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,3 +56,32 @@
5656
#' \url{https://www.census.gov/data/tables/time-series/demo/popest/2010s-total-puerto-rico-municipios.html},
5757
#' and \url{https://www.census.gov/data/tables/2010/dec/2010-island-areas.html}
5858
"state_census"
59+
60+
#' Subset of Statistics Canada median employment income for postsecondary graduates
61+
#'
62+
#' @format An [epiprocess::epi_df] with 10193 rows and 8 variables:
63+
#' \describe{
64+
#' \item{geo_value}{The province in Canada associated with each
65+
#' row of measurements.}
66+
#' \item{time_value}{The time value, a year integer in YYYY format}
67+
#' \item{edu_qual}{The education qualification}
68+
#' \item{fos}{The field of study}
69+
#' \item{age_group}{The age group; either 15 to 34 or 35 to 64}
70+
#' \item{num_graduates}{The number of graduates for the given row of characteristics}
71+
#' \item{med_income_2y}{The median employment income two years after graduation}
72+
#' \item{med_income_5y}{The median employment income five years after graduation}
73+
#' }
74+
#' @source This object contains modified data from the following Statistics Canada
75+
#' data table: \href{https://www150.statcan.gc.ca/t1/tbl1/en/tv.action?pid=3710011501}{
76+
#' Characteristics and median employment income of longitudinal cohorts of postsecondary
77+
#' graduates two and five years after graduation, by educational qualification and
78+
#' field of study (primary groupings)
79+
#' }
80+
#'
81+
#' Modifications:
82+
#' * Only provincial-level geo_values are kept
83+
#' * Only age group, field of study, and educational qualification are kept as
84+
#' covariates. For the remaining covariates, we keep aggregated values and
85+
#' drop the level-specific rows.
86+
#' * No modifications were made to the time range of the data
87+
"grad_employ_subset"

_pkgdown.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,7 @@ reference:
105105
contents:
106106
- case_death_rate_subset
107107
- state_census
108+
- grad_employ_subset
108109

109110

110111

data-raw/grad_employ_subset.R

Lines changed: 106 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,106 @@
1+
# Builds the `grad_employ_subset` dataset from Statistics Canada table
# 37-10-0115-01 and saves it into the package with usethis::use_data().
library(epipredict)
library(epiprocess)
library(cansim)
library(dplyr)
library(stringr)
library(tidyr)

# Source table:
# https://www150.statcan.gc.ca/t1/tbl1/en/tv.action?pid=3710011501
statcan_grad_employ <- get_cansim("37-10-0115-01")

gemploy <- statcan_grad_employ %>%
  # Keep and rename, in one step, only the columns used below. Everything
  # else is left out: DGUID, UOM, UOM_ID, SCALAR_FACTOR, SCALAR_ID, VECTOR,
  # COORDINATE, SYMBOL, TERMINATED, DECIMALS, GeoUID, val_norm, Date, and
  # the "Hierarchy for ..." / "Classification Code for ..." columns.
  select(
    time_value = "REF_DATE",
    geo_value = "GEO",
    value = "VALUE",
    status = "STATUS",
    edu_qual = "Educational qualification",
    fos = "Field of study",
    gender = "Gender",
    age_group = "Age group",
    student_status = "Status of student in Canada",
    grad_charac = "Characteristics after graduation",
    grad_stat = "Graduate statistics"
  ) %>%
  mutate(
    # REF_DATE holds a year; store it as an integer
    time_value = as.integer(time_value),
    # Short names that become column names after pivoting
    grad_stat = recode_factor(
      grad_stat,
      `Number of graduates` = "num_graduates",
      `Median employment income two years after graduation` = "med_income_2y",
      `Median employment income five years after graduation` = "med_income_5y"
    )
  ) %>%
  # One column per graduate statistic
  pivot_wider(names_from = grad_stat, values_from = value) %>%
  filter(
    # Drop the aggregate level of the columns we keep as keys
    geo_value != "Canada",
    age_group != "15 to 64 years",
    edu_qual != "Total, educational qualification",
    # Keep only the aggregate level of the columns we are about to drop
    fos == "Total, field of study",
    gender == "Total, gender",
    student_status == "Canadian and international students",
    # For 2y and 5y employment income the only characteristics left are
    #   - Graduates reporting employment income
    #   - Graduates reporting wages, salaries, and commissions only
    # For simplicity, keep the first one only
    grad_charac == "Graduates reporting employment income",
    # Only keep "good" data, i.e. rows without a status flag
    is.na(status),
    # Require all three statistics to be present
    !is.na(num_graduates),
    !is.na(med_income_2y),
    !is.na(med_income_5y)
  ) %>%
  select(-status, -gender, -student_status, -grad_charac, -fos)

# Dimensions of the filtered table, for a quick check when run interactively
nrow(gemploy)
ncol(gemploy)

grad_employ_subset <- as_epi_df(
  gemploy,
  as_of = "2022-07-19",
  additional_metadata = list(other_keys = c("age_group", "edu_qual"))
)
usethis::use_data(grad_employ_subset, overwrite = TRUE)

data/grad_employ_subset.rda

8.29 KB
Binary file not shown.

man/grad_employ_subset.Rd

Lines changed: 44 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

vignettes/.gitignore

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11
*.html
2-
*.R
32
*_cache/
3+
*.R

vignettes/articles/sliding.Rmd

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -61,10 +61,7 @@ versions for the less up-to-date input archive.
6161
```{r grab-epi-data}
6262
theme_set(theme_bw())
6363
64-
y <- readRDS(system.file(
65-
"extdata", "all_states_covidcast_signals.rds",
66-
package = "epipredict", mustWork = TRUE
67-
))
64+
y <- readRDS("all_states_covidcast_signals.rds")
6865
6966
y <- purrr::map(y, ~ select(.x, geo_value, time_value, version = issue, value))
7067

0 commit comments

Comments
 (0)