@@ -20,13 +20,12 @@ knitr::opts_chunk$set(
20
20
library(epiprocess)
21
21
library(epipredict)
22
22
library(dplyr)
23
- library(stringr)
24
23
library(parsnip)
25
24
library(recipes)
26
25
```
27
26
28
27
[Panel data](https://en.wikipedia.org/wiki/Panel_data), or longitudinal data,
29
- contains cross-sectional measurements of subjects over time. The ` epipredict `
28
+ contain cross-sectional measurements of subjects over time. The ` epipredict `
30
29
package is most suitable for running forecasters on epidemiological panel data.
31
30
A built-in example of this is the [ ` case_death_rate_subset ` ] (
32
31
https://cmu-delphi.github.io/epipredict/reference/case_death_rate_subset.html )
@@ -41,8 +40,8 @@ head(case_death_rate_subset)
41
40
https://cmu-delphi.github.io/epiprocess/reference/epi_df.html )
42
41
format. Despite the stated goal and name of the package, other panel datasets
43
42
are also valid candidates for ` epipredict ` functionality. Specifically, the
44
- ` epipredict ` framework and direct forecasters are able to work with any panel
45
- data, as long as it's in ` epi_df ` format,
43
+ ` epipredict ` framework and direct forecasters can work with any panel data, as
44
+ long as it's in ` epi_df ` format.
46
45
47
46
``` {r employ-stats, include=F}
48
47
year_start <- min(grad_employ_subset$time_value)
@@ -59,8 +58,9 @@ from Statistics Canada. We will be using
59
58
graduation, by educational qualification and field of study (primary
60
59
groupings)
61
60
] ( https://www150.statcan.gc.ca/t1/tbl1/en/tv.action?pid=3710011501 ) .
61
+
62
62
The full dataset contains yearly median employment income two and five years
63
- after graduation, and number of graduates. The data is further stratified by
63
+ after graduation, and the number of graduates. The data is stratified by
64
64
variables such as geographic region (Canadian province), field of study, and
65
65
age group. The year range of the dataset is ` r year_start ` to ` r year_end ` ,
66
66
inclusive. The full dataset also contains metadata that describes the
@@ -80,8 +80,11 @@ just described:
80
80
``` {r employ-query, eval=F}
81
81
library(cansim)
82
82
83
+ # Get original dataset
83
84
statcan_grad_employ <- get_cansim("37-10-0115-01")
85
+
84
86
gemploy <- statcan_grad_employ %>%
87
+ # Drop some columns and rename the ones we keep
85
88
select(c("REF_DATE", "GEO", "VALUE", "STATUS", "Educational qualification",
86
89
"Field of study", "Gender", "Age group", "Status of student in Canada",
87
90
"Characteristics after graduation", "Graduate statistics")) %>%
@@ -101,11 +104,13 @@ gemploy <- statcan_grad_employ %>%
101
104
# `Graduate statistics` in the original data. Below we pivot the data
102
105
# wider so that each unique statistic can have its own column.
103
106
mutate(
107
+ # Recode for easier pivoting
104
108
grad_stat = recode_factor(
105
109
grad_stat,
106
110
`Number of graduates` = "num_graduates",
107
111
`Median employment income two years after graduation` = "med_income_2y",
108
112
`Median employment income five years after graduation` = "med_income_5y"),
113
+ # time_value is originally a string, but we want integers for conversion to epi_df later
109
114
time_value = as.integer(time_value)
110
115
) %>%
111
116
pivot_wider(names_from = grad_stat, values_from = value) %>%
@@ -137,7 +142,11 @@ using [`as_epi_df`](
137
142
with additional keys. In our case, the additional keys are ` age_group ` , ` fos `
138
143
and ` edu_qual ` . Note that in the above modifications, we encoded ` time_value `
139
144
as type ` integer ` . This allows us to set ` time_type ` to ` "year" ` , and to ensure
140
- lag and ahead modifications later on are using the correct time units.
145
+ lag and ahead modifications later on are using the correct time units. See the
146
+ [ ` epi_df ` documentation] (
147
+ https://cmu-delphi.github.io/epiprocess/reference/epi_df.html#time-types ) for
148
+ a list of all the ` time_type ` s available.
149
+
141
150
142
151
``` {r convert-to-epidf, eval=F}
143
152
grad_employ_subset <- gemploy %>%
@@ -235,24 +244,25 @@ out_fl <- flatline_forecaster(employ, "med_income_2y",
235
244
augment(out_fl$epi_workflow, employ)
236
245
```
237
246
238
- ``` {r arx, include=T}
247
+ ``` {r arx-lr , include=T}
239
248
arx_args <- arx_args_list(
240
- lags = c(0L, 1L, 2L), ahead = 1L, forecast_date = as.Date("2022-08-01"))
241
- out_arx <- arx_forecaster(employ, "med_income_2y",
249
+ lags = c(0L, 1L), ahead = 1L, forecast_date = as.Date("2022-08-01"))
250
+
251
+ out_arx_lr <- arx_forecaster(employ, "med_income_2y",
242
252
c("med_income_2y", "med_income_5y", "num_graduates"),
243
253
args_list = arx_args)
244
254
245
- out_arx $predictions
255
+ out_arx_lr $predictions
246
256
```
247
257
248
258
Other changes to the direct AR forecaster, like changing the engine, also work
249
259
as expected.
250
260
251
- ``` {r arx-epi- rf, include=F, warning=F}
252
- out_rf <- arx_forecaster(
253
- employ, "med_income_2y", c("med_income_2y", "med_income_5y"),
254
- trainer = parsnip::rand_forest (mode= "regression", trees=100 ),
255
- args_list = args )
261
+ ``` {r arx-rf, include=F, warning=F}
262
+ out_arx_rf <- arx_forecaster(
263
+ employ, "med_income_2y", c("med_income_2y", "med_income_5y", "num_graduates" ),
264
+ trainer = parsnip::boost_tree (mode = "regression", trees = 20 ),
265
+ args_list = arx_args )
256
266
257
- out_rf $predictions
267
+ out_arx_rf $predictions
258
268
```
0 commit comments