@@ -9,10 +9,10 @@ vignette: >
9
9
10
10
``` {r setup, include=F}
11
11
knitr::opts_chunk$set(
12
+ echo = TRUE,
12
13
collapse = TRUE,
13
14
comment = "#>",
14
- warning = FALSE,
15
- message = FALSE
15
+ out.width = "100%"
16
16
)
17
17
```
18
18
@@ -24,6 +24,8 @@ library(recipes)
24
24
library(epiprocess)
25
25
library(epipredict)
26
26
library(ggplot2)
27
+ library(lubridate)
28
+ theme_set(theme_bw())
27
29
```
28
30
29
31
[ Panel data] ( https://en.wikipedia.org/wiki/Panel_data ) , or longitudinal data,
@@ -38,16 +40,18 @@ dataset, which contains daily state-wise measures of `case_rate` and
38
40
head(case_death_rate_subset, 3)
39
41
```
40
42
41
- ` epipredict ` functions work with data in [ ` epi_df ` ] (
42
- https://cmu-delphi.github.io/epiprocess/reference/epi_df.html )
43
+ ` epipredict ` functions work with data in
44
+ [ ` epi_df ` ] ( https://cmu-delphi.github.io/epiprocess/reference/epi_df.html )
43
45
format. Despite the stated goal and name of the package, other panel datasets
44
46
are also valid candidates for ` epipredict ` functionality, as long as they are
45
47
in ` epi_df ` format.
46
48
47
49
``` {r employ-stats, include=F}
48
50
data("grad_employ_subset")
49
- year_start <- min(grad_employ_subset$time_value)
50
- year_end <- max(grad_employ_subset$time_value)
51
+ grad_employ_subset <- grad_employ_subset %>%
52
+ mutate(time_value = ymd(paste0(time_value, "0101")))
53
+ year_start <- year(min(grad_employ_subset$time_value))
54
+ year_end <- year(max(grad_employ_subset$time_value))
51
55
```
52
56
53
57
# Example panel data overview
@@ -90,7 +94,8 @@ gemploy <- statcan_grad_employ %>%
90
94
select(c(
91
95
"REF_DATE", "GEO", "VALUE", "STATUS", "Educational qualification",
92
96
"Field of study", "Gender", "Age group", "Status of student in Canada",
93
- "Characteristics after graduation", "Graduate statistics")) %>%
97
+ "Characteristics after graduation", "Graduate statistics"
98
+ )) %>%
94
99
rename(
95
100
"geo_value" = "GEO",
96
101
"time_value" = "REF_DATE",
@@ -102,7 +107,8 @@ gemploy <- statcan_grad_employ %>%
102
107
"age_group" = "Age group",
103
108
"student_status" = "Status of student in Canada",
104
109
"grad_charac" = "Characteristics after graduation",
105
- "grad_stat" = "Graduate statistics") %>%
110
+ "grad_stat" = "Graduate statistics"
111
+ ) %>%
106
112
# The original `VALUE` column contain the statistic indicated by
107
113
# `Graduate statistics` in the original data. Below we pivot the data
108
114
# wider so that each unique statistic can have its own column.
@@ -115,7 +121,8 @@ gemploy <- statcan_grad_employ %>%
115
121
`Median employment income five years after graduation` = "med_income_5y"
116
122
),
117
123
# They are originally strings but want ints for conversion to epi_df later
118
- time_value = as.integer(time_value)) %>%
124
+ time_value = as.integer(time_value)
125
+ ) %>%
119
126
pivot_wider(names_from = grad_stat, values_from = value) %>%
120
127
filter(
121
128
# Drop aggregates for some columns
@@ -135,7 +142,8 @@ gemploy <- statcan_grad_employ %>%
135
142
# Only keep "good" data
136
143
is.na(status) &
137
144
# Drop NA value rows
138
- !is.na(num_graduates) & !is.na(med_income_2y) & !is.na(med_income_5y)) %>%
145
+ !is.na(num_graduates) & !is.na(med_income_2y) & !is.na(med_income_5y)
146
+ ) %>%
139
147
select(-c(status, gender, student_status, grad_charac, fos))
140
148
```
141
149
@@ -154,10 +162,12 @@ a list of all the `type_type`s available.
154
162
grad_employ_subset <- gemploy %>%
155
163
tsibble::as_tsibble(
156
164
index = time_value,
157
- key = c(geo_value, age_group, edu_qual)) %>%
165
+ key = c(geo_value, age_group, edu_qual)
166
+ ) %>%
158
167
as_epi_df(
159
168
geo_type = "custom", time_type = "year",
160
- additional_metadata = c(other_keys = list("age_group", "edu_qual")))
169
+ additional_metadata = c(other_keys = list("age_group", "edu_qual"))
170
+ )
161
171
```
162
172
163
173
``` {r data-dim, include=F}
@@ -169,7 +179,7 @@ Now, we are ready to use `grad_employ_subset` with `epipredict`.
169
179
Our ` epi_df ` contains ` r employ_rowcount ` rows and ` r employ_colcount ` columns.
170
180
Here is a quick summary of the columns in our ` epi_df ` :
171
181
172
- * ` time_value ` (time value): year in YYYY format
182
+ * ` time_value ` (time value): year in ` date ` format
173
183
* ` geo_value ` (geo value): province in Canada
174
184
* ` num_graduates ` (raw, time series value): number of graduates
175
185
* ` med_income_2y ` (raw, time series value): median employment income 2 years
@@ -208,7 +218,8 @@ employ_small <- employ %>%
208
218
mutate(
209
219
num_graduates_prop = num_graduates / sum(num_graduates),
210
220
med_income_2y_prop = med_income_2y / sum(med_income_2y),
211
- med_income_5y_prop = med_income_5y / sum(med_income_5y)) %>%
221
+ med_income_5y_prop = med_income_5y / sum(med_income_5y)
222
+ ) %>%
212
223
ungroup()
213
224
head(employ_small)
214
225
```
@@ -226,7 +237,8 @@ employ_small %>%
226
237
facet_grid(rows = vars(edu_qual), cols = vars(age_group)) +
227
238
xlab("Year") +
228
239
ylab("# of graduates as proportion of sum within group") +
229
- ggtitle("Trend in # of Graduates by Age Group and Education in BC and ON")
240
+ ggtitle("Trend in # of Graduates by Age Group and Education in BC and ON") +
241
+ theme(legend.position = "bottom")
230
242
```
231
243
232
244
We will predict the "standardized" number of graduates (a proportion) in the
@@ -254,8 +266,8 @@ values are both in years.
254
266
255
267
``` {r make-recipe, include=T, eval=T}
256
268
r <- epi_recipe(employ_small) %>%
257
- step_epi_ahead(num_graduates_prop, ahead = 1 ) %>% # lag & ahead units in years
258
- step_epi_lag(num_graduates_prop, lag = 0:2) %>%
269
+ step_epi_ahead(num_graduates_prop, ahead = 365 ) %>% # lag & ahead units in days
270
+ step_epi_lag(num_graduates_prop, lag = 0:2 * 365 ) %>%
259
271
step_epi_naomit()
260
272
r
261
273
```
@@ -265,11 +277,11 @@ and `ahead` columns.
265
277
266
278
``` {r view-preprocessed, include=T}
267
279
# Display a sample of the preprocessed data
268
- bake_and_show_sample <- function(recipe, new_data , n= 5) {
269
- recipe %>% prep() %>% bake(new_data = new_data ) %>% sample_n(n)
280
+ bake_and_show_sample <- function(recipe, data , n = 5) {
281
+ recipe %>% prep(data ) %>% bake(new_data = data ) %>% sample_n(n)
270
282
}
271
283
272
- bake_and_show_sample(r, employ_small)
284
+ r %>% bake_and_show_sample( employ_small)
273
285
```
274
286
275
287
We can see that the ` prep ` and ` bake ` steps created new columns according to
@@ -337,7 +349,8 @@ First, we'll plot the residuals (that is, $y_{t} - \hat{y}_{t}$) against the
337
349
fitted values ($\hat{y}_ {t}$).
338
350
339
351
``` {r lienarreg-resid-plot, include=T, fig.height=8}
340
- par(mfrow = c(2,2)); plot(extract_fit_engine(wf_linreg))
352
+ par(mfrow = c(2, 2), mar = c(5, 3.5, 1, 1) + .5)
353
+ plot(extract_fit_engine(wf_linreg))
341
354
```
342
355
343
356
The fitted values vs. residuals plot shows us that the residuals are mostly
@@ -381,14 +394,14 @@ $z_i$ is the number of graduates (proportion) at time $i$.
381
394
Again, we construct an ` epi_recipe ` detailing the preprocessing steps.
382
395
383
396
``` {r custom-arx, include=T}
384
- rx <- epi_recipe(employ_small) %>%
385
- step_epi_ahead(med_income_5y_prop, ahead = 1) %>%
386
- # 5-year median income has 3 lags c(0,1,2)
387
- step_epi_lag(med_income_5y_prop, lag = c(0,1, 2)) %>%
388
- # But the two exogenous variables have 2 lags c(0,1)
389
- step_epi_lag(med_income_2y_prop, lag = c(0,1)) %>%
390
- step_epi_lag(num_graduates_prop, lag = c(0,1)) %>%
391
- step_epi_naomit()
397
+ rx <- epi_recipe(employ_small) %>%
398
+ step_epi_ahead(med_income_5y_prop, ahead = 1) %>%
399
+ # 5-year median income has 3 lags c(0,1,2)
400
+ step_epi_lag(med_income_5y_prop, lag = c(0, 1, 2)) %>%
401
+ # But the two exogenous variables have 2 lags c(0,1)
402
+ step_epi_lag(med_income_2y_prop, lag = c(0, 1)) %>%
403
+ step_epi_lag(num_graduates_prop, lag = c(0, 1)) %>%
404
+ step_epi_naomit()
392
405
393
406
bake_and_show_sample(rx, employ_small)
394
407
```
@@ -414,20 +427,25 @@ quantify the uncertainty associated with future predicted values.
414
427
# Only have to include med_income_5y since that is our outcome
415
428
totals <- employ_small %>%
416
429
group_by(geo_value, age_group, edu_qual) %>%
417
- summarise(med_income_5y_tot = sum(med_income_5y))
430
+ summarise(med_income_5y_tot = sum(med_income_5y), .groups = "keep" )
418
431
419
- # Define post-processing steps
432
+ # Define post-processing steps
420
433
f <- frosting() %>%
421
434
layer_predict() %>%
422
- layer_naomit(.pred) %>%
423
- layer_threshold(.pred, lower = 0) %>%
435
+ layer_naomit(.pred) %>%
436
+ layer_threshold(.pred, lower = 0) %>%
424
437
# 90% prediction interval
425
- layer_residual_quantiles(probs = c(0.05, 0.95), symmetrize = F) %>%
438
+ layer_residual_quantiles(
439
+ quantile_levels = c(0.05, 0.95),
440
+ symmetrize = FALSE
441
+ ) %>%
426
442
layer_population_scaling(
427
- .pred, .pred_distn, df = totals, df_pop_col = "med_income_5y_tot")
428
-
429
- wfx_linreg <- epi_workflow(rx, parsnip::linear_reg()) %>%
430
- fit(employ_small) %>%
443
+ .pred, .pred_distn,
444
+ df = totals, df_pop_col = "med_income_5y_tot"
445
+ )
446
+
447
+ wfx_linreg <- epi_workflow(rx, parsnip::linear_reg()) %>%
448
+ fit(employ_small) %>%
431
449
add_frosting(f)
432
450
433
451
summary(extract_fit_engine(wfx_linreg))
@@ -440,20 +458,19 @@ confidence level. Both lags for the number of graduates were insigificant.
440
458
441
459
Let's take a look at the predictions along with their 90% prediction intervals.
442
460
443
- ``` {r}
461
+ ``` {r}
444
462
latest <- get_test_data(recipe = rx, x = employ_small)
445
463
predsx <- predict(wfx_linreg, latest)
446
464
447
465
# Display values within prediction intervals
448
466
predsx %>%
449
- select(
450
- geo_value, time_value, edu_qual, age_group,
451
- .pred_scaled, .pred_distn_scaled) %>%
452
- dplyr::mutate(.quantiles = nested_quantiles(.pred_distn_scaled)) %>%
453
- tidyr::unnest(.quantiles) %>%
454
- pivot_wider(names_from = tau, values_from = q) %>%
455
- head()
456
- ```
467
+ select(
468
+ geo_value, time_value, edu_qual, age_group, fos,
469
+ .pred_scaled, .pred_distn_scaled
470
+ ) %>%
471
+ head() %>%
472
+ pivot_quantiles_wider(.pred_distn_scaled)
473
+ ```
457
474
458
475
# Using canned forecasters
459
476
@@ -486,11 +503,11 @@ where $y_i$ is the 2-year median income (proportion) at time $i$.
486
503
``` {r flatline, include=T, warning=F}
487
504
out_fl <- flatline_forecaster(employ_small, "med_income_2y_prop",
488
505
args_list = flatline_args_list(
489
- ahead = 1L, forecast_date = as.Date("2015-01-01")))
506
+ ahead = 365L, forecast_date = as.Date("2015-01-01"),
507
+ )
508
+ )
490
509
491
- # The first argument to augment grabs the epi_workflow object from the
492
- # forecaster output.
493
- augment(out_fl$epi_workflow, employ_small) %>% sample_n(5)
510
+ out_fl
494
511
```
495
512
496
513
## Autoregressive forecaster with exogenous inputs
@@ -507,13 +524,15 @@ same number of lags.
507
524
508
525
``` {r arx-lr, include=T, warning=F}
509
526
arx_args <- arx_args_list(
510
- lags = c(0L, 1L), ahead = 1L, forecast_date = as.Date("2015-01-01"))
527
+ lags = c(0L, 365L), ahead = 365L, forecast_date = as.Date("2015-01-01")
528
+ )
511
529
512
530
out_arx_lr <- arx_forecaster(employ_small, "med_income_5y_prop",
513
531
c("med_income_5y_prop", "med_income_2y_prop", "num_graduates_prop"),
514
- args_list = arx_args)
532
+ args_list = arx_args
533
+ )
515
534
516
- augment( out_arx_lr$epi_workflow, employ_small) %>% sample_n(5)
535
+ out_arx_lr
517
536
```
518
537
519
538
Other changes to the direct AR forecaster, like changing the engine, also work
@@ -524,7 +543,8 @@ out_arx_rf <- arx_forecaster(
524
543
employ_small, "med_income_5y_prop",
525
544
c("med_income_5y_prop", "med_income_2y_prop", "num_graduates_prop"),
526
545
trainer = parsnip::boost_tree(mode = "regression", trees = 20),
527
- args_list = arx_args)
546
+ args_list = arx_args
547
+ )
528
548
529
- augment( out_arx_rf$epi_workflow, employ_small) %>% sample_n(5)
549
+ out_arx_rf
530
550
```
0 commit comments