@@ -22,6 +22,7 @@ library(parsnip)
library(recipes)
library(epiprocess)
library(epipredict)
+ library(ggplot2)
```

[Panel data](https://en.wikipedia.org/wiki/Panel_data), or longitudinal data,
@@ -68,8 +69,8 @@ modifications to get a subset of the full dataset:

* Only keep provincial-level geographic regions (the full data also has
  "Canada" as a region)
- * Only keep "good" or better quality data rows, as indicated by the [`STATUS`]
-   (https://www.statcan.gc.ca/en/concepts/definitions/guide-symbol) column
+ * Only keep "good" or better quality data rows, as indicated by the [`STATUS`](
+   https://www.statcan.gc.ca/en/concepts/definitions/guide-symbol) column
* Choose a subset of covariates and aggregate across the remaining ones. The
  chosen covariates are age group, field of study, and educational qualification.
@@ -82,11 +83,13 @@ library(cansim)
# Get statcan data using get_cansim, which returns a tibble
statcan_grad_employ <- get_cansim("37-10-0115-01")

- gemploy <- statcan_grad_employ %>%
+ gemploy <- statcan_grad_employ %>%
  # Drop some columns and rename the ones we keep
-   select(c("REF_DATE", "GEO", "VALUE", "STATUS", "Educational qualification",
-     "Field of study", "Gender", "Age group", "Status of student in Canada",
-     "Characteristics after graduation", "Graduate statistics")) %>%
+   select(c(
+     "REF_DATE", "GEO", "VALUE", "STATUS", "Educational qualification",
+     "Field of study", "Gender", "Age group", "Status of student in Canada",
+     "Characteristics after graduation", "Graduate statistics"
+   )) %>%
  rename(
    "geo_value" = "GEO",
    "time_value" = "REF_DATE",
@@ -98,40 +101,43 @@ gemploy <- statcan_grad_employ %>%
    "age_group" = "Age group",
    "student_status" = "Status of student in Canada",
    "grad_charac" = "Characteristics after graduation",
-     "grad_stat" = "Graduate statistics") %>%
-   # The original `VALUE` column contains the statistic indicated by
-   # `Graduate statistics` in the original data. Below we pivot the data
+     "grad_stat" = "Graduate statistics"
+   ) %>%
+   # The original `VALUE` column contains the statistic indicated by
+   # `Graduate statistics` in the original data. Below we pivot the data
  # wider so that each unique statistic can have its own column.
  mutate(
    # Recode for easier pivoting
    grad_stat = recode_factor(
-       grad_stat,
-       `Number of graduates` = "num_graduates",
+       grad_stat,
+       `Number of graduates` = "num_graduates",
      `Median employment income two years after graduation` = "med_income_2y",
-       `Median employment income five years after graduation` = "med_income_5y"),
+       `Median employment income five years after graduation` = "med_income_5y"
+     ),
    # They are originally strings, but we want ints for conversion to epi_df later
    time_value = as.integer(time_value)
  ) %>%
  pivot_wider(names_from = grad_stat, values_from = value) %>%
  filter(
    # Drop aggregates for some columns
-     geo_value != "Canada" &
-       age_group != "15 to 64 years" &
-       fos != "Total, field of study" &
-       edu_qual != "Total, educational qualification" &
-       # Keep aggregates for keys we don't want to keep
-       gender == "Total, gender" &
-       student_status == "Canadian and international students" &
-       # Since we're looking at 2y and 5y employment income, the only
-       # characteristics remaining are:
-       # - Graduates reporting employment income
-       # - Graduates reporting wages, salaries, and commissions only
-       # For simplicity, keep the first one only
-       grad_charac == "Graduates reporting employment income" &
-       # Only keep "good" data
-       is.na(status) &
-       # Drop NA value rows
-       !is.na(num_graduates) & !is.na(med_income_2y) & !is.na(med_income_5y)) %>%
+     geo_value != "Canada" &
+       age_group != "15 to 64 years" &
+       fos != "Total, field of study" &
+       edu_qual != "Total, educational qualification" &
+       # Keep aggregates for keys we don't want to keep
+       gender == "Total, gender" &
+       student_status == "Canadian and international students" &
+       # Since we're looking at 2y and 5y employment income, the only
+       # characteristics remaining are:
+       # - Graduates reporting employment income
+       # - Graduates reporting wages, salaries, and commissions only
+       # For simplicity, keep the first one only
+       grad_charac == "Graduates reporting employment income" &
+       # Only keep "good" data
+       is.na(status) &
+       # Drop NA value rows
+       !is.na(num_graduates) & !is.na(med_income_2y) & !is.na(med_income_5y)
+   ) %>%
  select(-c(status, gender, student_status, grad_charac))
```
@@ -149,15 +155,17 @@ a list of all the `type_type`s available.

```{r convert-to-epidf, eval=F}
grad_employ_subset <- gemploy %>%
  tsibble::as_tsibble(
-     index=time_value,
-     key=c(geo_value, age_group, fos, edu_qual)) %>%
+     index = time_value,
+     key = c(geo_value, age_group, fos, edu_qual)
+   ) %>%
  as_epi_df(
    geo_type = "custom", time_type = "year",
-     additional_metadata=c(other_keys=list("age_group", "fos", "edu_qual")))
+     additional_metadata = c(other_keys = list("age_group", "fos", "edu_qual"))
+   )
```
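
One quick, purely illustrative sanity check of the conversion (our addition, not
part of the vignette; it assumes the `epi_df` metadata is stored in the
`metadata` attribute, as in recent `epiprocess` releases):

```{r check-epidf, eval=F}
# Confirm the class and the extra keys recorded by as_epi_df()
class(grad_employ_subset)
attr(grad_employ_subset, "metadata")
```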

```{r data-dim, include=F}
- employ_rowcount <- format(nrow(grad_employ_subset), big.mark= ",")
+ employ_rowcount <- format(nrow(grad_employ_subset), big.mark = ",")
employ_colcount <- length(names(grad_employ_subset))
```
@@ -190,83 +198,160 @@ In the following sections, we will go over preprocessing the data in the

# Preprocessing

- We will create an `epi_recipe` that adds one `ahead` column and 3 `lag` columns.
- The `ahead` column tells us how many time units ahead to predict, and the `lag`
- columns tell us how many previous time points to include as covariates. We will
- just work with one of the time series in our data for now: `num_graduates`.
+ As a simple example, let's work with the `num_graduates` column for now.
+
+ ```{r employ-small, include=T}
+ employ_small <- employ %>%
+   group_by(geo_value, time_value, age_group, edu_qual) %>%
+   summarise_if(is.numeric, sum) %>%
+   ungroup() %>%
+   # Incomplete data - exclude
+   filter(geo_value != "Territories") %>%
+   # Select groups where there are complete time series values
+   group_by(geo_value, age_group, edu_qual) %>%
+   filter(n() >= 6) %>%
+   mutate(
+     num_graduates_prop = num_graduates / sum(num_graduates)
+   ) %>%
+   # med_income_2y_prop = med_income_2y / sum(med_income_2y),
+   # med_income_5y_prop = med_income_5y / sum(med_income_5y)) %>%
+   ungroup() %>%
+   # select(-c(med_income_2y, med_income_5y, num_graduates)) %>%
+   # Bug: shouldn't have to cast back to epi_df
+   as_epi_df(
+     geo_type = "custom",
+     time_type = "year",
+     additional_metadata = c(other_keys = list("age_group", "edu_qual")))
+ head(employ_small)
+ ```
+
+ Below is a visualization of a sample of the small dataset. Note that some groups
+ do not have any time series information since we filtered out all time series
+ with incomplete dates.
+
+ ```{r employ-small-graph, include=F, eval=F}
+ employ_small %>%
+   filter(geo_value %in% c("British Columbia", "Ontario")) %>%
+   filter(grepl("degree", edu_qual, fixed = T)) %>%
+   ggplot(aes(x = time_value, y = num_graduates_prop, color = geo_value)) +
+   geom_line() +
+   facet_grid(rows = vars(edu_qual), cols = vars(age_group)) +
+   xlab("Year") +
+   ylab("# of graduates as proportion of sum within group") +
+   ggtitle("Trend in # of Graduates by Age Group and Education in BC and ON")
+ ```
+
+ We will predict the number of graduates in the next year (time $t+1$) using an
+ autoregressive model with three lags (i.e., an AR(3) model). Such a model is
+ represented algebraically like this:
+
+ \[
+ x_{t+1} =
+ \phi_0 + \phi_1 x_{t} + \phi_2 x_{t-1} + \phi_3 x_{t-2} + \epsilon_t
+ \]
+
+ where $x_i$ is the number of graduates at time $i$, and the current time is $t$.
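
To make the `ahead`/`lag` bookkeeping concrete, here is a rough, purely
illustrative sketch (our addition, not part of the vignette's pipeline) of the
same AR(3) regression built by hand with `dplyr::lag()`/`dplyr::lead()` and
`lm()`; the names `x_lag*` and `y_ahead` are ours, and the per-group lagging is
exactly what the `epi_recipe` below automates.

```{r ar3-by-hand, eval=F}
# Illustrative only: compute lags/leads within each key combination, then fit
# the AR(3) regression with ordinary least squares.
ar3_by_hand <- employ_small %>%
  group_by(geo_value, age_group, edu_qual) %>%
  arrange(time_value, .by_group = TRUE) %>%
  mutate(
    x_lag0 = num_graduates_prop,
    x_lag1 = dplyr::lag(num_graduates_prop, 1),
    x_lag2 = dplyr::lag(num_graduates_prop, 2),
    y_ahead = dplyr::lead(num_graduates_prop, 1)
  ) %>%
  ungroup() %>%
  lm(y_ahead ~ x_lag0 + x_lag1 + x_lag2, data = .) # rows with NA lags are dropped
summary(ar3_by_hand)
```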

- In this preprocessing step, no computation really happens. It just provides
- a series of steps that will be applied when using `epi_workflow` later. And note
- that since we specified our `time_type` to be `year`, our `lag` and `lead`
+ In the preprocessing step, we need to create additional columns in `employ_small`
+ for each of $x_{t+1}$, $x_{t}$, $x_{t-1}$, and $x_{t-2}$. We do this via an
+ `epi_recipe`. Note that creating an `epi_recipe` alone doesn't add these
+ outcome and predictor columns; the recipe just stores the instructions for
+ adding them.
+
+ Our `epi_recipe` should add one `ahead` column representing $x_{t+1}$ and
+ 3 `lag` columns representing $x_{t}$, $x_{t-1}$, and $x_{t-2}$. Also note that
+ since we specified our `time_type` to be `year`, our `lag` and `lead`
values are both in years.

- ```{r make-recipe, include=T}
- r <- epi_recipe(employ) %>%
-   step_epi_ahead(num_graduates, ahead = 1) %>% # lag & ahead units in years
-   step_epi_lag(num_graduates, lag = 0:2) %>%
-   step_epi_naomit()
+ ```{r make-recipe, include=T, eval=F}
+ # r <- epi_recipe(employ) %>%
+ #   step_epi_ahead(num_graduates, ahead = 1) %>% # lag & ahead units in years
+ #   step_epi_lag(num_graduates, lag = 0:2) %>%
+ #   step_epi_naomit()
+ # r
+
+ r <- epi_recipe(employ_small) %>%
+   step_epi_ahead(num_graduates_prop, ahead = 1) %>% # lag & ahead units in years
+   step_epi_lag(num_graduates_prop, lag = 0:2) %>%
+   step_epi_naomit()
r
```

There are 3 `raw` roles which are our three lagged `num_graduates` columns, and
three `key` roles which are our additional keys `age_group`, `fos` and
- `edu_qual`. Let's apply this recipe using `prep` and `bake` to see all of the
- additional `lag` and `ahead` columns created.
+ `edu_qual`.
+
+ Let's apply this recipe using `prep` and `bake` to generate and view the `lag`
+ and `ahead` columns.

```{r view-preprocessed, include=T}
# Display a sample of the preprocessed data
- baked_sample <- r %>% prep() %>% bake(new_data = employ) %>% sample_n(5)
- baked_sample
+ baked_sample <- r %>%
+   prep() %>%
+   bake(new_data = employ_small) %>%
+   sample_n(5)
+ # baked_sample
```

+ We can see that the `prep` and `bake` steps created new columns according to
+ our `epi_recipe`:
+
+ - `ahead_1_num_graduates_prop` corresponds to $x_{t+1}$
+ - `lag_0_num_graduates_prop`, `lag_1_num_graduates_prop`, and
+   `lag_2_num_graduates_prop` correspond to $x_{t}$, $x_{t-1}$, and $x_{t-2}$
+   respectively (a quick check follows below).
+
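
As a quick, purely illustrative check (our addition, not part of the original
pipeline), you can list those generated columns directly:

```{r check-baked-names, eval=F}
# Columns created by step_epi_ahead() and step_epi_lag() above
grep("^(lag|ahead)_", names(baked_sample), value = TRUE)
```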

# Model fitting and prediction

## Within recipes framework

- We will look at a simple model: [`parsnip::linear_reg()`](
+ Since our goal for now is to fit a simple autoregressive model, we can use
+ [`parsnip::linear_reg()`](
https://parsnip.tidymodels.org/reference/linear_reg.html) with the default
engine `lm`, which fits a linear regression using ordinary least squares.
- Specifically, our model will be an autoregressive linear model with:
-
- * Outcome $y_{t}$, the number of graduates at time $t$. Corresponds to
-   setting `ahead = 1` in our recipe.
- <!-- TODO i don't like the wording i used here -->
- * Predictors $x_{t-1}$, $x_{t-2}$, and $x_{t-3}$, the number of graduates in the
-   three consecutive years prior to time $t$. Corresponds to setting `lag = 0:2` in
-   our recipe.
-
- The model is represented algebraically as follows:
-
- \[
- y_{t+1} =
- \beta_0 + \beta_1 x_{t} + \beta_2 x_{t-1} + \beta_3 x_{t-2} + \epsilon_t
- \]

We will use `epi_workflow` with the `epi_recipe` we defined in the
- preprocessing section to fit this model.
+ preprocessing section along with the `parsnip::linear_reg()` model. Note again
+ that `epi_workflow` is a container and doesn't actually do the fitting. We have
+ to pass the workflow into `fit()` to get our model coefficients
+ $\phi_i, i=0,...,3$.

```{r linearreg-wf, include=T}
- wf_linreg <- epi_workflow(r, parsnip::linear_reg()) %>% fit(employ)
+ wf_linreg <- epi_workflow(r, parsnip::linear_reg()) %>%
+   parsnip::fit(employ_small)
wf_linreg
```

- <!-- TODO comment on the coefficients, say something about beta_i = lag_{i-1}_num_graduates -->
+ This output tells us the coefficients of the fitted model; for instance,
+ the intercept is $\phi_0 = -2.2426$ and the coefficient for $x_{t}$ is
+ $\phi_1 = 1.14401$.
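
If you want to pull those estimates out programmatically rather than reading
them off the printout, one possible approach is sketched below (our addition;
it assumes `workflows::extract_fit_engine()` is available, as in recent
tidymodels releases):

```{r linreg-coefs, eval=F}
# Extract the underlying lm object from the fitted workflow and return the
# named vector of phi_i estimates
coef(extract_fit_engine(wf_linreg))
```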

Now that we have our workflow, we can generate predictions from a subset of our
- data. For this demo, we will predict the number of graduates from the last 2
+ data. For this demo, we will predict the number of graduates using the last 2
years of our dataset.

```{r linearreg-predict, include=T}
- latest <- employ %>% filter(time_value >= max(time_value) - 2)
+ latest <- employ_small %>% filter(time_value >= max(time_value) - 2)
preds <- stats::predict(wf_linreg, latest) %>% filter(!is.na(.pred))
- # Display a sample of the prediction values
- head(preds)
+ # Display a sample of the prediction values, excluding NAs
+ preds %>% head()
```

- Notice that `predict` still returns an `epi_df` with all of the keys that were
- present in the original dataset.
+ We can also generate predictions (and plot fitted values against residuals)
+ using the `augment` function:
+ ```{r linearreg-augment, include=T}
+ employ_small_with_preds <- augment(wf_linreg, latest)
+ employ_small_with_preds %>% head()
+
+ employ_small_with_preds %>%
+   mutate(resid = med_income_2y - .pred) %>%
+   ggplot(aes(x = .pred, y = resid, color = geo_value)) +
+   geom_point() +
+   xlab("Fitted values") +
+   ylab("Residuals") +
+   ggtitle("Plot of fitted values vs. residuals")
+ ```

- <!-- TODO: residuals, predictions commentary -->
+ Notice that `predict` and `augment` both still return an `epi_df` with all of
+ the keys that were present in the original dataset.
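
A quick, purely illustrative check of that claim (our addition, plain base R):

```{r check-pred-keys, eval=F}
# The predictions keep the epi_df class, and the original key columns are still
# present alongside the new .pred column
class(preds)
intersect(names(employ_small), names(employ_small_with_preds))
```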

## With canned forecasters

@@ -278,33 +363,52 @@ and the direct autoregressive (AR) forecaster
[`arx_forecaster`](
https://cmu-delphi.github.io/epipredict/reference/arx_forecaster.html).

- ```{r flatline, include=T}
- out_fl <- flatline_forecaster(employ, "med_income_2y",
+ With canned forecasters, we don't need to manually create a recipe and workflow;
+ we just need to specify the lags, aheads, and a few additional arguments, which
+ are passed in a forecaster-specific way that we'll see below.
+
+ In this first example, we'll use `flatline_forecaster` to make a simple
+ prediction of the 2-year median income for the next year by carrying the most
+ recently observed value forward. This model is represented algebraically as:
+ \[ y_{t+1} = y_{t} \]
+ where $y_i$ is the 2-year median income at time $i$.
+
+ ```{r flatline, include=T, warning=F}
+ out_fl <- flatline_forecaster(employ_small, "med_income_2y",
  args_list = flatline_args_list(
-     ahead = 1L, forecast_date = as.Date("2022-08-16")))
+     ahead = 1L, forecast_date = as.Date("2015-01-01")))

- augment(out_fl$epi_workflow, employ)
+ augment(out_fl$epi_workflow, employ_small) %>% head()
```

- ```{r arx-lr, include=T}
+ In this second example, we'll use `arx_forecaster` to make a prediction of the
+ 2-year median income based on the previous two time points' 2-year median
+ income _and_ 5-year median income. This model is represented algebraically as:
+ \[
+ y_{t+1} =
+ \phi_0 + \phi_1 y_{t} + \phi_2 y_{t-1} + \phi_3 z_{t} + \phi_4 z_{t-1}
+ \]
+ where $y_i$ is as before, and $z_i$ is the 5-year median income at time $i$.
+
+ ```{r arx-lr, include=T, warning=F}
arx_args <- arx_args_list(
-   lags = c(0L, 1L), ahead = 1L, forecast_date = as.Date("2022-08-01"))
+   lags = c(0L, 1L), ahead = 1L, forecast_date = as.Date("2015-01-01"))

- out_arx_lr <- arx_forecaster(employ, "med_income_2y",
-   c("med_income_2y", "med_income_5y", "num_graduates"),
+ out_arx_lr <- arx_forecaster(employ_small, "med_income_2y",
+   c("med_income_2y", "med_income_5y"),
  args_list = arx_args)

- out_arx_lr$predictions
+ out_arx_lr$predictions %>% head()
```

Other changes to the direct AR forecaster, like changing the engine, also work
- as expected.
+ as expected. Below we use a boosted tree model instead of a linear regression.

- ```{r arx-rf, include=F, warning=F}
+ ```{r arx-rf, include=T, warning=F}
out_arx_rf <- arx_forecaster(
-   employ, "med_income_2y", c("med_income_2y", "med_income_5y", "num_graduates"),
+   employ_small, "med_income_2y", c("med_income_2y", "med_income_5y"),
  trainer = parsnip::boost_tree(mode = "regression", trees = 20),
  args_list = arx_args)

- out_arx_rf$predictions
+ out_arx_rf$predictions %>% head()
```