
Commit 3bf0c58

WIP

committed, 1 parent 0a6bc06, commit 3bf0c58

File tree

1 file changed (+192 lines, -88 lines)


vignettes/panel-data.Rmd

Lines changed: 192 additions & 88 deletions
@@ -22,6 +22,7 @@ library(parsnip)
 library(recipes)
 library(epiprocess)
 library(epipredict)
+library(ggplot2)
 ```
 
 [Panel data](https://en.wikipedia.org/wiki/Panel_data), or longitudinal data,
@@ -68,8 +69,8 @@ modifications to get a subset of the full dataset:
 
 * Only keep provincial-level geographic region (the full data also has
 "Canada" as a region)
-* Only keep "good" or better quality data rows, as indicated by the [`STATUS`]
-(https://www.statcan.gc.ca/en/concepts/definitions/guide-symbol) column
+* Only keep "good" or better quality data rows, as indicated by the [`STATUS`](
+https://www.statcan.gc.ca/en/concepts/definitions/guide-symbol) column
 * Choose a subset of covariates and aggregate across the remaining ones. The
 chosen covariates are age group, field of study, and educational qualification.
 
@@ -82,11 +83,13 @@ library(cansim)
 # Get statcan data using get_cansim, which returns a tibble
 statcan_grad_employ <- get_cansim("37-10-0115-01")
 
-gemploy <- statcan_grad_employ %>%
+gemploy <- statcan_grad_employ %>%
   # Drop some columns and rename the ones we keep
-  select(c("REF_DATE", "GEO", "VALUE", "STATUS", "Educational qualification",
-    "Field of study", "Gender", "Age group", "Status of student in Canada",
-    "Characteristics after graduation", "Graduate statistics")) %>%
+  select(c(
+    "REF_DATE", "GEO", "VALUE", "STATUS", "Educational qualification",
+    "Field of study", "Gender", "Age group", "Status of student in Canada",
+    "Characteristics after graduation", "Graduate statistics"
+  )) %>%
   rename(
     "geo_value" = "GEO",
     "time_value" = "REF_DATE",
@@ -98,40 +101,43 @@ gemploy <- statcan_grad_employ %>%
     "age_group" = "Age group",
     "student_status" = "Status of student in Canada",
     "grad_charac" = "Characteristics after graduation",
-    "grad_stat" = "Graduate statistics") %>%
-  # The original `VALUE` column contain the statistic indicated by
-  # `Graduate statistics` in the original data. Below we pivot the data
+    "grad_stat" = "Graduate statistics"
+  ) %>%
+  # The original `VALUE` column contains the statistic indicated by
+  # `Graduate statistics` in the original data. Below we pivot the data
   # wider so that each unique statistic can have its own column.
   mutate(
     # Recode for easier pivoting
     grad_stat = recode_factor(
-      grad_stat,
-      `Number of graduates` = "num_graduates",
+      grad_stat,
+      `Number of graduates` = "num_graduates",
       `Median employment income two years after graduation` = "med_income_2y",
-      `Median employment income five years after graduation` = "med_income_5y"),
+      `Median employment income five years after graduation` = "med_income_5y"
+    ),
     # They are originally strings but want ints for conversion to epi_df later
     time_value = as.integer(time_value)
   ) %>%
   pivot_wider(names_from = grad_stat, values_from = value) %>%
   filter(
     # Drop aggregates for some columns
-    geo_value != "Canada" &
-      age_group != "15 to 64 years" &
-      fos != "Total, field of study" &
-      edu_qual != "Total, educational qualification" &
-      # Keep aggregates for keys we don't want to keep
-      gender == "Total, gender" &
-      student_status == "Canadian and international students" &
-      # Since we're looking at 2y and 5y employment income, the only
-      # characteristics remaining are:
-      # - Graduates reporting employment income
-      # - Graduates reporting wages, salaries, and commissions only
-      # For simplicity, keep the first one only
-      grad_charac == "Graduates reporting employment income" &
-      # Only keep "good" data
-      is.na(status) &
-      # Drop NA value rows
-      !is.na(num_graduates) & !is.na(med_income_2y) & !is.na(med_income_5y)) %>%
+    geo_value != "Canada" &
+      age_group != "15 to 64 years" &
+      fos != "Total, field of study" &
+      edu_qual != "Total, educational qualification" &
+      # Keep aggregates for keys we don't want to keep
+      gender == "Total, gender" &
+      student_status == "Canadian and international students" &
+      # Since we're looking at 2y and 5y employment income, the only
+      # characteristics remaining are:
+      # - Graduates reporting employment income
+      # - Graduates reporting wages, salaries, and commissions only
+      # For simplicity, keep the first one only
+      grad_charac == "Graduates reporting employment income" &
+      # Only keep "good" data
+      is.na(status) &
+      # Drop NA value rows
+      !is.na(num_graduates) & !is.na(med_income_2y) & !is.na(med_income_5y)
+  ) %>%
   select(-c(status, gender, student_status, grad_charac))
 ```
 
@@ -149,15 +155,17 @@ a list of all the `time_type`s available.
 ```{r convert-to-epidf, eval=F}
 grad_employ_subset <- gemploy %>%
   tsibble::as_tsibble(
-    index=time_value,
-    key=c(geo_value, age_group, fos, edu_qual)) %>%
+    index = time_value,
+    key = c(geo_value, age_group, fos, edu_qual)
+  ) %>%
   as_epi_df(
     geo_type = "custom", time_type = "year",
-    additional_metadata=c(other_keys=list("age_group", "fos", "edu_qual")))
+    additional_metadata = c(other_keys = list("age_group", "fos", "edu_qual"))
+  )
 ```
 
 ```{r data-dim, include=F}
-employ_rowcount <- format(nrow(grad_employ_subset), big.mark=",")
+employ_rowcount <- format(nrow(grad_employ_subset), big.mark = ",")
 employ_colcount <- length(names(grad_employ_subset))
 ```
 
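If you want to confirm that the conversion kept the additional keys, a quick check is possible; this is a sketch, not part of the commit, and it assumes the `epi_df` stores its `geo_type`, `time_type`, and `other_keys` in a `metadata` attribute, as the `epiprocess` documentation describes:

```r
# Sketch: inspect the epi_df metadata created by as_epi_df() above.
# The other_keys entry should list "age_group", "fos", and "edu_qual".
attr(grad_employ_subset, "metadata")

# The additional keys also remain ordinary columns, so listing the column
# names is another quick sanity check.
names(grad_employ_subset)
```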
@@ -190,83 +198,160 @@ In the following sections, we will go over preprocessing the data in the
 
 # Preprocessing
 
-We will create an `epi_recipe` that adds one `ahead` column and 3 `lag` columns.
-The `ahead` column tells us how many time units ahead to predict, and the `lag`
-columns tell us how many previous time points to include as covariates. We will
-just work with one of the time series in our data for now: `num_graduates`.
+As a simple example, let's work with the `num_graduates` column for now.
+
+```{r employ-small, include=T}
+employ_small <- employ %>%
+  group_by(geo_value, time_value, age_group, edu_qual) %>%
+  summarise_if(is.numeric, sum) %>%
+  ungroup() %>%
+  # Incomplete data - exclude
+  filter(geo_value != "Territories") %>%
+  # Select groups where there are complete timeseries values
+  group_by(geo_value, age_group, edu_qual) %>%
+  filter(n() >= 6) %>%
+  mutate(
+    num_graduates_prop = num_graduates / sum(num_graduates)
+  ) %>%
+  # med_income_2y_prop = med_income_2y / sum(med_income_2y),
+  # med_income_5y_prop = med_income_5y / sum(med_income_5y)) %>%
+  ungroup() %>%
+  # select(-c(med_income_2y, med_income_5y, num_graduates)) %>%
+  # Bug: shouldn't have to cast back to epi_df
+  as_epi_df(
+    geo_type = "custom",
+    time_type = "year",
+    additional_metadata = c(other_keys = list("age_group", "edu_qual")))
+head(employ_small)
+```
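As a quick sanity check on the filtering in the chunk above, you can count how many years each remaining group contributes; every group kept by `filter(n() >= 6)` should have at least 6. This is a small sketch using only dplyr, which is assumed to be attached earlier in the vignette's setup chunk; it is not part of the commit:

```r
# Sketch: count distinct time points per (geo_value, age_group, edu_qual) group
# and tabulate the group sizes.
employ_small %>%
  group_by(geo_value, age_group, edu_qual) %>%
  summarise(n_years = dplyr::n_distinct(time_value), .groups = "drop") %>%
  count(n_years)
```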
+
+Below is a visualization for a sample of the small data. Note that some groups
+do not have any time series information, since we filtered out all time series
+with incomplete dates.
+
+```{r employ-small-graph, include=F, eval=F}
+employ_small %>%
+  filter(geo_value %in% c("British Columbia", "Ontario")) %>%
+  filter(grepl("degree", edu_qual, fixed = T)) %>%
+  ggplot(aes(x = time_value, y = num_graduates_prop, color = geo_value)) +
+  geom_line() +
+  facet_grid(rows = vars(edu_qual), cols = vars(age_group)) +
+  xlab("Year") +
+  ylab("# of graduates as proportion of sum within group") +
+  ggtitle("Trend in # of Graduates by Age Group and Education in BC and ON")
+```
+
+We will predict the number of graduates in the next year (time $t+1$) using an
+autoregressive model with three lags (i.e., an AR(3) model). Such a model is
+represented algebraically like this:
+
+\[
+x_{t+1} =
+\phi_0 + \phi_1 x_{t} + \phi_2 x_{t-1} + \phi_3 x_{t-2} + \epsilon_t
+\]
+
+where $x_i$ is the number of graduates at time $i$, and the current time is $t$.
 
-In this preprocessing step, no computation really happens. It just provides
-a series of steps that will be applied when using `epi_workflow` later. And note
-that since we specified our `time_type` to be `year`, our `lag` and `lead`
+In the preprocessing step, we need to create additional columns in `employ` for
+each of $x_{t+1}$, $x_{t}$, $x_{t-1}$, and $x_{t-2}$. We do this via an
+`epi_recipe`. Note that creating an `epi_recipe` alone doesn't add these
+outcome and predictor columns; the recipe just stores the instructions for
+adding them.
+
+Our `epi_recipe` should add one `ahead` column representing $x_{t+1}$ and
+3 `lag` columns representing $x_{t}$, $x_{t-1}$, and $x_{t-2}$. Also note that
+since we specified our `time_type` to be `year`, our `lag` and `lead`
 values are both in years.
 
-```{r make-recipe, include=T}
-r <- epi_recipe(employ) %>%
-  step_epi_ahead(num_graduates, ahead = 1) %>% # lag & ahead units in years
-  step_epi_lag(num_graduates, lag = 0:2) %>%
-  step_epi_naomit()
+```{r make-recipe, include=T, eval=F}
+# r <- epi_recipe(employ) %>%
+#   step_epi_ahead(num_graduates, ahead = 1) %>% # lag & ahead units in years
+#   step_epi_lag(num_graduates, lag = 0:2) %>%
+#   step_epi_naomit()
+# r
+
+r <- epi_recipe(employ_small) %>%
+  step_epi_ahead(num_graduates_prop, ahead = 1) %>% # lag & ahead units in years
+  step_epi_lag(num_graduates_prop, lag = 0:2) %>%
+  step_epi_naomit()
 r
 ```
 
 There are 3 `raw` roles which are our three lagged `num_graduates` columns, and
 three `key` roles which are our additional keys `age_group`, `fos` and
-`edu_qual`. Let's apply this recipe using `prep` and `bake` to see all of the
-additional `lag` and `ahead` columns created.
+`edu_qual`.
+
+Let's apply this recipe using `prep` and `bake` to generate and view the `lag`
+and `ahead` columns.
 
 ```{r view-preprocessed, include=T}
 # Display a sample of the preprocessed data
-baked_sample <- r %>% prep() %>% bake(new_data = employ) %>% sample_n(5)
-baked_sample
+baked_sample <- r %>%
  prep() %>%
  bake(new_data = employ_small) %>%
  sample_n(5)
+# baked_sample
 ```
 
+We can see that the `prep` and `bake` steps created new columns according to
+our `epi_recipe`:
+
+- `ahead_1_num_graduates` corresponds to $x_{t+1}$
+- `lag_0_num_graduates`, `lag_1_num_graduates`, and `lag_2_num_graduates`
+correspond to $x_{t}$, $x_{t-1}$, and $x_{t-2}$ respectively.
+
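To see which columns the recipe ultimately produces and the role each one plays, one option is the standard `summary()` method for prepped recipes from the recipes package. This is a sketch, not part of the commit, and the exact column names depend on the variable the steps target (here `num_graduates_prop`):

```r
# Sketch: list the variables and roles known to the prepped recipe.
# The ahead_/lag_ columns should show up with outcome/predictor roles.
r %>%
  prep() %>%
  summary() %>%
  filter(grepl("^(ahead|lag)_", variable))
```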
 # Model fitting and prediction
 
 ## Within recipes framework
 
-We will look at a simple model: [`parsnip::linear_reg()`](
+Since our goal for now is to fit a simple autoregressive model, we can use
+[`parsnip::linear_reg()`](
 https://parsnip.tidymodels.org/reference/linear_reg.html) with the default
 engine `lm`, which fits a linear regression using ordinary least squares.
-Specifically, our model will be an autoregressive linear model with:
-
-* Outcome $y_{t}$, the number of graduates at time $t$. Corresponds to
-setting `ahead = 1` in our recipe.
-<!-- TODO i don't like the wording i used here -->
-* Predictors $x_{t-1}$, $x_{t-2}$, and $x_{t-3}$, the number of graduates in the
-three consecutive years prior to time $t$. Corresponds to setting `lag = 0:2` in
-our recipe.
-
-The model is represented algebraically as follows:
-
-\[
-y_{t+1} =
-\beta_0 + \beta_1 x_{t} + \beta_2 x_{t-1} + \beta_3 x_{t-2} + \epsilon_t
-\]
 
 We will use `epi_workflow` with the `epi_recipe` we defined in the
-preprocessing section to fit this model.
+preprocessing section along with the `parsnip::linear_reg()` model. Note again
+that `epi_workflow` is a container and doesn't actually do the fitting. We have
+to pass the workflow into `fit()` to get our model coefficients
+$\phi_i, i=0,...,3$.
 
 ```{r linearreg-wf, include=T}
-wf_linreg <- epi_workflow(r, parsnip::linear_reg()) %>% fit(employ)
+wf_linreg <- epi_workflow(r, parsnip::linear_reg()) %>%
+  parsnip::fit(employ_small)
 wf_linreg
 ```
 
-<!-- TODO comment on the coefficients, say something about beta_i = lag_{i-1}_num_graduates -->
+This output tells us the coefficients of the fitted model; for instance,
+the intercept is $\phi_0 = -2.2426$ and the coefficient for $x_{t}$ is
+$\phi_1 = 1.14401$.
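If you want the coefficients as a named vector rather than reading them off the printout, a sketch like the following should work, assuming the fitted `epi_workflow` supports the standard workflows extractors (it inherits from a `workflows::workflow`); this is not part of the commit:

```r
# Sketch: pull the underlying lm fit out of the fitted workflow and read off
# the estimated intercept and lag coefficients.
wf_linreg %>%
  workflows::extract_fit_engine() %>%
  coef()
```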
 
 Now that we have our workflow, we can generate predictions from a subset of our
-data. For this demo, we will predict the number of graduates from the last 2
+data. For this demo, we will predict the number of graduates using the last 2
 years of our dataset.
 
 ```{r linearreg-predict, include=T}
-latest <- employ %>% filter(time_value >= max(time_value) - 2)
+latest <- employ_small %>% filter(time_value >= max(time_value) - 2)
 preds <- stats::predict(wf_linreg, latest) %>% filter(!is.na(.pred))
-# Display a sample of the prediction values
-head(preds)
+# Display a sample of the prediction values, excluding NAs
+preds %>% head()
 ```
 
-Notice that `predict` still returns an `epi_df` with all of the keys that were
-present in the original dataset.
+We can do this using the `augment` function too:
+```{r linearreg-augment, include=T}
+employ_small_with_preds <- augment(wf_linreg, latest)
+employ_small_with_preds %>% head()
+
+employ_small_with_preds %>%
+  mutate(resid = med_income_2y - .pred) %>%
+  ggplot(aes(x = .pred, y = resid, color = geo_value)) +
+  geom_point() +
+  xlab("Fitted values") +
+  ylab("Residuals") +
+  ggtitle("Plot of fitted values vs. residuals")
+```
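Beyond the residual plot, the `augment` output makes it easy to summarise prediction error by key. Here is a small sketch, not part of the commit, computing mean absolute error and RMSE per province from the same residuals:

```r
# Sketch: simple error metrics per geography from the augmented predictions.
employ_small_with_preds %>%
  mutate(resid = med_income_2y - .pred) %>%
  group_by(geo_value) %>%
  summarise(
    mae = mean(abs(resid), na.rm = TRUE),
    rmse = sqrt(mean(resid^2, na.rm = TRUE)),
    .groups = "drop"
  )
```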
 
-<!-- TODO: residuals, predictions commentary -->
+Notice that `predict` and `augment` both still return an `epi_df` with all of
+the keys that were present in the original dataset.
 
 ## With canned forecasters
 
@@ -278,33 +363,52 @@ and the direct autoregressive (AR) forecaster
 [`arx_forecaster`](
 https://cmu-delphi.github.io/epipredict/reference/arx_forecaster.html).
 
-```{r flatline, include=T}
-out_fl <- flatline_forecaster(employ, "med_income_2y",
+With canned forecasters, we don't need to manually create a recipe and workflow;
+we just need to specify the lags, aheads, and some additional arguments that
+are passed in a forecaster-specific way, as we'll see below.
+
+In this first example, we'll use `flatline_forecaster` to make a simple
+prediction of the 2-year median income for the next year, based on one previous
+time point (the forecast simply carries the last observed value forward). This
+model is represented algebraically as:
+\[y_{t+1} = y_{t}\]
+where $y_i$ is the 2-year median income at time $i$.
+
+```{r flatline, include=T, warning=F}
+out_fl <- flatline_forecaster(employ_small, "med_income_2y",
   args_list = flatline_args_list(
-    ahead=1L, forecast_date = as.Date("2022-08-16")))
+    ahead = 1L, forecast_date = as.Date("2015-01-01")))
 
-augment(out_fl$epi_workflow, employ)
+augment(out_fl$epi_workflow, employ_small) %>% head()
 ```
 
-```{r arx-lr, include=T}
+In this second example, we'll use `arx_forecaster` to make a prediction of the
+2-year median income based on the previous two time points' 2-year median
+income _and_ 5-year median income. This model is represented algebraically as:
+\[
+y_{t+1} =
+\phi_0 + \phi_1 y_{t} + \phi_2 y_{t-1} + \phi_3 z_{t} + \phi_4 z_{t-1}
+\]
+where $y_i$ is as before, and $z_i$ is the 5-year median income at time $i$.
+
+```{r arx-lr, include=T, warning=F}
 arx_args <- arx_args_list(
-  lags = c(0L, 1L), ahead = 1L, forecast_date = as.Date("2022-08-01"))
+  lags = c(0L, 1L), ahead = 1L, forecast_date = as.Date("2015-01-01"))
 
-out_arx_lr <- arx_forecaster(employ, "med_income_2y",
-  c("med_income_2y", "med_income_5y", "num_graduates"),
+out_arx_lr <- arx_forecaster(employ_small, "med_income_2y",
+  c("med_income_2y", "med_income_5y"),
   args_list = arx_args)
 
-out_arx_lr$predictions
+out_arx_lr$predictions %>% head()
 ```
 
 Other changes to the direct AR forecaster, like changing the engine, also work
-as expected.
+as expected. Below we use a boosted tree model instead of a linear regression.
 
-```{r arx-rf, include=F, warning=F}
+```{r arx-rf, include=T, warning=F}
 out_arx_rf <- arx_forecaster(
-  employ, "med_income_2y", c("med_income_2y", "med_income_5y", "num_graduates"),
+  employ_small, "med_income_2y", c("med_income_2y", "med_income_5y"),
   trainer = parsnip::boost_tree(mode = "regression", trees = 20),
   args_list = arx_args)
 
-out_arx_rf$predictions
+out_arx_rf$predictions %>% head()
 ```
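The same pattern extends to other regression engines. For example, a random forest trainer could be swapped in as sketched below; this assumes the `ranger` package (parsnip's default engine for `rand_forest()`) is installed, and the object name is hypothetical, not part of this commit:

```r
# Sketch: the direct AR forecaster with a random forest trainer instead of
# boosted trees, keeping the same outcome, predictors, and args_list.
out_arx_ranger <- arx_forecaster(
  employ_small, "med_income_2y", c("med_income_2y", "med_income_5y"),
  trainer = parsnip::rand_forest(mode = "regression", trees = 100),
  args_list = arx_args)

out_arx_ranger$predictions %>% head()
```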
