Skip to content

Commit 4871589

Browse files
committed
bug: blocked by #291
1 parent 009abf5 commit 4871589

File tree

1 file changed

+76
-56
lines changed

1 file changed

+76
-56
lines changed

vignettes/panel-data.Rmd

Lines changed: 76 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -9,10 +9,10 @@ vignette: >
99

1010
```{r setup, include=F}
1111
knitr::opts_chunk$set(
12+
echo = TRUE,
1213
collapse = TRUE,
1314
comment = "#>",
14-
warning = FALSE,
15-
message = FALSE
15+
out.width = "100%"
1616
)
1717
```
1818

@@ -24,6 +24,8 @@ library(recipes)
2424
library(epiprocess)
2525
library(epipredict)
2626
library(ggplot2)
27+
library(lubridate)
28+
theme_set(theme_bw())
2729
```
2830

2931
[Panel data](https://en.wikipedia.org/wiki/Panel_data), or longitudinal data,
@@ -38,16 +40,18 @@ dataset, which contains daily state-wise measures of `case_rate` and
3840
head(case_death_rate_subset, 3)
3941
```
4042

41-
`epipredict` functions work with data in [`epi_df`](
42-
https://cmu-delphi.github.io/epiprocess/reference/epi_df.html)
43+
`epipredict` functions work with data in
44+
[`epi_df`](https://cmu-delphi.github.io/epiprocess/reference/epi_df.html)
4345
format. Despite the stated goal and name of the package, other panel datasets
4446
are also valid candidates for `epipredict` functionality, as long as they are
4547
in `epi_df` format.
4648

4749
```{r employ-stats, include=F}
4850
data("grad_employ_subset")
49-
year_start <- min(grad_employ_subset$time_value)
50-
year_end <- max(grad_employ_subset$time_value)
51+
grad_employ_subset <- grad_employ_subset %>%
52+
mutate(time_value = ymd(paste0(time_value, "0101")))
53+
year_start <- year(min(grad_employ_subset$time_value))
54+
year_end <- year(max(grad_employ_subset$time_value))
5155
```
5256

5357
# Example panel data overview
@@ -90,7 +94,8 @@ gemploy <- statcan_grad_employ %>%
9094
select(c(
9195
"REF_DATE", "GEO", "VALUE", "STATUS", "Educational qualification",
9296
"Field of study", "Gender", "Age group", "Status of student in Canada",
93-
"Characteristics after graduation", "Graduate statistics")) %>%
97+
"Characteristics after graduation", "Graduate statistics"
98+
)) %>%
9499
rename(
95100
"geo_value" = "GEO",
96101
"time_value" = "REF_DATE",
@@ -102,7 +107,8 @@ gemploy <- statcan_grad_employ %>%
102107
"age_group" = "Age group",
103108
"student_status" = "Status of student in Canada",
104109
"grad_charac" = "Characteristics after graduation",
105-
"grad_stat" = "Graduate statistics") %>%
110+
"grad_stat" = "Graduate statistics"
111+
) %>%
106112
# The original `VALUE` column contain the statistic indicated by
107113
# `Graduate statistics` in the original data. Below we pivot the data
108114
# wider so that each unique statistic can have its own column.
@@ -115,7 +121,8 @@ gemploy <- statcan_grad_employ %>%
115121
`Median employment income five years after graduation` = "med_income_5y"
116122
),
117123
# They are originally strings but want ints for conversion to epi_df later
118-
time_value = as.integer(time_value)) %>%
124+
time_value = as.integer(time_value)
125+
) %>%
119126
pivot_wider(names_from = grad_stat, values_from = value) %>%
120127
filter(
121128
# Drop aggregates for some columns
@@ -135,7 +142,8 @@ gemploy <- statcan_grad_employ %>%
135142
# Only keep "good" data
136143
is.na(status) &
137144
# Drop NA value rows
138-
!is.na(num_graduates) & !is.na(med_income_2y) & !is.na(med_income_5y)) %>%
145+
!is.na(num_graduates) & !is.na(med_income_2y) & !is.na(med_income_5y)
146+
) %>%
139147
select(-c(status, gender, student_status, grad_charac, fos))
140148
```
141149

@@ -154,10 +162,12 @@ a list of all the `type_type`s available.
154162
grad_employ_subset <- gemploy %>%
155163
tsibble::as_tsibble(
156164
index = time_value,
157-
key = c(geo_value, age_group, edu_qual)) %>%
165+
key = c(geo_value, age_group, edu_qual)
166+
) %>%
158167
as_epi_df(
159168
geo_type = "custom", time_type = "year",
160-
additional_metadata = c(other_keys = list("age_group", "edu_qual")))
169+
additional_metadata = c(other_keys = list("age_group", "edu_qual"))
170+
)
161171
```
162172

163173
```{r data-dim, include=F}
@@ -169,7 +179,7 @@ Now, we are ready to use `grad_employ_subset` with `epipredict`.
169179
Our `epi_df` contains `r employ_rowcount` rows and `r employ_colcount` columns.
170180
Here is a quick summary of the columns in our `epi_df`:
171181

172-
* `time_value` (time value): year in YYYY format
182+
* `time_value` (time value): year in `date` format
173183
* `geo_value` (geo value): province in Canada
174184
* `num_graduates` (raw, time series value): number of graduates
175185
* `med_income_2y` (raw, time series value): median employment income 2 years
@@ -208,7 +218,8 @@ employ_small <- employ %>%
208218
mutate(
209219
num_graduates_prop = num_graduates / sum(num_graduates),
210220
med_income_2y_prop = med_income_2y / sum(med_income_2y),
211-
med_income_5y_prop = med_income_5y / sum(med_income_5y)) %>%
221+
med_income_5y_prop = med_income_5y / sum(med_income_5y)
222+
) %>%
212223
ungroup()
213224
head(employ_small)
214225
```
@@ -226,7 +237,8 @@ employ_small %>%
226237
facet_grid(rows = vars(edu_qual), cols = vars(age_group)) +
227238
xlab("Year") +
228239
ylab("# of graduates as proportion of sum within group") +
229-
ggtitle("Trend in # of Graduates by Age Group and Education in BC and ON")
240+
ggtitle("Trend in # of Graduates by Age Group and Education in BC and ON") +
241+
theme(legend.position = "bottom")
230242
```
231243

232244
We will predict the "standardized" number of graduates (a proportion) in the
@@ -254,8 +266,8 @@ values are both in years.
254266

255267
```{r make-recipe, include=T, eval=T}
256268
r <- epi_recipe(employ_small) %>%
257-
step_epi_ahead(num_graduates_prop, ahead = 1) %>% # lag & ahead units in years
258-
step_epi_lag(num_graduates_prop, lag = 0:2) %>%
269+
step_epi_ahead(num_graduates_prop, ahead = 365) %>% # lag & ahead units in days
270+
step_epi_lag(num_graduates_prop, lag = 0:2 * 365) %>%
259271
step_epi_naomit()
260272
r
261273
```
@@ -265,11 +277,11 @@ and `ahead` columns.
265277

266278
```{r view-preprocessed, include=T}
267279
# Display a sample of the preprocessed data
268-
bake_and_show_sample <- function(recipe, new_data, n=5) {
269-
recipe %>% prep() %>% bake(new_data = new_data) %>% sample_n(n)
280+
bake_and_show_sample <- function(recipe, data, n = 5) {
281+
recipe %>% prep(data) %>% bake(new_data = data) %>% sample_n(n)
270282
}
271283
272-
bake_and_show_sample(r, employ_small)
284+
r %>% bake_and_show_sample(employ_small)
273285
```
274286

275287
We can see that the `prep` and `bake` steps created new columns according to
@@ -337,7 +349,8 @@ First, we'll plot the residuals (that is, $y_{t} - \hat{y}_{t}$) against the
337349
fitted values ($\hat{y}_{t}$).
338350

339351
```{r lienarreg-resid-plot, include=T, fig.height=8}
340-
par(mfrow = c(2,2)); plot(extract_fit_engine(wf_linreg))
352+
par(mfrow = c(2, 2), mar = c(5, 3.5, 1, 1) + .5)
353+
plot(extract_fit_engine(wf_linreg))
341354
```
342355

343356
The fitted values vs. residuals plot shows us that the residuals are mostly
@@ -381,14 +394,14 @@ $z_i$ is the number of graduates (proportion) at time $i$.
381394
Again, we construct an `epi_recipe` detailing the preprocessing steps.
382395

383396
```{r custom-arx, include=T}
384-
rx <- epi_recipe(employ_small) %>%
385-
step_epi_ahead(med_income_5y_prop, ahead = 1) %>%
386-
# 5-year median income has 3 lags c(0,1,2)
387-
step_epi_lag(med_income_5y_prop, lag = c(0,1,2)) %>%
388-
# But the two exogenous variables have 2 lags c(0,1)
389-
step_epi_lag(med_income_2y_prop, lag = c(0,1)) %>%
390-
step_epi_lag(num_graduates_prop, lag = c(0,1)) %>%
391-
step_epi_naomit()
397+
rx <- epi_recipe(employ_small) %>%
398+
step_epi_ahead(med_income_5y_prop, ahead = 1) %>%
399+
# 5-year median income has 3 lags c(0,1,2)
400+
step_epi_lag(med_income_5y_prop, lag = c(0, 1, 2)) %>%
401+
# But the two exogenous variables have 2 lags c(0,1)
402+
step_epi_lag(med_income_2y_prop, lag = c(0, 1)) %>%
403+
step_epi_lag(num_graduates_prop, lag = c(0, 1)) %>%
404+
step_epi_naomit()
392405
393406
bake_and_show_sample(rx, employ_small)
394407
```
@@ -414,20 +427,25 @@ quantify the uncertainty associated with future predicted values.
414427
# Only have to include med_income_5y since that is our outcome
415428
totals <- employ_small %>%
416429
group_by(geo_value, age_group, edu_qual) %>%
417-
summarise(med_income_5y_tot = sum(med_income_5y))
430+
summarise(med_income_5y_tot = sum(med_income_5y), .groups = "keep")
418431
419-
# Define post-processing steps
432+
# Define post-processing steps
420433
f <- frosting() %>%
421434
layer_predict() %>%
422-
layer_naomit(.pred) %>%
423-
layer_threshold(.pred, lower = 0) %>%
435+
layer_naomit(.pred) %>%
436+
layer_threshold(.pred, lower = 0) %>%
424437
# 90% prediction interval
425-
layer_residual_quantiles(probs = c(0.05, 0.95), symmetrize = F) %>%
438+
layer_residual_quantiles(
439+
quantile_levels = c(0.05, 0.95),
440+
symmetrize = FALSE
441+
) %>%
426442
layer_population_scaling(
427-
.pred, .pred_distn, df = totals, df_pop_col = "med_income_5y_tot")
428-
429-
wfx_linreg <- epi_workflow(rx, parsnip::linear_reg()) %>%
430-
fit(employ_small) %>%
443+
.pred, .pred_distn,
444+
df = totals, df_pop_col = "med_income_5y_tot"
445+
)
446+
447+
wfx_linreg <- epi_workflow(rx, parsnip::linear_reg()) %>%
448+
fit(employ_small) %>%
431449
add_frosting(f)
432450
433451
summary(extract_fit_engine(wfx_linreg))
@@ -440,20 +458,19 @@ confidence level. Both lags for the number of graduates were insigificant.
440458

441459
Let's take a look at the predictions along with their 90% prediction intervals.
442460

443-
```{r}
461+
```{r}
444462
latest <- get_test_data(recipe = rx, x = employ_small)
445463
predsx <- predict(wfx_linreg, latest)
446464
447465
# Display values within prediction intervals
448466
predsx %>%
449-
select(
450-
geo_value, time_value, edu_qual, age_group,
451-
.pred_scaled, .pred_distn_scaled) %>%
452-
dplyr::mutate(.quantiles = nested_quantiles(.pred_distn_scaled)) %>%
453-
tidyr::unnest(.quantiles) %>%
454-
pivot_wider(names_from = tau, values_from = q) %>%
455-
head()
456-
```
467+
select(
468+
geo_value, time_value, edu_qual, age_group, fos,
469+
.pred_scaled, .pred_distn_scaled
470+
) %>%
471+
head() %>%
472+
pivot_quantiles_wider(.pred_distn_scaled)
473+
```
457474

458475
# Using canned forecasters
459476

@@ -486,11 +503,11 @@ where $y_i$ is the 2-year median income (proportion) at time $i$.
486503
```{r flatline, include=T, warning=F}
487504
out_fl <- flatline_forecaster(employ_small, "med_income_2y_prop",
488505
args_list = flatline_args_list(
489-
ahead = 1L, forecast_date = as.Date("2015-01-01")))
506+
ahead = 365L, forecast_date = as.Date("2015-01-01"),
507+
)
508+
)
490509
491-
# The first argument to augment grabs the epi_workflow object from the
492-
# forecaster output.
493-
augment(out_fl$epi_workflow, employ_small) %>% sample_n(5)
510+
out_fl
494511
```
495512

496513
## Autoregressive forecaster with exogenous inputs
@@ -507,13 +524,15 @@ same number of lags.
507524

508525
```{r arx-lr, include=T, warning=F}
509526
arx_args <- arx_args_list(
510-
lags = c(0L, 1L), ahead = 1L, forecast_date = as.Date("2015-01-01"))
527+
lags = c(0L, 365L), ahead = 365L, forecast_date = as.Date("2015-01-01")
528+
)
511529
512530
out_arx_lr <- arx_forecaster(employ_small, "med_income_5y_prop",
513531
c("med_income_5y_prop", "med_income_2y_prop", "num_graduates_prop"),
514-
args_list = arx_args)
532+
args_list = arx_args
533+
)
515534
516-
augment(out_arx_lr$epi_workflow, employ_small) %>% sample_n(5)
535+
out_arx_lr
517536
```
518537

519538
Other changes to the direct AR forecaster, like changing the engine, also work
@@ -524,7 +543,8 @@ out_arx_rf <- arx_forecaster(
524543
employ_small, "med_income_5y_prop",
525544
c("med_income_5y_prop", "med_income_2y_prop", "num_graduates_prop"),
526545
trainer = parsnip::boost_tree(mode = "regression", trees = 20),
527-
args_list = arx_args)
546+
args_list = arx_args
547+
)
528548
529-
augment(out_arx_rf$epi_workflow, employ_small) %>% sample_n(5)
549+
out_arx_rf
530550
```

0 commit comments

Comments
 (0)