cmu-delphi
diff --git a/‎NAMESPACE
Lines changed: 0 additions & 1 deletion b/‎NAMESPACE
Lines changed: 0 additions & 1 deletion
diff --git a/‎R/data.R
Lines changed: 1 addition & 2 deletions b/‎R/data.R
Lines changed: 1 addition & 2 deletions
diff --git a/‎data-raw/grad_employ_subset.R
Lines changed: 4 additions & 7 deletions b/‎data-raw/grad_employ_subset.R
Lines changed: 4 additions & 7 deletions
diff --git a/‎data/grad_employ_subset.rda
-31.4 KB b/‎data/grad_employ_subset.rda
-31.4 KB
diff --git a/‎man/bake.Rd
Lines changed: 6 additions & 5 deletions b/‎man/bake.Rd
Lines changed: 6 additions & 5 deletions
diff --git a/‎man/epi_juice.Rd
Lines changed: 3 additions & 2 deletions b/‎man/epi_juice.Rd
Lines changed: 3 additions & 2 deletions
diff --git a/‎man/grad_employ_subset.Rd
Lines changed: 1 addition & 2 deletions b/‎man/grad_employ_subset.Rd
Lines changed: 1 addition & 2 deletions
diff --git a/‎vignettes/panel-data.Rmd
Lines changed: 11 additions & 27 deletions b/‎vignettes/panel-data.Rmd
Lines changed: 11 additions & 27 deletions
@@ -137,6 +137,5 @@ importFrom(stats,predict)
 importFrom(stats,qnorm)
 importFrom(stats,quantile)
 importFrom(stats,residuals)
-importFrom(stats,setNames)
 importFrom(tibble,is_tibble)
 importFrom(tibble,tibble)
@@ -36,13 +36,12 @@
 
 #' Subset of Statistics Canada median employment income for postsecondary graduates
 #' 
-#' @format A tibble with 10193 rows and 8 variables:
+#' @format A tibble with 1607 rows and 7 variables:
 #' \describe{
 #'   \item{geo_value}{The province in Canada associated with each 
 #'      row of measurements.}
 #'   \item{time_value}{The time value, a year integer in YYYY format}
 #'   \item{edu_qual}{The education qualification}
-#'   \item{fos}{The field of study}
 #'   \item{age_group}{The age group; either 15 to 34 or 35 to 64}
 #'   \item{num_graduates}{The number of graduates for the given row of characteristics}
 #'   \item{med_income_2y}{The median employment income two years after graduation}
 
@@ -74,9 +74,9 @@ gemploy <- statcan_grad_employ %>%
     # Drop aggregates for some columns
     geo_value != "Canada" & 
     age_group != "15 to 64 years" &
-    fos != "Total, field of study" &
     edu_qual != "Total, educational qualification" &
     # Keep only the aggregate rows for columns we will not use as keys
+    fos == "Total, field of study" &
     gender == "Total, gender" &
     student_status == "Canadian and international students" &
     # Since we're looking at 2y and 5y employment income, the only 
@@ -89,7 +89,7 @@ gemploy <- statcan_grad_employ %>%
     is.na(status) & 
     # Drop NA value rows 
     !is.na(num_graduates) & !is.na(med_income_2y) & !is.na(med_income_5y)) %>%
-  select(-c(status, gender, student_status, grad_charac))
+  select(-c(status, gender, student_status, grad_charac, fos))
 
 # gemploy$time_value %>% unique()
 # class(gemploy$fos)
@@ -99,16 +99,13 @@ gemploy <- statcan_grad_employ %>%
 nrow(gemploy)
 ncol(gemploy)
 
-gemploy$grad_charac %>% unique()
-gemploy %>% group_by(grad_charac) %>% slice(1)
-
 grad_employ_subset <- gemploy %>%
   tsibble::as_tsibble(
     index=time_value, 
-    key=c(geo_value, age_group, fos, edu_qual)) %>%
+    key=c(geo_value, age_group, edu_qual)) %>%
   as_epi_df(
     geo_type = "custom", time_type = "year", as_of = "2022-07-19",
-    additional_metadata=c(other_keys=list("age_group", "fos", "edu_qual")))
+    additional_metadata = list(other_keys = c("age_group", "edu_qual")))
 usethis::use_data(grad_employ_subset, overwrite = TRUE)
 
 # ================== EDA ==================
 
@@ -122,9 +122,9 @@ gemploy <- statcan_grad_employ %>%
     # Drop aggregates for some columns
     geo_value != "Canada" &
       age_group != "15 to 64 years" &
-      fos != "Total, field of study" &
       edu_qual != "Total, educational qualification" &
       # Keep only the aggregate rows for columns we will not use as keys
+      fos == "Total, field of study" &
       gender == "Total, gender" &
       student_status == "Canadian and international students" &
       # Since we're looking at 2y and 5y employment income, the only
@@ -138,7 +138,7 @@ gemploy <- statcan_grad_employ %>%
       # Drop NA value rows
       !is.na(num_graduates) & !is.na(med_income_2y) & !is.na(med_income_5y)
   ) %>%
-  select(-c(status, gender, student_status, grad_charac))
+  select(-c(status, gender, student_status, grad_charac, fos))
 ```
 
 To use this data with `epipredict`, we need to convert it into `epi_df` format 
@@ -156,11 +156,11 @@ a list of all the `type_type`s available.
 grad_employ_subset <- gemploy %>%
   tsibble::as_tsibble(
     index = time_value,
-    key = c(geo_value, age_group, fos, edu_qual)
+    key = c(geo_value, age_group, edu_qual)
   ) %>%
   as_epi_df(
     geo_type = "custom", time_type = "year",
-    additional_metadata = c(other_keys = list("age_group", "fos", "edu_qual"))
+    # `other_keys` must be one character vector inside a list. Writing
+    # `c(other_keys = list(...))` would instead produce separate
+    # `other_keys1`, `other_keys2` entries and no `other_keys` element.
+    additional_metadata = list(other_keys = c("age_group", "edu_qual"))
   )
 ```
 
@@ -202,34 +202,24 @@ As a simple example, let's work with the `num_graduates` column for now.
 
 ```{r employ-small, include=T}
 employ_small <- employ %>%
-  group_by(geo_value, time_value, age_group, edu_qual) %>%
-  summarise_if(is.numeric, sum) %>%
-  ungroup() %>%
   # Incomplete data - exclude
   filter(geo_value != "Territories") %>%
   # Select groups where there are complete timeseries values
   group_by(geo_value, age_group, edu_qual) %>%
   filter(n() >= 6) %>%
   mutate(
-    num_graduates_prop = num_graduates / sum(num_graduates)
-  ) %>%
-  # med_income_2y_prop = med_income_2y / sum(med_income_2y),
-  # med_income_5y_prop = med_income_5y / sum(med_income_5y)) %>%
-  ungroup() %>%
-  # select(-c(med_income_2y, med_income_5y, num_graduates)) %>%
-  # Bug: shouldn't have to cast back to epi_df
-  as_epi_df(
-    geo_type = "custom",
-    time_type = "year",
-    additional_metadata = c(other_keys = list("age_group", "edu_qual")))
+    num_graduates_prop = num_graduates / sum(num_graduates),
+    med_income_2y_prop = med_income_2y / sum(med_income_2y),
+    med_income_5y_prop = med_income_5y / sum(med_income_5y)) %>%
+  ungroup()
 head(employ_small)
 ```
 
 Below is a visualization for a sample of the small data. Note that some groups 
 do not appear at all, since we filtered out every time series that lacks a 
 complete set of yearly observations.
 
-```{r employ-small-graph, include=F, eval=F}
+```{r employ-small-graph, include=T, eval=T}
 employ_small %>%
   filter(geo_value %in% c("British Columbia", "Ontario")) %>%
   filter(grepl("degree", edu_qual, fixed = T)) %>%
@@ -263,13 +253,7 @@ Our `epi_recipe` should add one `ahead` column representing $x_{t+1}$ and
 since we specified our `time_type` to be `year`, our `lag` and `lead`
 values are both in years. 
 
-```{r make-recipe, include=T, eval=F}
-# r <- epi_recipe(employ) %>%
-#   step_epi_ahead(num_graduates, ahead = 1) %>% # lag & ahead units in years
-#   step_epi_lag(num_graduates, lag = 0:2) %>%
-#   step_epi_naomit()
-# r
-
+```{r make-recipe, include=T, eval=T}
 r <- epi_recipe(employ_small) %>%
   step_epi_ahead(num_graduates_prop, ahead = 1) %>% # lag & ahead units in years
   step_epi_lag(num_graduates_prop, lag = 0:2) %>%
@@ -290,7 +274,7 @@ baked_sample <- r %>%
   prep() %>%
   bake(new_data = employ_small) %>%
   sample_n(5)
-# baked_sample
+baked_sample
 ```
 
 We can see that the `prep` and `bake` steps created new columns according to