Merge pull request #148 from cmu-delphi/fix-epirecipe-var_info-bug

dajmcdon · web-flow · commit 91bd0b1d3651 · 2022-11-16T13:23:10.000-08:00
Fix epirecipe var info bug
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -67,4 +67,4 @@ Config/testthat/edition: 3
 Encoding: UTF-8
 LazyData: true
 Roxygen: list(markdown = TRUE)
-RoxygenNote: 7.2.1
+RoxygenNote: 7.2.2
diff --git a/R/epi_recipe.R b/R/epi_recipe.R
@@ -113,7 +113,7 @@ epi_recipe.epi_df <-
     )
 
     ## Add types
-    var_info <- dplyr::full_join(get_types(x), var_info, by = "variable")
+    var_info <- dplyr::full_join(recipes:::get_types(x), var_info, by = "variable")
     var_info$source <- "original"
 
     ## arrange to easy order
@@ -371,7 +371,7 @@ prep.epi_recipe <- function(
     dplyr::group_by(variable) %>%
     dplyr::arrange(dplyr::desc(number)) %>%
     dplyr::summarise(
-      type = dplyr::first(type),
+      type = list(dplyr::first(type)),
       role = as.list(unique(unlist(role))),
       source = dplyr::first(source),
       number = dplyr::first(number),
diff --git a/tests/testthat/test-epi_recipe.R b/tests/testthat/test-epi_recipe.R
@@ -36,19 +36,21 @@ test_that("epi_recipe formula works", {
   r <- epi_recipe(y ~ x, tib)
   ref_var_info <- tibble::tribble(
     ~ variable, ~ type, ~ role, ~ source,
-    "x", "numeric", "predictor", "original",
-    "y", "numeric", "outcome", "original",
+    "x", c("integer", "numeric"), "predictor", "original",
+    "y", c("integer", "numeric"), "outcome", "original",
     "time_value", "date", "time_value", "original",
-    "geo_value", "nominal", "geo_value", "original"
+    "geo_value", c("string", "unordered", "nominal"), "geo_value", "original"
   )
   expect_identical(r$var_info, ref_var_info)
   expect_equal(nrow(r$template), 1L)
 
   # with an epi_key as a predictor
   r <- epi_recipe(y ~ x + geo_value, tib)
-  ref_var_info <- ref_var_info %>% tibble::add_row(
-    variable = "geo_value", type = "nominal", role = "predictor",
-    source = "original", .after = 1)
+  ref_var_info <- ref_var_info %>%
+    tibble::add_row(
+      variable = "geo_value", type = list(c("string", "unordered", "nominal")),
+      role = "predictor",
+      source = "original", .after = 1)
   expect_identical(r$var_info, ref_var_info)
   expect_equal(nrow(r$template), 1L)
 
@@ -61,11 +63,13 @@ test_that("epi_recipe formula works", {
 
   # with an additional key
   r <- epi_recipe(y ~ x + geo_value, tib)
-  ref_var_info <- ref_var_info %>% tibble::add_row(
-    variable = "z", type = "nominal", role = "key",
-    source = "original")
+  ref_var_info <- ref_var_info %>%
+    tibble::add_row(
+      variable = "z", type = list(c("string", "unordered", "nominal")),
+      role = "key",
+      source = "original")
 
-  #expect_identical(r$var_info, ref_var_info)
+  expect_identical(r$var_info, ref_var_info)
 
 })
 
@@ -81,20 +85,20 @@ test_that("epi_recipe epi_df works", {
   ref_var_info <- tibble::tribble(
     ~ variable, ~ type, ~ role, ~ source,
     "time_value", "date", "time_value", "original",
-    "geo_value", "nominal", "geo_value", "original",
-    "x", "numeric", "raw", "original",
-    "y", "numeric", "raw", "original"
+    "geo_value", c("string", "unordered", "nominal"), "geo_value", "original",
+    "x", c("integer", "numeric"), "raw", "original",
+    "y", c("integer", "numeric"), "raw", "original"
   )
   expect_identical(r$var_info, ref_var_info)
   expect_equal(nrow(r$template), 1L)
 
   r <- epi_recipe(tib, formula = y ~ x)
   ref_var_info <- tibble::tribble(
     ~ variable, ~ type, ~ role, ~ source,
-    "x", "numeric", "predictor", "original",
-    "y", "numeric", "outcome", "original",
+    "x", c("integer", "numeric"), "predictor", "original",
+    "y", c("integer", "numeric"), "outcome", "original",
     "time_value", "date", "time_value", "original",
-    "geo_value", "nominal", "geo_value", "original"
+    "geo_value", c("string", "unordered", "nominal"), "geo_value", "original"
   )
   expect_identical(r$var_info, ref_var_info)
   expect_equal(nrow(r$template), 1L)
@@ -106,7 +110,7 @@ test_that("epi_recipe epi_df works", {
   )
   ref_var_info <- ref_var_info %>%
     tibble::add_row(
-      variable = "time_value", type = "date", role = "funny_business",
+      variable = "time_value", type = list("date"), role = "funny_business",
       source = "original"
     )
   expect_identical(r$var_info, ref_var_info)
diff --git a/vignettes/epipredict.Rmd b/vignettes/epipredict.Rmd
@@ -154,7 +154,7 @@ out_q <- arx_forecaster(jhu, "death_rate", c("case_rate", "death_rate"),
 The column `.pred_distn` in the `predictions` object is actually a "distribution" here parameterized by its quantiles. For this default forecaster, these are created using the quantiles of the residuals of the predictive model (possibly symmetrized). Here, we used 23 quantiles, but one can grab a particular quantile:
 
 ```{r q1}
-quantile(out_q$predictions$.pred_distn, p = .4)
+head(quantile(out_q$predictions$.pred_distn, p = .4))
 ```
 
 Or extract the entire distribution into a "long" `epi_df` with `tau` being the probability and `q` being the value associated with that quantile.
@@ -172,7 +172,7 @@ Further simple adjustments can be made using the function.
 arx_args_list(
   lags = c(0L, 7L, 14L), ahead = 7L, min_train_window = 20L, 
   forecast_date = NULL, target_date = NULL, levels = c(0.05, 0.95),
-  symmetrize = TRUE, nonneg = TRUE, quantile_by_key = TRUE
+  symmetrize = TRUE, nonneg = TRUE, quantile_by_key = "geo_value"
 )
 ```
 
diff --git a/vignettes/preprocessing-and-models.Rmd b/vignettes/preprocessing-and-models.Rmd
@@ -549,14 +549,16 @@ Notice the difference in number of rows `b1` and `b2` returns. This is because
 the second version, the one that doesn't use `step_epi_ahead` and `step_epi_lag`,
 has omitted dates compared to the one that used the `epipredict` functions.
 ```{r}
-dates_used_in_training1 <-
- b1 %>% select(- ahead_7_death_rate) %>% na.omit() %>% select(time_value)
-
+dates_used_in_training1 <- b1 %>% 
+  select(- ahead_7_death_rate) %>% 
+  na.omit() %>% 
+  select(time_value)
 dates_used_in_training1
 
-dates_used_in_training2 <- 
-  b2 %>% select(- ahead7death_rate) %>% na.omit() %>% select(time_value)
-
+dates_used_in_training2 <- b2 %>% 
+  select(- ahead7death_rate) %>% 
+  na.omit() %>% 
+  select(time_value)
 dates_used_in_training2
 ```