diff --git a/DESCRIPTION b/DESCRIPTION index bdbc337b4..e1a8b25d8 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -67,4 +67,4 @@ Config/testthat/edition: 3 Encoding: UTF-8 LazyData: true Roxygen: list(markdown = TRUE) -RoxygenNote: 7.2.2 +RoxygenNote: 7.2.3 diff --git a/R/arx_forecaster.R b/R/arx_forecaster.R index 9d537ab24..ad1d02288 100644 --- a/R/arx_forecaster.R +++ b/R/arx_forecaster.R @@ -38,6 +38,9 @@ arx_forecaster <- function(epi_data, # --- validation validate_forecaster_inputs(epi_data, outcome, predictors) + if (!inherits(args_list, "arx_alist")) { + cli_stop("args_list was not created using `arx_args_list().") + } if (!is.list(trainer) || trainer$mode != "regression") cli_stop("{trainer} must be a `parsnip` method of mode 'regression'.") lags <- arx_lags_validator(predictors, args_list$lags) @@ -121,7 +124,7 @@ arx_lags_validator <- function(predictors, lags) { #' [layer_residual_quantiles()] for more information. The default, #' `character(0)` performs no grouping. #' -#' @return A list containing updated parameter choices. +#' @return A list containing updated parameter choices with class `arx_alist`. #' @export #' #' @examples @@ -151,14 +154,15 @@ arx_args_list <- function(lags = c(0L, 7L, 14L), arg_is_probabilities(levels, allow_null = TRUE) max_lags <- max(lags) - enlist(lags = .lags, - ahead, - min_train_window, - levels, - forecast_date, - target_date, - symmetrize, - nonneg, - max_lags, - quantile_by_key) + structure(enlist(lags = .lags, + ahead, + min_train_window, + levels, + forecast_date, + target_date, + symmetrize, + nonneg, + max_lags, + quantile_by_key), + class = "arx_alist") } diff --git a/R/flatline_forecaster.R b/R/flatline_forecaster.R index 13f0c1f53..af2638c8a 100644 --- a/R/flatline_forecaster.R +++ b/R/flatline_forecaster.R @@ -32,6 +32,9 @@ flatline_forecaster <- function(epi_data, args_list = flatline_args_list()) { validate_forecaster_inputs(epi_data, outcome, "time_value") + if (!inherits(args_list, "flatline_alist")) { + cli_stop("args_list was not created using `flatline_args_list().") + } keys <- epi_keys(epi_data) ek <- keys[-1] outcome <- rlang::sym(outcome) @@ -75,7 +78,7 @@ flatline_forecaster <- function(epi_data, #' #' @inheritParams arx_args_list #' -#' @return A list containing updated parameter choices. +#' @return A list containing updated parameter choices with class `flatline_alist`. #' @export #' #' @examples @@ -99,14 +102,15 @@ flatline_args_list <- function(ahead = 7L, arg_is_lgl(symmetrize, nonneg) arg_is_probabilities(levels, allow_null = TRUE) - enlist(ahead, - min_train_window, - forecast_date, - target_date, - levels, - symmetrize, - nonneg, - quantile_by_key) + structure(enlist(ahead, + min_train_window, + forecast_date, + target_date, + levels, + symmetrize, + nonneg, + quantile_by_key), + class = "flatline_alist") } validate_forecaster_inputs <- function(epi_data, outcome, predictors) { diff --git a/README.Rmd b/README.Rmd index adc4c7dad..d79e7bc69 100644 --- a/README.Rmd +++ b/README.Rmd @@ -19,15 +19,15 @@ knitr::opts_chunk$set( [![R-CMD-check](https://github.com/cmu-delphi/epipredict/workflows/R-CMD-check/badge.svg)](https://github.com/cmu-delphi/epipredict/actions) -**Note:** This package is currently in development and likely will not work as expected. +**Note:** This package is currently in development and may not work as expected. Please file bug reports as issues in this repo, and we will do our best to address them quickly. ## Installation You can install the development version of epipredict from [GitHub](https://github.com/) with: ``` r -# install.packages("devtools") -devtools::install_github("cmu-delphi/epipredict") +# install.packages("remotes") +remotes::install_github("cmu-delphi/epipredict") ``` ## Documentation @@ -39,7 +39,7 @@ You can view documentation for the `main` branch at diff --git a/README.md b/README.md index 538a571ae..689c69076 100644 --- a/README.md +++ b/README.md @@ -8,8 +8,9 @@ [![R-CMD-check](https://github.com/cmu-delphi/epipredict/workflows/R-CMD-check/badge.svg)](https://github.com/cmu-delphi/epipredict/actions) -**Note:** This package is currently in development and likely will not -work as expected. +**Note:** This package is currently in development and may not work as +expected. Please file bug reports as issues in this repo, and we will do +our best to address them quickly. ## Installation @@ -17,8 +18,8 @@ You can install the development version of epipredict from [GitHub](https://github.com/) with: ``` r -# install.packages("devtools") -devtools::install_github("cmu-delphi/epipredict") +# install.packages("remotes") +remotes::install_github("cmu-delphi/epipredict") ``` ## Documentation @@ -32,48 +33,173 @@ You can view documentation for the `main` branch at 1. A set of basic, easy-to-use forecasters that work out of the box. You should be able to do a reasonably limited amount of - customization on them. (Any serious customization happens with point - number 2.) For the basic forecasters, we should provide, at least: - - Baseline flat-line forecaster - - Autoregressive forecaster - - Autoregressive classifier + customization on them. For the basic forecasters, we currently + provide: + - Baseline flat-line forecaster + - Autoregressive forecaster + - Autoregressive classifier 2. A framework for creating custom forecasters out of modular components. There are four types of components: - - Preprocessor: do things to the data before model training - - Trainer: train a model on data, resulting in a fitted model - object - - Predictor: make predictions, using a fitted model object - - Postprocessor: do things to the predictions before returning - -**Target audience:** - -- Basic. Has data, calls forecaster with default arguments. -- Intermediate. Wants to examine changes to the arguments, take - advantage of built in flexibility. -- Advanced. Wants to write their own forecasters. Maybe willing to - build up from some components that we write. - -The Advanced user should find their task to be relatively easy (and -we’ll show them how). - -**Example:** -During a quiet period, a user decides they want to first predict whether -a surge is about to occur, say using variant information from GISAID. -Then for surging locations, they want to train an AR model using past -surges in the same location. Everywhere else, they predict a flat line. -We should be able to do this in a few lines of code. - -Delphi’s own forecasts have been produced/evaluated in this way for a -while now, but the code base is scattered and evolving. We want to -consolidate, generalize, and simplify to allow others to benefit as -well. - -The basic framework should allow for something like the following. This -would feel very familiar to anyone working in `R`+`{tidyverse}`. + - Preprocessor: do things to the data before model training + - Trainer: train a model on data, resulting in a fitted model object + - Predictor: make predictions, using a fitted model object + - Postprocessor: do things to the predictions before returning -**Simple linear autoregressive model with scaling (modular)** +**Target audiences:** + +- Basic. Has data, calls forecaster with default arguments. +- Intermediate. Wants to examine changes to the arguments, take + advantage of built in flexibility. +- Advanced. Wants to write their own forecasters. Maybe willing to build + up from some components that we write. + +The Advanced user should find their task to be relatively easy. Examples +of these tasks are illustrated in the [vignettes and +articles](https://cmu-delphi.github.io/epipredict). + +## Intermediate example + +The package comes with some built-in historical data for illustration, +but up-to-date versions of this could be downloaded with the +[`{covidcast}` +package](https://cmu-delphi.github.io/covidcast/covidcastR/index.html) +and processed using +[`{epiprocess}`](https://cmu-delphi.github.io/epiprocess/).[^1] + +``` r +library(tidyverse) +library(epipredict) +jhu <- case_death_rate_subset +jhu +#> An `epi_df` object, 20,496 x 4 with metadata: +#> * geo_type = state +#> * time_type = day +#> * as_of = 2022-05-31 12:08:25 +#> +#> # A tibble: 20,496 × 4 +#> geo_value time_value case_rate death_rate +#> * +#> 1 ak 2020-12-31 35.9 0.158 +#> 2 al 2020-12-31 65.1 0.438 +#> 3 ar 2020-12-31 66.0 1.27 +#> 4 as 2020-12-31 0 0 +#> 5 az 2020-12-31 76.8 1.10 +#> 6 ca 2020-12-31 96.0 0.751 +#> 7 co 2020-12-31 35.8 0.649 +#> 8 ct 2020-12-31 52.1 0.819 +#> 9 dc 2020-12-31 31.0 0.601 +#> 10 de 2020-12-31 65.2 0.807 +#> # … with 20,486 more rows +``` + +To create and train a simple auto-regressive forecaster to predict the +death rate two weeks into the future using past (lagged) deaths and +cases, we could use the following function. ``` r +two_week_ahead <- arx_forecaster( + jhu, + outcome = "death_rate", + predictors = c("case_rate", "death_rate"), + args_list = arx_args_list( + lags = list(c(0,1,2,3,7,14), c(0,7,14)), + ahead = 14 + ) +) +``` + +In this case, we have used a number of different lags for the case rate, +while only using 3 weekly lags for the death rate (as predictors). The +result is both a fitted model object which could be used any time in the +future to create different forecasts, as well as a set of predicted +values (and prediction intervals) for each location 14 days after the +last available time value in the data. + +``` r +two_week_ahead$epi_workflow +#> ══ Epi Workflow [trained] ══════════════════════════════════════════════════════ +#> Preprocessor: Recipe +#> Model: linear_reg() +#> Postprocessor: Frosting +#> +#> ── Preprocessor ──────────────────────────────────────────────────────────────── +#> 5 Recipe Steps +#> +#> • step_epi_lag() +#> • step_epi_lag() +#> • step_epi_ahead() +#> • step_naomit() +#> • step_naomit() +#> +#> ── Model ─────────────────────────────────────────────────────────────────────── +#> +#> Call: +#> stats::lm(formula = ..y ~ ., data = data) +#> +#> Coefficients: +#> (Intercept) lag_0_case_rate lag_1_case_rate lag_2_case_rate +#> -0.0073358 0.0030365 0.0012467 0.0009536 +#> lag_3_case_rate lag_7_case_rate lag_14_case_rate lag_0_death_rate +#> 0.0011425 0.0012481 0.0003041 0.1351769 +#> lag_7_death_rate lag_14_death_rate +#> 0.1471127 0.1062473 +#> +#> ── Postprocessor ─────────────────────────────────────────────────────────────── +#> 5 Frosting Layers +#> +#> • layer_predict() +#> • layer_residual_quantiles() +#> • layer_add_forecast_date() +#> • layer_add_target_date() +#> • layer_threshold() +``` + +The fitted model here involved preprocessing the data to appropriately +generate lagged predictors, estimating a linear model with `stats::lm()` +and then postprocessing the results to be meaningful for epidemiological +tasks. We can also examine the predictions. + +``` r +two_week_ahead$predictions +#> An `epi_df` object, 56 x 6 with metadata: +#> * geo_type = state +#> * time_type = day +#> * as_of = 2022-05-31 12:08:25 +#> +#> # A tibble: 56 × 6 +#> geo_value time_value .pred .pred_distn forecast_date target_date +#> * +#> 1 ak 2021-12-31 0.449 [0.05, 0.95] 2021-12-31 2022-01-14 +#> 2 al 2021-12-31 0.574 [0.05, 0.95] 2021-12-31 2022-01-14 +#> 3 ar 2021-12-31 0.673 [0.05, 0.95] 2021-12-31 2022-01-14 +#> 4 as 2021-12-31 0 [0.05, 0.95] 2021-12-31 2022-01-14 +#> 5 az 2021-12-31 0.679 [0.05, 0.95] 2021-12-31 2022-01-14 +#> 6 ca 2021-12-31 0.575 [0.05, 0.95] 2021-12-31 2022-01-14 +#> 7 co 2021-12-31 0.862 [0.05, 0.95] 2021-12-31 2022-01-14 +#> 8 ct 2021-12-31 1.07 [0.05, 0.95] 2021-12-31 2022-01-14 +#> 9 dc 2021-12-31 2.12 [0.05, 0.95] 2021-12-31 2022-01-14 +#> 10 de 2021-12-31 1.09 [0.05, 0.95] 2021-12-31 2022-01-14 +#> # … with 46 more rows +``` + +The results above show a distributional forecast produced using data +through the end of 2021 for the 14th of January 2022. A prediction for +the death rate per 100K inhabitants is available for every state +(`geo_value`) along with a 90% predictive interval. + + + +[^1]: Other epidemiological signals for non-Covid related illnesses are + available with [`{epidatr}`](https://github.com/cmu-delphi/epidatr) + which interfaces directly to Delphi’s [Epidata + API](https://cmu-delphi.github.io/delphi-epidata/) diff --git a/_pkgdown.yml b/_pkgdown.yml index 719d7cc22..dacf88cbf 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -1,6 +1,20 @@ url: https://cmu-delphi.github.io/epipredict/ template: bootstrap: 5 + bootswatch: cosmo + +navbar: + bg: dark + +home: + links: + - text: The epiprocess R package + href: https://cmu-delphi.github.io/epiprocess/ + - text: The covidcast R package + href: https://cmu-delphi.github.io/covidcast/covidcastR/ + - text: The epidatr R package + href: https://github.com/cmu-delphi/epidatr/ + reference: diff --git a/man/arx_args_list.Rd b/man/arx_args_list.Rd index 20a6e778e..d9e87101e 100644 --- a/man/arx_args_list.Rd +++ b/man/arx_args_list.Rd @@ -49,7 +49,7 @@ before calculating residual quantiles. See the \code{by_key} argument to \code{character(0)} performs no grouping.} } \value{ -A list containing updated parameter choices. +A list containing updated parameter choices with class \code{arx_alist}. } \description{ Constructs a list of arguments for \code{\link[=arx_forecaster]{arx_forecaster()}}. diff --git a/man/flatline_args_list.Rd b/man/flatline_args_list.Rd index 2982d0107..1bdb48e0e 100644 --- a/man/flatline_args_list.Rd +++ b/man/flatline_args_list.Rd @@ -45,7 +45,7 @@ before calculating residual quantiles. See the \code{by_key} argument to \code{character(0)} performs no grouping.} } \value{ -A list containing updated parameter choices. +A list containing updated parameter choices with class \code{flatline_alist}. } \description{ Constructs a list of arguments for \code{\link[=flatline_forecaster]{flatline_forecaster()}}. diff --git a/tests/testthat/test-arx_args_list.R b/tests/testthat/test-arx_args_list.R index c540eb3fc..61dd374fe 100644 --- a/tests/testthat/test-arx_args_list.R +++ b/tests/testthat/test-arx_args_list.R @@ -1,4 +1,5 @@ test_that("arx_args checks inputs", { + expect_s3_class(arx_args_list(), "arx_alist") expect_error(arx_args_list(ahead = c(0, 4))) expect_error(arx_args_list(min_train_window = c(28, 65))) diff --git a/vignettes/articles/sliding.Rmd b/vignettes/articles/sliding.Rmd index 5d086b516..eec8b4607 100644 --- a/vignettes/articles/sliding.Rmd +++ b/vignettes/articles/sliding.Rmd @@ -93,10 +93,9 @@ fc_time_values <- seq(as.Date("2020-08-01"), as.Date("2021-12-01"), k_week_ahead <- function(epi_df, outcome, predictors, ahead = 7, engine) { epi_df %>% epi_slide( - ~arx_forecaster( + ~ arx_forecaster( .x, outcome, predictors, engine, - args_list = arx_args_list(ahead = ahead) - ) %>% + args_list = arx_args_list(ahead = ahead)) %>% extract2("predictions") %>% select(-c(geo_value, time_value)), n = 120, @@ -107,19 +106,21 @@ k_week_ahead <- function(epi_df, outcome, predictors, ahead = 7, engine) { mutate(engine_type = engine$engine) } -# Generate the forecasts, and bind them together +# Generate the forecasts and bind them together fc <- bind_rows( purrr::map_dfr( c(7,14,21,28), - ~ k_week_ahead(x_latest, "case_rate", c("case_rate", "percent_cli"), .x, - engine = linear_reg()) - ), + ~ k_week_ahead( + x_latest, "case_rate", c("case_rate", "percent_cli"), .x, + engine = linear_reg()) + ), purrr::map_dfr( c(7,14,21,28), - ~ k_week_ahead(x_latest, "case_rate", c("case_rate", "percent_cli"), .x, - engine = rand_forest(mode = "regression")) - )) %>% - mutate(.pred_distn = nested_quantiles(fc_.pred_distn)) %>% # "nested" list-col + ~ k_week_ahead( + x_latest, "case_rate", c("case_rate", "percent_cli"), .x, + engine = rand_forest(mode = "regression")) + )) %>% + mutate(.pred_distn = nested_quantiles(fc_.pred_distn)) %>% unnest(.pred_distn) %>% pivot_wider(names_from = tau, values_from = q) ``` @@ -127,20 +128,19 @@ fc <- bind_rows( Here, `arx_forecaster()` does all the heavy lifting. It creates leads of the target (respecting time stamps and locations) along with lags of the features (here, the response and doctors visits), estimates a forecasting model using the -specified engine, creates predictions, and non-parametric confidence bands. All -of these are tunable parameters. +specified engine, creates predictions, and non-parametric confidence bands. To see how the predictions compare, we plot them on top of the latest case -rates. Note that even though we've fitted on all states, we'll just display the +rates. Note that even though we've fitted the model on all states, +we'll just display the results for two states, California (CA) and Florida (FL), to get a sense of the -model performance while keeping it simple. So feel free to modify the code to -look over the results for other states. +model performance while keeping the graphic simple. ```{r plot-arx, message = FALSE, warning = FALSE, fig.width = 9, fig.height = 6} fc_cafl <- fc %>% filter(geo_value %in% c("ca", "fl")) x_latest_cafl <- x_latest %>% filter(geo_value %in% c("ca", "fl")) -ggplot(fc_cafl, aes(x = fc_target_date, group = time_value, fill = engine_type)) + +ggplot(fc_cafl, aes(fc_target_date, group = time_value, fill = engine_type)) + geom_line(data = x_latest_cafl, aes(x = time_value, y = case_rate), inherit.aes = FALSE, color = "gray50") + geom_ribbon(aes(ymin = `0.05`, ymax = `0.95`), alpha = 0.4) + @@ -155,14 +155,17 @@ ggplot(fc_cafl, aes(x = fc_target_date, group = time_value, fill = engine_type)) ``` For the two states of interest, simple linear regression clearly performs better -than random forest in terms of accuracy of the predictions and not does not -result in such in overconfident predictions (too narrow confidence bands). -Though, in general, both approaches do not perform great. This could be because +than random forest in terms of accuracy of the predictions and does not +result in such in overconfident predictions (overly narrow confidence bands). +Though, in general, neither approach produces amazingly accurate forecasts. +This could be because the behaviour is rather different across states and the effects of other notable factors such as age and public health measures may be important to account for in such forecasting. Including such factors as well as making enhancements such -as correcting for outliers are some improvements one could make to our simple -model in the future. +as correcting for outliers are some improvements one could make to this simple +model.[^1] + +[^1]: Note that, despite the above caveats, simple models like this tend to out-perform many far more complicated models in the online Covid forecasting due to those models high variance predictions. ### Example using case data from Canada @@ -175,7 +178,7 @@ daily time series data on COVID-19 cases, deaths, recoveries, testing and vaccinations at the health region and province levels. Data are collected from publicly available sources such as government datasets and news releases. Unfortunately, there is no simple versioned source, so we have created our own -from the Commit history. +from the Github commit history. First, we load versioned case rates at the provincial level. After converting these to 7-day averages (due to highly variable provincial reporting diff --git a/vignettes/epipredict.Rmd b/vignettes/epipredict.Rmd index b9a949552..cff1840ed 100644 --- a/vignettes/epipredict.Rmd +++ b/vignettes/epipredict.Rmd @@ -21,14 +21,13 @@ library(parsnip) library(workflows) library(recipes) library(epiprocess) -# remotes::install_github("cmu-delphi/epipredict") library(epipredict) ``` # Goals for the package -At a high level, our goal with `epipredict` is to make running simple Machine Learning / Statistical forecasters for epidemiology easy. However, this package is extremely extensible, and that is part of the utility. Our hope is that it is easy for users with epi training and some statistics to fit baseline models while still allowing those with more nuanced statistical understanding to create complicated specializations within the same framework. +At a high level, our goal with `{epipredict}` is to make running simple Machine Learning / Statistical forecasters for epidemiology easy. However, this package is extremely extensible, and that is part of its utility. Our hope is that it is easy for users with epi training and some statistics to fit baseline models while still allowing those with more nuanced statistical understanding to create complicated specializations using the same framework. Serving both populations is the main motivation for our efforts, but at the same time, we have tried hard to make it useful. @@ -38,7 +37,7 @@ Serving both populations is the main motivation for our efforts, but at the same We provide a set of basic, easy-to-use forecasters that work out of the box. You should be able to do a reasonably limited amount of customization on them. Any serious customization happens with the framework discussed below). -For the basic forecasters, we provide, at least: +For the basic forecasters, we provide: * Baseline flat-line forecaster * Autoregressive forecaster @@ -55,11 +54,11 @@ Our framework for creating custom forecasters views the prediction task as a set 3. Predictor: make predictions, using a fitted model object and processed test data 4. Postprocessor: manipulate or transform the predictions before returning -Users familiar with [`{tidymodels}`](https://www.tidymodels.org) and especially the [`{workflows}`](https://workflows.tidymodels.org) package will notice a lot of overlap. This is by design, and is in fact a feature. The truth is that `epipredict` is a wrapper around much that is contained in these packages. Therefore, if you want something from this -verse, it should "just work" (we hope). +Users familiar with [`{tidymodels}`](https://www.tidymodels.org) and especially the [`{workflows}`](https://workflows.tidymodels.org) package will notice a lot of overlap. This is by design, and is in fact a feature. The truth is that `{epipredict}` is a wrapper around much that is contained in these packages. Therefore, if you want something from this -verse, it should "just work" (we hope). The reason for the overlap is that `{workflows}` _already implements_ the first three steps. And it does this very well. However, it is missing the postprocessing stage and currently has no plans for such an implementation. And this feature is important. The baseline forecaster we provide _requires_ postprocessing. Anything more complicated needs this as well. -The second omission from `{tidymodels}` is support for panel data. Besides epidemiological data, economics, psychology, sociology, and many other areas frequently deal with data of this type. So the framework of behind `epipredict` implements this. In principle, this has nothing to do with epidemiology, and one could simply use this package as a solution for the missing functionality in `{tidymodels}`. Again, this should "just work". +The second omission from `{tidymodels}` is support for panel data. Besides epidemiological data, economics, psychology, sociology, and many other areas frequently deal with data of this type. So the framework of behind `{epipredict}` implements this. In principle, this has nothing to do with epidemiology, and one could simply use this package as a solution for the missing functionality in `{tidymodels}`. Again, this should "just work". All of the _panel data_ functionality is implemented through the `epi_df` data type in the companion [`{epiprocess}`](https://cmu-delphi.github.io/epiprocess/) package. There is much more to see there, but for the moment, it's enough to look at a simple one: @@ -70,7 +69,7 @@ jhu This data is built into the package and contains the measured variables `case_rate` and `death_rate` for COVID-19 at the daily level for each US state for the year 2021. The "panel" part is because we have repeated measurements across a number of locations. -The `epi_df` encodes the timestamp as `time_value` and the `key` as `geo_value`. While these 2 names are required, the values don't need to actually represent such objects. Additional `key`'s are also supported (like age group, ethnicity, taxonomy, etc.). +The `epi_df` encodes the time stamp as `time_value` and the `key` as `geo_value`. While these 2 names are required, the values don't need to actually represent such objects. Additional `key`'s are also supported (like age group, ethnicity, taxonomy, etc.). The `epi_df` also contains some metadata that describes the keys as well as the vintage of the data. It's possible that data collected at different times for the _same set_ of `geo_value`'s and `time_value`'s could actually be different. For more details, see [`{epiprocess}`](https://cmu-delphi.github.io/epiprocess/articles/epiprocess.html). @@ -90,9 +89,9 @@ use almost everything they provide. * The tidy-team doesn't have plans to do either of these things. (We checked). * There are two packages that do _time series_ built on `{tidymodels}`, but it's -"basic" time series: 1-step AR models, exponential smoothing, STL decomposition, etc.[^2] We have never found these models to be particularly helpful for epidemic forecasting, but one could also integrate these methods into our framework. +"basic" time series: 1-step AR models, exponential smoothing, STL decomposition, etc.[^2] Our group has not prioritized these sorts of models for epidemic forecasting, but one could also integrate these methods into our framework. -[^2]: These are [`{timetk}`](https://business-science.github.io/timetk/index.html) and [`{modeltime}`](https://business-science.github.io/timetk/index.html). There are _lots_ of useful methods there than can be used to do fairly complex machine learning methodology. +[^2]: These are [`{timetk}`](https://business-science.github.io/timetk/index.html) and [`{modeltime}`](https://business-science.github.io/timetk/index.html). There are _lots_ of useful methods there than can be used to do fairly complex machine learning methodology, though not directly for panel data and not for direct prediction of future targets. # Show me the basics @@ -103,7 +102,9 @@ We'll estimate the model jointly across all locations using only the most recent ```{r demo-workflow} jhu <- jhu %>% filter(time_value >= max(time_value) - 30) -out <- arx_forecaster(jhu, outcome = "death_rate", +out <- arx_forecaster( + jhu, + outcome = "death_rate", predictors = c("case_rate", "death_rate") ) ``` @@ -117,8 +118,12 @@ The `out` object has two components: out$predictions ``` 2. A list object of class `epi_workflow`. This object encapsulates all the instructions necessary to create the prediction. More details on this below. + ```{r} + out$epi_workflow + ``` -Note that the `time_value` in the predictions is not necessarily meaningful. +Note that the `time_value` in the predictions is not necessarily meaningful, +but it is a required column in an `epi_df`, so it remains here. By default, the forecaster predicts the outcome (`death_rate`) 1-week ahead, using 3 lags of each predictor (`case_rate` and `death_rate`) at 0 (today), 1 week back and 2 weeks back. The predictors and outcome can be changed directly. The rest of the defaults are encapsulated into a list of arguments. This list is produced by `arx_args_list()`. @@ -131,7 +136,10 @@ knitr::opts_chunk$set(warning = FALSE, message = FALSE) ``` ```{r differential-lags} -out2week <- arx_forecaster(jhu, "death_rate", c("case_rate", "death_rate"), +out2week <- arx_forecaster( + jhu, + outcome = "death_rate", + predictors = c("case_rate", "death_rate"), args_list = arx_args_list( lags = list(c(0,1,2,3,7,14), c(0,7,14)), ahead = 14) @@ -140,7 +148,7 @@ out2week <- arx_forecaster(jhu, "death_rate", c("case_rate", "death_rate"), Here, we've used different lags on the `case_rate` and are now predicting 2 weeks ahead. This example also illustrates a major difficulty with the "iterative" versions of AR models. This model doesn't produce forecasts for `case_rate`, and so, would not have data to "plug in" for the necessary lags.[^1] -[^1]: An obvious fix is to instead use a VAR and predict both, but would likely increase the variance, and may lead to less accurate forecasts for the variable of interest. +[^1]: An obvious fix is to instead use a VAR and predict both, but this would likely increase the variance of the model, and therefore, may lead to less accurate forecasts for the variable of interest. Another property of the basic model is the predictive interval. We describe this in more detail in a different vignette, but it is easy to request multiple quantiles. @@ -151,22 +159,22 @@ out_q <- arx_forecaster(jhu, "death_rate", c("case_rate", "death_rate"), ) ``` -The column `.pred_dstn` in the `predictions` object is actually a "distribution" here parameterized by its quantiles. For this default forecaster, these are created using the quantiles of the residuals of the predictive model (possibly symmetrized). Here, we used 23 quantiles, but one can grab a particular quantile +The column `.pred_dstn` in the `predictions` object is actually a "distribution" here parameterized by its quantiles. For this default forecaster, these are created using the quantiles of the residuals of the predictive model (possibly symmetrized). Here, we used 23 quantiles, but one can grab a particular quantile, ```{r q1} head(quantile(out_q$predictions$.pred_distn, p = .4)) ``` -Or extract the entire distribution into a "long" `epi_df` with `tau` being the probability and `q` being the value associated to that quantile. +or extract the entire distribution into a "long" `epi_df` with `tau` being the probability and `q` being the value associated to that quantile. ```{r q2} out_q$predictions %>% - mutate( - .pred_distn = nested_quantiles(.pred_distn) # "nested" list-col - ) %>% unnest(.pred_distn) + # first create a "nested" list-column + mutate(.pred_distn = nested_quantiles(.pred_distn)) %>% + unnest(.pred_distn) # then unnest it ``` -Further simple adjustments can be made using the function. +Additional simple adjustments to the basic forecaster can be made using the function: ```{r, eval = FALSE} arx_args_list( @@ -194,6 +202,15 @@ out_gb <- arx_forecaster(jhu, "death_rate", c("case_rate", "death_rate"), boost_tree(mode = "regression", trees = 20)) ``` +Or quantile regression, using our custom forecasting engine `quantile_reg()`: + +```{r quantreg, warning = FALSE} +out_gb <- arx_forecaster(jhu, "death_rate", c("case_rate", "death_rate"), + quantile_reg()) +``` + +FWIW, this last case (using quantile regression), is not far from what the Delphi production forecast team used for its Covid forecasts over the past few years. + ## Inner workings Underneath the hood, this forecaster creates (and returns) an `epi_workflow`. @@ -215,13 +232,14 @@ there are 2 extensions to the `formula` that `{recipes}` handles: lagging, filtering out rows, handling dummy variables, etc. 2. Using statistics from the training data to eventually process test data. This is a major benefit of `{recipes}`. It prevents what the tidy team calls - "data leakage". A simple example is centering a predictor by it's mean. We + "data leakage". A simple example is centering a predictor by its mean. We need to store the mean of the predictor from the training data and use that value on the test data rather than accidentally calculating the mean of - the test predictor. + the test predictor for centering. A recipe is processed in 2 steps, first it is "prepped". This calculates and -stores the result as necessary for use on the test data. Then it is "baked" +stores any intermediate statistics necessary for use on the test data. +Then it is "baked" resulting in training data ready for passing into a statistical model (like `lm`). We have introduced an `epi_recipe`. It's just a `recipe` that knows how to handle @@ -236,14 +254,14 @@ extract_recipe(out_gb$epi_workflow) The "Inputs" are the original `epi_df` and the "roles" that these are assigned. None of these are predictors or outcomes. Those will be created by the recipe when it is prepped. The "Operations" are the sequence of -instructions to create the cake. -Here we create lagged predictors, lead the outcome, and then remove NA's. -Some models like `lm` internally handle NA's, but not everything does, so we +instructions to create the cake (baked training data). +Here we create lagged predictors, lead the outcome, and then remove `NA`s. +Some models like `lm` internally handle `NA`s, but not everything does, so we deal with them explicitly. The code to do this (inside the forecaster) is ```{r} er <- epi_recipe(jhu) %>% - step_epi_lag(case_rate, death_rate, lag = c(0,7,14)) %>% + step_epi_lag(case_rate, death_rate, lag = c(0, 7, 14)) %>% step_epi_ahead(death_rate, ahead = 7) %>% step_epi_naomit() ``` @@ -251,7 +269,7 @@ er <- epi_recipe(jhu) %>% While `{recipes}` provides a function `step_lag()`, it assumes that the data have no breaks in the sequence of `time_values`. This is a bit dangerous, so we avoid that behaviour. Our `lag/ahead` functions also appropriately adjust the -amount of data to avoid accidently dropping recent predictors from the test +amount of data to avoid accidentally dropping recent predictors from the test data. ### The model specification @@ -267,6 +285,7 @@ any issue despite the fact that these functions couldn't be more different. lm(formula, data, subset, weights, na.action, method = "qr", model = TRUE, x = FALSE, y = FALSE, qr = TRUE, singular.ok = TRUE, contrasts = NULL, offset, ...) + xgboost(data = NULL, label = NULL, missing = NA, weight = NULL, params = list(), nrounds, verbose = 1, print_every_n = 1L, early_stopping_rounds = NULL, maximize = NULL, save_period = NULL, @@ -297,7 +316,7 @@ extract_frosting(out_q$epi_workflow) ``` Here we have 5 layers of frosting. The first generates the forecasts from the test data. -The second uses quantiles of the residuals (by `geo_value`) to create distributional +The second uses quantiles of the residuals to create distributional forecasts. The next two add columns for the date the forecast was made and the date for which it is intended to occur. Because we are predicting rates, they should be non-negative, so the last layer thresholds both predicted values and @@ -307,7 +326,7 @@ intervals at 0. The code to do this (inside the forecaster) is f <- frosting() %>% layer_predict() %>% layer_residual_quantiles( - probs = c(.01,.025, seq(.05,.95, by=.05), .975,.99), + probs = c(.01, .025, seq(.05, .95, by = .05), .975, .99), symmetrize = TRUE) %>% layer_add_forecast_date() %>% layer_add_target_date() %>% @@ -321,12 +340,20 @@ test_data <- get_test_data(er, jhu) ewf %>% add_frosting(f) %>% predict(test_data) ``` +The above `get_test_data()` function examines the recipe and ensures that enough +test data is available to create the necessary lags and produce a prediction +for the desired future time point (after the end of the training data). This mimics +what would happen if `jhu` contained the most recent available historical data and +we wanted to actually predict the future. We could have instead used any test data +that contained the necessary predictors. + + ## Conclusion Internally, we provide some simple functions to create reasonable forecasts. But ideally, a user could create their own forecasters by building up the components we provide. In other vignettes, we try to walk through some of these -customizations. +customizations. To illustrate everything above, here is (roughly) the code for the `flatline_forecaster()` applied to the `case_rate`. @@ -359,4 +386,3 @@ and predicting procedure. preds ``` -## Using the workflow object diff --git a/vignettes/preprocessing-and-models.Rmd b/vignettes/preprocessing-and-models.Rmd index a2d338f06..544b86b20 100644 --- a/vignettes/preprocessing-and-models.Rmd +++ b/vignettes/preprocessing-and-models.Rmd @@ -14,19 +14,19 @@ knitr::opts_chunk$set(echo = TRUE) ## Introduction The `epipredict` package utilizes the `tidymodels` framework, namely -[`recipes`](https://recipes.tidymodels.org/) for +[`{recipes}`](https://recipes.tidymodels.org/) for [dplyr](https://dplyr.tidyverse.org/)-like pipeable sequences -of feature engineering and [`parsnip`](https://parsnip.tidymodels.org/) for a +of feature engineering and [`{parsnip}`](https://parsnip.tidymodels.org/) for a unified interface to a range of models. `epipredict` has additional customized feature engineering and preprocessing -steps, such as `step_epi_shift()`, `step_population_scaling()`, +steps, such as `step_epi_lag()`, `step_population_scaling()`, `step_epi_naomit()`. They can be used along with -steps from the `recipes` package for more feature engineering. +steps from the `{recipes}` package for more feature engineering. In this vignette, we will illustrate some examples of how to use `epipredict` -with `recipes` and `parsnip` for different purposes of epidemiology forecasting. -We will focus on simple basic autoregressive models, in which COVID cases and +with `recipes` and `parsnip` for different purposes of epidemiological forecasting. +We will focus on basic autoregressive models, in which COVID cases and deaths in the near future are predicted using a linear combination of cases and deaths in the near past. @@ -49,7 +49,8 @@ library(poissonreg) ## Poisson Regression -During COVID-19, Center for Disease Control and Prevention (CDC) gathered models +During COVID-19, the US Center for Disease Control and Prevention (CDC) collected +models and forecasts to characterize the state of an outbreak and its course. They use it to inform public health decision makers on potential consequences of deploying control measures. @@ -57,7 +58,7 @@ deploying control measures. One of the outcomes that the CDC forecasts is [death counts from COVID-19](https://www.cdc.gov/coronavirus/2019-ncov/science/forecasting/forecasting-us.html). Although there are many state-of-the-art models, we choose to use Poisson regression, the textbook example for modeling count data, as an illustration -for using the `epipredict` package with other existing tidymodel packages. +for using the `epipredict` package with other existing tidymodels packages. ```{r poisson-reg-data} x <- covidcast( @@ -80,12 +81,11 @@ y <- covidcast( fetch_tbl() %>% select(geo_value, time_value, deaths = value) -case_death_counts_subset <- x %>% - full_join(y, by = c("geo_value", "time_value")) %>% +counts_subset <- full_join(x, y, by = c("geo_value", "time_value")) %>% as_epi_df() ``` -The `case_death_counts_subset` dataset comes from the `epidatr` package,and +The `counts_subset` dataset comes from the `epidatr` package, and contains the number of confirmed cases and deaths from June 4, 2021 to Dec 31, 2021 in some U.S. states. @@ -93,24 +93,19 @@ We wish to predict the 7-day ahead death counts with lagged cases and deaths. Furthermore, we will let each state be a dummy variable. Using differential intercept coefficients, we can allow for an intercept shift between states. -The model takes the form of +The model takes the form \begin{aligned} -\text{log}\left( \mu_{t+7} \right) = \beta_0 + \delta_1 s_{\text{state_1}} + -\delta_2 s_{\text{state_2}} + ... + \beta_1 \text{deaths}_{t} + \nonumber \\ +\log\left( \mu_{t+7} \right) &= \beta_0 + \delta_1 s_{\text{state}_1} + +\delta_2 s_{\text{state}_2} + \cdots + \nonumber \\ &\quad\beta_1 \text{deaths}_{t} + \beta_2 \text{deaths}_{t-7} + \beta_3 \text{cases}_{t} + -\beta_4 \text{cases}_{t-7} +\beta_4 \text{cases}_{t-7}, \end{aligned} - - where $\mu_{t+7} = \mathbb{E}(y_{t+7})$, and $y_{t+7}$ is assumed to follow a Poisson distribution with mean $\mu_{t+7}$; $s_{\text{state}}$ are dummy variables for each state and take values of either 0 or 1. -Preprocessing steps will be performed using the `recipes` functions to get the -data ready for modeling. - -But before diving into them, it will be helpful to understand what `roles` are -in the `recipes` framework. +Preprocessing steps will be performed to prepare the +data for model fitting. But before diving into them, it will be helpful to understand what `roles` are in the `recipes` framework. --- @@ -122,15 +117,16 @@ For most conventional situations, they are typically “predictor” and/or "outcome". Additional roles enable targeted `step_*()` operations on specific variables or groups of variables. -In our case, the role `predictor` is given to explanatory variable that go to the -left-hand side of the model. The role `outcome` is the response variable +In our case, the role `predictor` is given to explanatory variables on the +right-hand side of the model (in the equation above). +The role `outcome` is the response variable that we wish to predict. `geo_value` and `time_value` are predefined roles that are unique to the `epipredict` package. Since we work with `epi_df` objects, all datasets should have `geo_value` and `time_value` passed through -automatically with these two roles assigned to the appropriate columns. +automatically with these two roles assigned to the appropriate columns in the data. The `recipes` package also allows [manual alterations of roles](https://recipes.tidymodels.org/reference/roles.html) -in bulk. There are a couple handy functions that can be used together to help us +in bulk. There are a few handy functions that can be used together to help us manipulate variable roles easily. > `update_role()` alters an existing role in the recipe or assigns an initial role @@ -146,17 +142,17 @@ manipulate variable roles easily. --- Notice in the following preprocessing steps, we used `add_role()` on -`geo_value_factor` since currently the default role for it is `raw` and -we would like the dummified variables to automatically be `predictor`'s. +`geo_value_factor` since, currently, the default role for it is `raw`, but +we would like to reuse this variable as `predictor`s. ```{r} -case_death_counts_subset <- case_death_counts_subset %>% +counts_subset <- counts_subset %>% mutate(geo_value_factor = as.factor(geo_value)) %>% as_epi_df() -epi_recipe(case_death_counts_subset) +epi_recipe(counts_subset) -r <- epi_recipe(case_death_counts_subset) %>% +r <- epi_recipe(counts_subset) %>% add_role(geo_value_factor, new_role = "predictor") %>% step_dummy(geo_value_factor) %>% ## Occasionally, data reporting errors / corrections result in negative @@ -168,14 +164,14 @@ r <- epi_recipe(case_death_counts_subset) %>% ``` After specifying the preprocessing steps, we will use the `parsnip` package for -modeling and getting the prediction for death count, 7 days after the +modeling and producing the prediction for death count, 7 days after the latest available date in the dataset. ```{r} -latest <- get_test_data(r, case_death_counts_subset) +latest <- get_test_data(r, counts_subset) -wf <- epi_workflow(r, parsnip::poisson_reg()) %>% - fit(case_death_counts_subset) +wf <- epi_workflow(r, parsnip::poisson_reg()) %>% + fit(counts_subset) predict(wf, latest) %>% filter(!is.na(.pred)) ``` @@ -196,47 +192,49 @@ rates, by incorporating offset terms in the model. To model death rates, the Poisson regression would be expressed as: \begin{aligned} -\text{log}\left( \mu_{t+7} \right) = \text{log(population)} + -\beta_0 + \delta_1 s_{\text{state_1}} + -\delta_2 s_{\text{state_2}} + ... + \beta_1 \text{deaths}_{t} + \nonumber \\ +\log\left( \mu_{t+7} \right) &= \log(\text{population}) + +\beta_0 + \delta_1 s_{\text{state}_1} + +\delta_2 s_{\text{state}_2} + \cdots + \nonumber \\ &\quad\beta_1 \text{deaths}_{t} + \beta_2 \text{deaths}_{t-7} + \beta_3 \text{cases}_{t} + -\beta_4 \text{cases}_{t-7} -\end{aligned} -where $\text{log(population)}$ is the log of the state population that was -used to scale the count data on the left-hand side of the equation. +\beta_4 \text{cases}_{t-7}\end{aligned} +where $\log(\text{population})$ is the log of the state population that was +used to scale the count data on the left-hand side of the equation. This offset +is simply a predictor with coefficient fixed at 1 rather than estimated. There are several ways to model rate data given count and population data. -First, in the `parsnip` framework, we can specify the formula in `fit()`. +First, in the `parsnip` framework, we could specify the formula in `fit()`. However, by doing so we lose the ability to use the `recipes` framework to -create new variables since new variables that do not exist in the -original dataset such as lag and leads cannot be called directly in `fit()`. +create new variables since variables that do not exist in the +original dataset (such as, here, the lags and leads) cannot be called directly in `fit()`. Alternatively, `step_population_scaling()` and `layer_population_scaling()` in the `epipredict` package can perform the population scaling if we provide the -population data, and we can model this using other models such as linear -regression, which we will illustrate in the next section. +population data, which we will illustrate in the next section. ## Linear Regression For COVID-19, the CDC required submission of case and death count predictions. However, the Delphi Group preferred to train on rate data instead, because it -puts different locations on a similar scale. +puts different locations on a similar scale (eliminating the need for location-specific intercepts). We can use a liner regression to predict the death -rates and use state population data to scale the rates to counts. We will do so +rates and use state population data to scale the rates to counts.[^pois] We will do so using `layer_population_scaling()` from the `epipredict` package. +[^pois]: We could continue with the Poisson model, but we'll switch to the Gaussian likelihood just for simplicity. + Additionally, when forecasts are submitted, prediction intervals should be provided along with the point estimates. This can be obtained via postprocessing -`layer_residual_quantiles()`. Although worth pointing out that +using +`layer_residual_quantiles()`. It is worth pointing out, however, that `layer_residual_quantiles()` should be used before population scaling or else the transformation will make the results uninterpretable. -We wish to predict the 7-day ahead death counts with lagged case rates and death -rates, along with extra behaviour indicator inputs. Namely, we will use survey data +We wish, now, to predict the 7-day ahead death counts with lagged case rates and death +rates, along with some extra behaviourial predictors. Namely, we will use survey data from [COVID-19 Trends and Impact Survey](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/fb-survey.html#behavior-indicators). -The survey data shows the estimated percentage of people who wore a mask for +The survey data provides the estimated percentage of people who wore a mask for most or all of the time while in public in the past 7 days and the estimated percentage of respondents who reported that all or most people they encountered in public in the past 7 days maintained a distance of at least 6 feet. @@ -291,20 +289,20 @@ We will take a subset of death rate and case rate data from the built-in dataset `case_death_rate_subset`. ```{r} -jhu <- case_death_rate_subset %>% - dplyr::filter( - time_value >= "2021-06-04", - time_value <= "2021-12-31", - geo_value %in% c("ca","fl","tx","ny","nj")) +jhu <- filter( + case_death_rate_subset, + time_value >= "2021-06-04", + time_value <= "2021-12-31", + geo_value %in% c("ca","fl","tx","ny","nj") +) ``` -Preprocessing steps will rely on functions from the `epipredict` package as well -as the `recipes` package: - -Additionally, there are also many functions in the `recipes` package that allow for +Preprocessing steps will again rely on functions from the `epipredict` package as well +as the `recipes` package. +There are also many functions in the `recipes` package that allow for [scalar transformations](https://recipes.tidymodels.org/reference/#step-functions-individual-transformations), such as log transformations and data centering. In our case, we will -center the numerical predictors to allow for meaningful interpretation of the +center the numerical predictors to allow for a more meaningful interpretation of the intercept. ```{r} @@ -324,29 +322,34 @@ r <- epi_recipe(jhu) %>% step_epi_naomit() ``` -As a sanity check we can see what the training data will look like +As a sanity check we can examine the structure of the training data: ```{r, warning = FALSE} glimpse(slice_sample(bake(prep(r, jhu), jhu), n = 6)) ``` Before directly predicting the results, we need to add postprocessing layers to obtain the death counts instead of death rates. Note that the rates used so -far are "per 100K people" rather than "per person". +far are "per 100K people" rather than "per person". We'll also use quantile +regression with the `quantile_reg` engine rather than ordinary least squares +to create median predictions and a 90% prediction interval. -```{r} +```{r, warning=FALSE} f <- frosting() %>% layer_predict() %>% layer_add_target_date("2022-01-07") %>% layer_threshold(.pred, lower = 0) %>% - layer_residual_quantiles(probs = c(0.05, 0.95), symmetrize = FALSE) %>% + layer_quantile_distn() %>% layer_naomit(.pred) %>% layer_population_scaling( - .pred, .pred_distn, df = pop_dat, rate_rescaling = 1e5, - by = c("geo_value" = "abbr"), df_pop_col = "pop") + .pred, .pred_distn, + df = pop_dat, + rate_rescaling = 1e5, + by = c("geo_value" = "abbr"), + df_pop_col = "pop") -wf <- epi_workflow(r, parsnip::linear_reg()) %>% - fit(jhu) %>% - add_frosting(f) +wf <- epi_workflow(r, quantile_reg(tau = c(.05, .5, .95))) %>% + fit(jhu) %>% + add_frosting(f) latest <- get_test_data(recipe = r, x = jhu) p <- predict(wf, latest) @@ -369,21 +372,21 @@ p %>% Last but not least, let's take a look at the regression fit and check the coefficients: ```{r, echo =FALSE} -summary(extract_fit_engine(wf)) +extract_fit_engine(wf) ``` ## Classification Sometimes it is preferable to create a predictive model for surges or upswings rather than for raw values. In this case, -the target is to predict if the future will have increased case rates(up), -decreased case rates (down), or flat case rates (flat) relative to the current -level. Such model are often -referred to as hotspot prediction models. We will refer to the model notations -from [McDonald, Bien, Green, Hu, et al.](#references) but extend the application +the target is to predict if the future will have increased case rates (denoted `up`), +decreased case rates (`down`), or flat case rates (`flat`) relative to the current +level. Such models may be +referred to as "hotspot prediction models". We will follow the analysis +in [McDonald, Bien, Green, Hu, et al.](#references) but extend the application to predict three categories instead of two. -Hotspot prediction predicts a categorical variable defined in terms of the +Hotspot prediction uses a categorical outcome variable defined in terms of the relative change of $Y_{\ell, t+a}$ compared to $Y_{\ell, t}$. Where $Y_{\ell, t}$ denotes the case rates in location $\ell$ at time $t$. We define the response variables as follows: @@ -392,39 +395,35 @@ $$ Z_{\ell, t}= \begin{cases} \text{up}, & \text{if}\ Y^{\Delta}_{\ell, t} > 0.25 \\ - \text{down}, & \text{if}\ Y^{\Delta}_{\ell, t} < -0.25\\ + \text{down}, & \text{if}\ Y^{\Delta}_{\ell, t} < -0.20\\ \text{flat}, & \text{otherwise} \end{cases} $$ -where $Y^{\Delta}_{\ell, t} = (Y_{\ell, t}- Y_{\ell, t-7})/(Y_{\ell, t-7})$. -We say location $\ell$ is a hotspot at time $t$ when $Z_{\ell,t}$ is categorized -as 'up', meaning the number of newly reported cases over the past 7 days has +where $Y^{\Delta}_{\ell, t} = (Y_{\ell, t}- Y_{\ell, t-7})\ /\ (Y_{\ell, t-7})$. +We say location $\ell$ is a hotspot at time $t$ when $Z_{\ell,t}$ is +`up`, meaning the number of newly reported cases over the past 7 days has increased by at least 25% compared to the preceding week. When $Z_{\ell,t}$ -is categorized as 'down', it suggests that there has been at least a 25% -decrease in newly reported cases over the past 7 days. Otherwise, we will -consider the trends to be flat. - -The expression of the multinomial regression we will model is as follows: +is categorized as `down`, it suggests that there has been at least a 20% +decrease in newly reported cases over the past 7 days (a 20% decrease is the inverse of a 25% increase). Otherwise, we will +consider the trend to be `flat`. +The expression of the multinomial regression we will use is as follows: $$ -\pi_{j}(x) = \text{Pr}(Y = j|x) = \frac{e^{g_j(x)}}{1 + \sum_{k=0}^2 g_j(x) } +\pi_{j}(x) = \text{Pr}(Z_{\ell,t} = j|x) = \frac{e^{g_j(x)}}{1 + \sum_{k=0}^2 g_j(x) } $$ -where $j$ is either down (`Y=0`), flat (`Y=1`), or up (`Y=2`), -$$g_0(x) = 0$$ +where $j$ is either down, flat, or up \begin{aligned} -g_1(x)= \text{ln}\left(\frac{Pr(Y=1|x)}{Pr(Y=0|x)}\right) &= -\beta_{10} + \beta_{11}* x_{\text{time_value}} + \delta_{10} s_{\text{state_1}} + -\delta_{11} s_{\text{state_2}} + ... \nonumber \\ +g_{\text{down}}(x) &= 0.\\ +g_{\text{flat}}(x)&= \text{ln}\left(\frac{Pr(Z_{\ell,t}=\text{flat}|x)}{Pr(Z_{\ell,t}=\text{down}|x)}\right) = +\beta_{10} + \beta_{11}t + \delta_{10} s_{\text{state_1}} + +\delta_{11} s_{\text{state_2}} + \cdots \nonumber \\ &\quad + \beta_{12} Y^{\Delta}_{\ell, t} + -\beta_{13} Y^{\Delta}_{\ell, t-7} -\end{aligned} - -\begin{aligned} -g_2(x)= \text{ln}\left(\frac{Pr(Y=2|x)}{Pr(Y=0|x)}\right) &= -\beta_{20} + \beta_{21}* x_{\text{time_value}} + \delta_{20} s_{\text{state_1}} + -\delta_{21} s_{\text{state_2}} + ... \nonumber \\ +\beta_{13} Y^{\Delta}_{\ell, t-7} \\ +g_{\text{flat}}(x) &= \text{ln}\left(\frac{Pr(Z_{\ell,t}=\text{up}|x)}{Pr(Z_{\ell,t}=\text{down}|x)}\right) = +\beta_{20} + \beta_{21}t + \delta_{20} s_{\text{state_1}} + +\delta_{21} s_{\text{state}_2} + \cdots \nonumber \\ &\quad + \beta_{22} Y^{\Delta}_{\ell, t} + \beta_{23} Y^{\Delta}_{\ell, t-7} \end{aligned} @@ -432,9 +431,7 @@ g_2(x)= \text{ln}\left(\frac{Pr(Y=2|x)}{Pr(Y=0|x)}\right) &= Preprocessing steps are similar to the previous models with an additional step -of categorizing the response variables. - -We will take a subset of death rate and case rate data from our built-in dataset +of categorizing the response variables. Again, we will use a subset of death rate and case rate data from our built-in dataset `case_death_rate_subset`. ```{r} jhu <- case_death_rate_subset %>% @@ -470,31 +467,31 @@ r <- epi_recipe(jhu) %>% step_epi_naomit() ``` -We will fit the multinomial regression and take a look at the predictions: +We will fit the multinomial regression and examine the predictions: ```{r, warning=FALSE} wf <- epi_workflow(r, parsnip::multinom_reg()) %>% - fit(jhu) + fit(jhu) latest <- get_test_data(recipe = r, x = jhu) predict(wf, latest) %>% filter(!is.na(.pred_class)) ``` -We can also take a look at the fit: +We can also look at the estimated coefficients and model summary information: ```{r} extract_fit_engine(wf) ``` One could also use a formula in `epi_recipe()` to achieve the same results as above. However, only one of `add_formula()`, `add_recipe()`, or -`workflow_variables()` must be specified. For the purpose of demonstrating +`workflow_variables()` can be specified. For the purpose of demonstrating `add_formula` rather than `add_recipe`, we will `prep` and `bake` our recipe to return a `data.frame` that could be used for model fitting. ```{r} b <- bake(prep(r, jhu), jhu) epi_workflow() %>% - add_formula(response ~ geo_value + pct_diff_wk1 + pct_diff_wk2) %>% + add_formula(response ~ geo_value + time_value + pct_diff_wk1 + pct_diff_wk2) %>% add_model(parsnip::multinom_reg()) %>% fit(data = b) ``` @@ -506,15 +503,17 @@ is handy for creating correct lags and leads for future predictions. Let's start with a simple dataset and preprocessing: ```{r} -ex <- case_death_rate_subset %>% - dplyr::filter(time_value >= "2021-12-01", - time_value <= "2021-12-31", - geo_value == "ca") +ex <- filter( + case_death_rate_subset, + time_value >= "2021-12-01", + time_value <= "2021-12-31", + geo_value == "ca" +) dim(ex) ``` -We want to predict death rates on `2022-01-07`, which is 7 days ahead from the +We want to predict death rates on `r max(ex$time_value) + 7`, which is 7 days ahead of the latest available date in our dataset. We will compare two methods of trying to create lags and leads: @@ -562,7 +561,7 @@ dates_used_in_training2 <- b2 %>% dates_used_in_training2 ``` -The model that is trained without the functions is predicting 7 days ahead from +The model that is trained based on the `{recipes}` functions will predict 7 days ahead from `r max(dates_used_in_training2$time_value)` instead of 7 days ahead from `r max(dates_used_in_training1$time_value)`.