diff --git a/DESCRIPTION b/DESCRIPTION index 0dab118fc..2ae9d3337 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -25,6 +25,7 @@ URL: https://github.com/cmu-delphi/epipredict/, BugReports: https://github.com/cmu-delphi/epipredict/issues/ Depends: epiprocess (>= 0.9.0), + epidatasets, parsnip (>= 1.0.0), R (>= 3.5.0) Imports: @@ -49,7 +50,6 @@ Imports: workflows (>= 1.0.0) Suggests: data.table, - epidatasets, epidatr (>= 1.0.0), fs, grf, @@ -69,6 +69,7 @@ Suggests: VignetteBuilder: knitr Remotes: + cmu-delphi/epidatasets, cmu-delphi/epidatr, cmu-delphi/epiprocess, dajmcdon/smoothqr diff --git a/NAMESPACE b/NAMESPACE index 86b77716b..5ee13f730 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -215,6 +215,7 @@ export(update_model) export(validate_layer) export(weighted_interval_score) import(distributional) +import(epidatasets) import(epiprocess) import(parsnip) import(recipes) diff --git a/NEWS.md b/NEWS.md index e080d1aa0..3fcf4dc0c 100644 --- a/NEWS.md +++ b/NEWS.md @@ -4,12 +4,21 @@ Pre-1.0.0 numbering scheme: 0.x will indicate releases, while 0.0.x will indicat # epipredict 0.2 -## features +## Breaking changes + +- Moved example datasets from being hosted in the package to being loaded + from the `epidatasets` package. The datasets can no longer be loaded with + `data(<dataset_name>)`, but can be accessed with + `data(<dataset_name>, package = "epidatasets")`, `epidatasets::<dataset_name>` + or, after loading the package, the name of the dataset alone (#382). + +## Improvements + - Add `step_adjust_latency`, which give several methods to adjust the forecast if the `forecast_date` is after the last day of data. - (temporary) ahead negative is allowed for `step_epi_ahead` until we have `step_epi_shift` -## bugfixes -- shifting no columns results in no error for either `step_epi_ahead` and `step_epi_lag` +## Bug fixes +- Shifting no columns results in no error for either `step_epi_ahead` or `step_epi_lag` - Quantiles produced by `grf` were sometimes out of order. 
# epipredict 0.1 diff --git a/R/arx_classifier.R b/R/arx_classifier.R index 0aec0e362..240bc69ee 100644 --- a/R/arx_classifier.R +++ b/R/arx_classifier.R @@ -27,7 +27,7 @@ #' #' @examples #' library(dplyr) -#' jhu <- case_death_rate_subset %>% +#' jhu <- covid_case_death_rates %>% #' filter(time_value >= as.Date("2021-11-01")) #' #' out <- arx_classifier(jhu, "death_rate", c("case_rate", "death_rate")) @@ -104,7 +104,7 @@ arx_classifier <- function( #' @seealso [arx_classifier()] #' @examples #' library(dplyr) -#' jhu <- case_death_rate_subset %>% +#' jhu <- covid_case_death_rates %>% #' filter(time_value >= as.Date("2021-11-01")) #' #' arx_class_epi_workflow(jhu, "death_rate", c("case_rate", "death_rate")) diff --git a/R/arx_forecaster.R b/R/arx_forecaster.R index bfd5eaec1..c7aebef46 100644 --- a/R/arx_forecaster.R +++ b/R/arx_forecaster.R @@ -25,7 +25,7 @@ #' @seealso [arx_fcast_epi_workflow()], [arx_args_list()] #' #' @examples -#' jhu <- case_death_rate_subset %>% +#' jhu <- covid_case_death_rates %>% #' dplyr::filter(time_value >= as.Date("2021-12-01")) #' #' out <- arx_forecaster( @@ -96,7 +96,7 @@ arx_forecaster <- function( #' #' @examples #' library(dplyr) -#' jhu <- case_death_rate_subset %>% +#' jhu <- covid_case_death_rates %>% #' filter(time_value >= as.Date("2021-12-01")) #' #' arx_fcast_epi_workflow( diff --git a/R/autoplot.R b/R/autoplot.R index 8bded03a3..4f4222979 100644 --- a/R/autoplot.R +++ b/R/autoplot.R @@ -29,7 +29,7 @@ ggplot2::autoplot #' @name autoplot-epipred #' @examples #' library(dplyr) -#' jhu <- case_death_rate_subset %>% +#' jhu <- covid_case_death_rates %>% #' filter(time_value >= as.Date("2021-11-01")) #' #' r <- epi_recipe(jhu) %>% @@ -70,7 +70,7 @@ ggplot2::autoplot #' #' # ------- Plotting canned forecaster output #' -#' jhu <- case_death_rate_subset %>% +#' jhu <- covid_case_death_rates %>% #' filter(time_value >= as.Date("2021-11-01")) #' flat <- flatline_forecaster(jhu, "death_rate") #' autoplot(flat, .max_facets = 4) diff 
--git a/R/cdc_baseline_forecaster.R b/R/cdc_baseline_forecaster.R index 3352c5159..44090bf79 100644 --- a/R/cdc_baseline_forecaster.R +++ b/R/cdc_baseline_forecaster.R @@ -23,7 +23,7 @@ #' #' @examples #' library(dplyr) -#' weekly_deaths <- case_death_rate_subset %>% +#' weekly_deaths <- covid_case_death_rates %>% #' select(geo_value, time_value, death_rate) %>% #' left_join(state_census %>% select(pop, abbr), by = c("geo_value" = "abbr")) %>% #' mutate(deaths = pmax(death_rate / 1e5 * pop * 7, 0)) %>% diff --git a/R/data.R b/R/data.R deleted file mode 100644 index 71e5bdcd3..000000000 --- a/R/data.R +++ /dev/null @@ -1,87 +0,0 @@ -#' Subset of JHU daily state cases and deaths -#' -#' This data source of confirmed COVID-19 cases and deaths -#' is based on reports made available by the Center for -#' Systems Science and Engineering at Johns Hopkins University. -#' This example data ranges from Dec 31, 2020 to Dec 31, 2021, -#' and includes all states. -#' -#' @format A tibble with 20,496 rows and 4 variables: -#' \describe{ -#' \item{geo_value}{the geographic value associated with each row -#' of measurements.} -#' \item{time_value}{the time value associated with each row of measurements.} -#' \item{case_rate}{7-day average signal of number of new -#' confirmed COVID-19 cases per 100,000 population, daily} -#' \item{death_rate}{7-day average signal of number of new confirmed -#' deaths due to COVID-19 per 100,000 population, daily} -#' } -#' @source This object contains a modified part of the -#' \href{https://github.com/CSSEGISandData/COVID-19}{COVID-19 Data Repository by the Center for Systems Science and Engineering (CSSE) at Johns Hopkins University} -#' as \href{https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html}{republished in the COVIDcast Epidata API}. 
-#' This data set is licensed under the terms of the -#' \href{https://creativecommons.org/licenses/by/4.0/}{Creative Commons Attribution 4.0 International license} -#' by the Johns Hopkins University on behalf of its Center for Systems Science -#' in Engineering. Copyright Johns Hopkins University 2020. -#' -#' Modifications: -#' * \href{https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html}{From the COVIDcast Epidata API}: -#' These signals are taken directly from the JHU CSSE -#' \href{https://github.com/CSSEGISandData/COVID-19}{COVID-19 GitHub repository} -#' without changes. The 7-day average signals are computed by Delphi by -#' calculating moving averages of the preceding 7 days, so the signal for -#' June 7 is the average of the underlying data for June 1 through 7, -#' inclusive. -"case_death_rate_subset" - -#' State population data -#' -#' Data set on state populations, from the 2019 US Census. -#' -#' @format Data frame with 57 rows (including one for the United States as a -#' whole, plus the District of Columbia, Puerto Rico Commonwealth, -#' American Samoa, Guam, the U.S. Virgin Islands, and the Northern Mariana, -#' Islands). 
-#' -#' \describe{ -#' \item{fips}{FIPS code} -#' \item{name}{Full name of the state or territory} -#' \item{pop}{Estimate of the location's resident population in -#' 2019.} -#' \item{abbr}{Postal abbreviation for the location} -#' } -#' -#' @source United States Census Bureau, at -#' \url{https://www2.census.gov/programs-surveys/popest/datasets/2010-2019/counties/totals/co-est2019-alldata.pdf}, -#' \url{https://www.census.gov/data/tables/time-series/demo/popest/2010s-total-puerto-rico-municipios.html}, -#' and \url{https://www.census.gov/data/tables/2010/dec/2010-island-areas.html} -"state_census" - -#' Subset of Statistics Canada median employment income for postsecondary graduates -#' -#' @format An [epiprocess::epi_df][epiprocess::as_epi_df] with 10193 rows and 8 variables: -#' \describe{ -#' \item{geo_value}{The province in Canada associated with each -#' row of measurements.} -#' \item{time_value}{The time value, a year integer in YYYY format} -#' \item{edu_qual}{The education qualification} -#' \item{fos}{The field of study} -#' \item{age_group}{The age group; either 15 to 34 or 35 to 64} -#' \item{num_graduates}{The number of graduates for the given row of characteristics} -#' \item{med_income_2y}{The median employment income two years after graduation} -#' \item{med_income_5y}{The median employment income five years after graduation} -#' } -#' @source This object contains modified data from the following Statistics Canada -#' data table: \href{https://www150.statcan.gc.ca/t1/tbl1/en/tv.action?pid=3710011501}{ -#' Characteristics and median employment income of longitudinal cohorts of postsecondary -#' graduates two and five years after graduation, by educational qualification and -#' field of study (primary groupings) -#' } -#' -#' Modifications: -#' * Only provincial-level geo_values are kept -#' * Only age group, field of study, and educational qualification are kept as -#' covariates. 
For the remaining covariates, we keep aggregated values and -#' drop the level-specific rows. -#' * No modifications were made to the time range of the data -"grad_employ_subset" diff --git a/R/epi_recipe.R b/R/epi_recipe.R index 311b9d073..646cb1b6d 100644 --- a/R/epi_recipe.R +++ b/R/epi_recipe.R @@ -43,7 +43,7 @@ epi_recipe.default <- function(x, ...) { #' @examples #' library(dplyr) #' library(recipes) -#' jhu <- case_death_rate_subset %>% +#' jhu <- covid_case_death_rates %>% #' filter(time_value > "2021-08-01") %>% #' arrange(geo_value, time_value) #' @@ -263,7 +263,7 @@ is_epi_recipe <- function(x) { #' library(dplyr) #' library(recipes) #' -#' jhu <- case_death_rate_subset %>% +#' jhu <- covid_case_death_rates %>% #' filter(time_value > "2021-08-01") %>% #' arrange(geo_value, time_value) #' @@ -347,7 +347,7 @@ update_epi_recipe <- function(x, recipe, ..., blueprint = default_epi_recipe_blu #' library(dplyr) #' library(workflows) #' -#' jhu <- case_death_rate_subset %>% +#' jhu <- covid_case_death_rates %>% #' filter(time_value > "2021-11-01", geo_value %in% c("ak", "ca", "ny")) #' r <- epi_recipe(jhu) %>% #' step_epi_lag(death_rate, lag = c(0, 7, 14)) %>% diff --git a/R/epi_workflow.R b/R/epi_workflow.R index e4cc9cd2a..81b443e7b 100644 --- a/R/epi_workflow.R +++ b/R/epi_workflow.R @@ -20,7 +20,7 @@ #' @importFrom generics augment #' @export #' @examples -#' jhu <- case_death_rate_subset +#' jhu <- covid_case_death_rates #' #' r <- epi_recipe(jhu) %>% #' step_epi_lag(death_rate, lag = c(0, 7, 14)) %>% @@ -84,7 +84,7 @@ is_epi_workflow <- function(x) { #' @name fit-epi_workflow #' @export #' @examples -#' jhu <- case_death_rate_subset %>% +#' jhu <- covid_case_death_rates %>% #' filter(time_value > "2021-11-01", geo_value %in% c("ak", "ca", "ny")) #' #' r <- epi_recipe(jhu) %>% @@ -142,7 +142,7 @@ fit.epi_workflow <- function(object, data, ..., control = workflows::control_wor #' @name predict-epi_workflow #' @export #' @examples -#' jhu <- 
case_death_rate_subset +#' jhu <- covid_case_death_rates #' #' r <- epi_recipe(jhu) %>% #' step_epi_lag(death_rate, lag = c(0, 7, 14)) %>% diff --git a/R/epipredict-package.R b/R/epipredict-package.R index ad0f95295..3dee263e2 100644 --- a/R/epipredict-package.R +++ b/R/epipredict-package.R @@ -1,5 +1,5 @@ ## usethis namespace: start -#' @import epiprocess parsnip +#' @import epiprocess parsnip epidatasets #' @importFrom checkmate assert_class assert_numeric #' @importFrom checkmate test_character test_date test_function #' @importFrom checkmate test_integerish test_logical diff --git a/R/flatline_forecaster.R b/R/flatline_forecaster.R index 59f54bd86..7efda3efd 100644 --- a/R/flatline_forecaster.R +++ b/R/flatline_forecaster.R @@ -24,7 +24,7 @@ #' @export #' #' @examples -#' jhu <- case_death_rate_subset %>% +#' jhu <- covid_case_death_rates %>% #' dplyr::filter(time_value >= as.Date("2021-12-01")) #' #' out <- flatline_forecaster(jhu, "death_rate") diff --git a/R/flusight_hub_formatter.R b/R/flusight_hub_formatter.R index c1aa00b82..b3e31822c 100644 --- a/R/flusight_hub_formatter.R +++ b/R/flusight_hub_formatter.R @@ -1,7 +1,6 @@ location_to_abbr <- function(location) { dictionary <- state_census %>% - mutate(fips = sprintf("%02d", fips)) %>% dplyr::transmute( location = dplyr::case_match(fips, "00" ~ "US", .default = fips), abbr @@ -12,7 +11,6 @@ location_to_abbr <- function(location) { abbr_to_location <- function(abbr) { dictionary <- state_census %>% - mutate(fips = sprintf("%02d", fips)) %>% dplyr::transmute( location = dplyr::case_match(fips, "00" ~ "US", .default = fips), abbr @@ -57,7 +55,7 @@ abbr_to_location <- function(abbr) { #' #' @examples #' library(dplyr) -#' weekly_deaths <- case_death_rate_subset %>% +#' weekly_deaths <- covid_case_death_rates %>% #' filter( #' time_value >= as.Date("2021-09-01"), #' geo_value %in% c("ca", "ny", "dc", "ga", "vt") diff --git a/R/frosting.R b/R/frosting.R index 2672bcdd1..ef32b4a3b 100644 --- a/R/frosting.R +++ 
b/R/frosting.R @@ -9,7 +9,7 @@ #' #' @examples #' library(dplyr) -#' jhu <- case_death_rate_subset %>% +#' jhu <- covid_case_death_rates %>% #' filter(time_value > "2021-11-01", geo_value %in% c("ak", "ca", "ny")) #' r <- epi_recipe(jhu) %>% #' step_epi_lag(death_rate, lag = c(0, 7, 14)) %>% @@ -128,7 +128,7 @@ update_frosting <- function(x, frosting, ...) { #' @export #' @examples #' library(dplyr) -#' jhu <- case_death_rate_subset %>% +#' jhu <- covid_case_death_rates %>% #' filter(time_value > "2021-11-01", geo_value %in% c("ak", "ca", "ny")) #' r <- epi_recipe(jhu) %>% #' step_epi_lag(death_rate, lag = c(0, 7, 14)) %>% @@ -268,7 +268,7 @@ new_frosting <- function() { #' wf <- epi_workflow() %>% add_frosting(f) #' #' # A more realistic example -#' jhu <- case_death_rate_subset %>% +#' jhu <- covid_case_death_rates %>% #' filter(time_value > "2021-11-01", geo_value %in% c("ak", "ca", "ny")) #' #' r <- epi_recipe(jhu) %>% diff --git a/R/get_test_data.R b/R/get_test_data.R index 8de698301..442272a2f 100644 --- a/R/get_test_data.R +++ b/R/get_test_data.R @@ -19,11 +19,11 @@ #' keys, as well other variables in the original dataset. 
#' @examples #' # create recipe -#' rec <- epi_recipe(case_death_rate_subset) %>% +#' rec <- epi_recipe(covid_case_death_rates) %>% #' step_epi_ahead(death_rate, ahead = 7) %>% #' step_epi_lag(death_rate, lag = c(0, 7, 14)) %>% #' step_epi_lag(case_rate, lag = c(0, 7, 14)) -#' get_test_data(recipe = rec, x = case_death_rate_subset) +#' get_test_data(recipe = rec, x = covid_case_death_rates) #' @importFrom rlang %@% #' @importFrom stats na.omit #' @export diff --git a/R/layer_add_forecast_date.R b/R/layer_add_forecast_date.R index c8f857c89..78cbb79a3 100644 --- a/R/layer_add_forecast_date.R +++ b/R/layer_add_forecast_date.R @@ -22,7 +22,7 @@ #' @export #' @examples #' library(dplyr) -#' jhu <- case_death_rate_subset %>% +#' jhu <- covid_case_death_rates %>% #' filter(time_value > "2021-11-01", geo_value %in% c("ak", "ca", "ny")) #' r <- epi_recipe(jhu) %>% #' step_epi_lag(death_rate, lag = c(0, 7, 14)) %>% diff --git a/R/layer_add_target_date.R b/R/layer_add_target_date.R index 991ec2140..f8b6a06e5 100644 --- a/R/layer_add_target_date.R +++ b/R/layer_add_target_date.R @@ -25,7 +25,7 @@ #' @export #' @examples #' library(dplyr) -#' jhu <- case_death_rate_subset %>% +#' jhu <- covid_case_death_rates %>% #' filter(time_value > "2021-11-01", geo_value %in% c("ak", "ca", "ny")) #' r <- epi_recipe(jhu) %>% #' step_epi_lag(death_rate, lag = c(0, 7, 14)) %>% diff --git a/R/layer_cdc_flatline_quantiles.R b/R/layer_cdc_flatline_quantiles.R index fd61c4045..13938d837 100644 --- a/R/layer_cdc_flatline_quantiles.R +++ b/R/layer_cdc_flatline_quantiles.R @@ -56,14 +56,14 @@ #' #' @examples #' library(dplyr) -#' r <- epi_recipe(case_death_rate_subset) %>% +#' r <- epi_recipe(covid_case_death_rates) %>% #' # data is "daily", so we fit this to 1 ahead, the result will contain #' # 1 day ahead residuals #' step_epi_ahead(death_rate, ahead = 1L, skip = TRUE) %>% #' recipes::update_role(death_rate, new_role = "predictor") %>% #' recipes::add_role(time_value, geo_value, new_role = 
"predictor") #' -#' forecast_date <- max(case_death_rate_subset$time_value) +#' forecast_date <- max(covid_case_death_rates$time_value) #' #' f <- frosting() %>% #' layer_predict() %>% @@ -71,7 +71,7 @@ #' #' eng <- linear_reg(engine = "flatline") #' -#' wf <- epi_workflow(r, eng, f) %>% fit(case_death_rate_subset) +#' wf <- epi_workflow(r, eng, f) %>% fit(covid_case_death_rates) #' preds <- forecast(wf) %>% #' select(-time_value) %>% #' mutate(forecast_date = forecast_date) @@ -91,7 +91,7 @@ #' geom_ribbon(aes(ymin = `0.25`, ymax = `0.75`), fill = blues9[6]) + #' geom_line(aes(y = .pred), color = "orange") + #' geom_line( -#' data = case_death_rate_subset %>% filter(geo_value %in% four_states), +#' data = covid_case_death_rates %>% filter(geo_value %in% four_states), #' aes(x = time_value, y = death_rate) #' ) + #' scale_x_date(limits = c(forecast_date - 90, forecast_date + 30)) + diff --git a/R/layer_naomit.R b/R/layer_naomit.R index 209a663b4..2b111a6f9 100644 --- a/R/layer_naomit.R +++ b/R/layer_naomit.R @@ -12,7 +12,7 @@ #' @export #' @examples #' library(dplyr) -#' jhu <- case_death_rate_subset %>% +#' jhu <- covid_case_death_rates %>% #' filter(time_value > "2021-11-01", geo_value %in% c("ak", "ca", "ny")) #' #' r <- epi_recipe(jhu) %>% diff --git a/R/layer_point_from_distn.R b/R/layer_point_from_distn.R index f14008748..c433717bb 100644 --- a/R/layer_point_from_distn.R +++ b/R/layer_point_from_distn.R @@ -17,7 +17,7 @@ #' #' @examples #' library(dplyr) -#' jhu <- case_death_rate_subset %>% +#' jhu <- covid_case_death_rates %>% #' filter(time_value > "2021-11-01", geo_value %in% c("ak", "ca", "ny")) #' #' r <- epi_recipe(jhu) %>% diff --git a/R/layer_predict.R b/R/layer_predict.R index 6ca17ac24..b59be5f03 100644 --- a/R/layer_predict.R +++ b/R/layer_predict.R @@ -17,7 +17,7 @@ #' #' @examples #' library(dplyr) -#' jhu <- case_death_rate_subset %>% +#' jhu <- covid_case_death_rates %>% #' filter(time_value > "2021-11-01", geo_value %in% c("ak", "ca", "ny")) 
#' #' r <- epi_recipe(jhu) %>% diff --git a/R/layer_predictive_distn.R b/R/layer_predictive_distn.R index b28e0c765..2b18fbf8e 100644 --- a/R/layer_predictive_distn.R +++ b/R/layer_predictive_distn.R @@ -21,7 +21,7 @@ #' #' @examples #' library(dplyr) -#' jhu <- case_death_rate_subset %>% +#' jhu <- covid_case_death_rates %>% #' filter(time_value > "2021-11-01", geo_value %in% c("ak", "ca", "ny")) #' #' r <- epi_recipe(jhu) %>% diff --git a/R/layer_quantile_distn.R b/R/layer_quantile_distn.R index 5f87ded29..f7bc9259d 100644 --- a/R/layer_quantile_distn.R +++ b/R/layer_quantile_distn.R @@ -23,7 +23,7 @@ #' #' @examples #' library(dplyr) -#' jhu <- case_death_rate_subset %>% +#' jhu <- covid_case_death_rates %>% #' filter(time_value > "2021-11-01", geo_value %in% c("ak", "ca", "ny")) #' #' r <- epi_recipe(jhu) %>% diff --git a/R/layer_residual_quantiles.R b/R/layer_residual_quantiles.R index 1b623adfa..e9b5b7c19 100644 --- a/R/layer_residual_quantiles.R +++ b/R/layer_residual_quantiles.R @@ -15,7 +15,7 @@ #' @export #' @examples #' library(dplyr) -#' jhu <- case_death_rate_subset %>% +#' jhu <- covid_case_death_rates %>% #' filter(time_value > "2021-11-01", geo_value %in% c("ak", "ca", "ny")) #' #' r <- epi_recipe(jhu) %>% diff --git a/R/layer_threshold_preds.R b/R/layer_threshold_preds.R index 56f8059ab..7b8ca0252 100644 --- a/R/layer_threshold_preds.R +++ b/R/layer_threshold_preds.R @@ -23,7 +23,7 @@ #' @export #' @examples #' library(dplyr) -#' jhu <- case_death_rate_subset %>% +#' jhu <- covid_case_death_rates %>% #' filter(time_value < "2021-03-08", geo_value %in% c("ak", "ca", "ar")) #' r <- epi_recipe(jhu) %>% #' step_epi_lag(death_rate, lag = c(0, 7, 14)) %>% diff --git a/R/layers.R b/R/layers.R index 538fcad1b..752f014c5 100644 --- a/R/layers.R +++ b/R/layers.R @@ -42,7 +42,7 @@ layer <- function(subclass, ..., .prefix = "layer_") { #' #' @examples #' library(dplyr) -#' jhu <- case_death_rate_subset %>% +#' jhu <- covid_case_death_rates %>% #' 
filter(time_value > "2021-11-01", geo_value %in% c("ak", "ca", "ny")) #' r <- epi_recipe(jhu) %>% #' step_epi_lag(death_rate, lag = c(0, 7, 14)) %>% diff --git a/R/make_grf_quantiles.R b/R/make_grf_quantiles.R index 2903c93a8..00e7d0e71 100644 --- a/R/make_grf_quantiles.R +++ b/R/make_grf_quantiles.R @@ -61,7 +61,7 @@ #' # -- a more complicated task #' #' library(dplyr) -#' dat <- case_death_rate_subset %>% +#' dat <- covid_case_death_rates %>% #' filter(time_value > as.Date("2021-10-01")) #' rec <- epi_recipe(dat) %>% #' step_epi_lag(case_rate, death_rate, lag = c(0, 7, 14)) %>% diff --git a/R/model-methods.R b/R/model-methods.R index f3b374879..a575bd591 100644 --- a/R/model-methods.R +++ b/R/model-methods.R @@ -33,7 +33,7 @@ #' @export #' @examples #' library(dplyr) -#' jhu <- case_death_rate_subset %>% +#' jhu <- covid_case_death_rates %>% #' filter(time_value > "2021-11-01", geo_value %in% c("ak", "ca", "ny")) #' #' r <- epi_recipe(jhu) %>% diff --git a/R/pivot_quantiles.R b/R/pivot_quantiles.R index b01dc392c..2a9e0d4e0 100644 --- a/R/pivot_quantiles.R +++ b/R/pivot_quantiles.R @@ -8,7 +8,7 @@ #' @examples #' library(dplyr) #' library(tidyr) -#' edf <- case_death_rate_subset[1:3, ] +#' edf <- covid_case_death_rates[1:3, ] #' edf$q <- dist_quantiles(list(1:5, 2:4, 3:10), list(1:5 / 6, 2:4 / 5, 3:10 / 11)) #' #' edf_nested <- edf %>% mutate(q = nested_quantiles(q)) diff --git a/R/step_adjust_latency.R b/R/step_adjust_latency.R index 604e06710..3d9f19891 100644 --- a/R/step_adjust_latency.R +++ b/R/step_adjust_latency.R @@ -180,12 +180,12 @@ #' @rdname step_adjust_latency #' @export #' @examples -#' jhu <- case_death_rate_subset %>% +#' jhu <- covid_case_death_rates %>% #' dplyr::filter(time_value > "2021-11-01", geo_value %in% c("ak", "ca", "ny")) #' # setting the `as_of` to something realistic #' attributes(jhu)$metadata$as_of <- max(jhu$time_value) + 3 #' -#' r <- epi_recipe(case_death_rate_subset) %>% +#' r <- epi_recipe(covid_case_death_rates) %>% #' 
step_adjust_latency(method = "extend_ahead") %>% #' step_epi_ahead(death_rate, ahead = 7) %>% #' step_epi_lag(death_rate, lag = c(0, 7, 14)) diff --git a/R/step_epi_naomit.R b/R/step_epi_naomit.R index d81ba398d..bfe8a4faa 100644 --- a/R/step_epi_naomit.R +++ b/R/step_epi_naomit.R @@ -8,7 +8,7 @@ #' of data loss. #' @export #' @examples -#' case_death_rate_subset %>% +#' covid_case_death_rates %>% #' epi_recipe() %>% #' step_epi_naomit() step_epi_naomit <- function(recipe) { diff --git a/R/step_epi_shift.R b/R/step_epi_shift.R index d79ad1e2b..beda182e6 100644 --- a/R/step_epi_shift.R +++ b/R/step_epi_shift.R @@ -42,7 +42,7 @@ #' @rdname step_epi_shift #' @export #' @examples -#' r <- epi_recipe(case_death_rate_subset) %>% +#' r <- epi_recipe(covid_case_death_rates) %>% #' step_epi_ahead(death_rate, ahead = 7) %>% #' step_epi_lag(death_rate, lag = c(0, 7, 14)) #' r diff --git a/R/step_epi_slide.R b/R/step_epi_slide.R index c7d3f9fbd..274ce2451 100644 --- a/R/step_epi_slide.R +++ b/R/step_epi_slide.R @@ -37,7 +37,7 @@ #' @export #' @examples #' library(dplyr) -#' jhu <- case_death_rate_subset %>% +#' jhu <- covid_case_death_rates %>% #' filter(time_value >= as.Date("2021-01-01"), geo_value %in% c("ca", "ny")) #' rec <- epi_recipe(jhu) %>% #' step_epi_slide(case_rate, death_rate, diff --git a/R/step_growth_rate.R b/R/step_growth_rate.R index 00bf9bd87..b3a712313 100644 --- a/R/step_growth_rate.R +++ b/R/step_growth_rate.R @@ -32,13 +32,13 @@ #' @importFrom epiprocess growth_rate #' @export #' @examples -#' r <- epi_recipe(case_death_rate_subset) %>% +#' r <- epi_recipe(covid_case_death_rates) %>% #' step_growth_rate(case_rate, death_rate) #' r #' #' r %>% -#' prep(case_death_rate_subset) %>% -#' bake(case_death_rate_subset) +#' prep(covid_case_death_rates) %>% +#' bake(new_data = NULL) step_growth_rate <- function(recipe, ..., diff --git a/R/step_lag_difference.R b/R/step_lag_difference.R index 39ae1ba59..2b0af00f2 100644 --- a/R/step_lag_difference.R +++ 
b/R/step_lag_difference.R @@ -15,14 +15,14 @@ #' @family row operation steps #' @export #' @examples -#' r <- epi_recipe(case_death_rate_subset) %>% +#' r <- epi_recipe(covid_case_death_rates) %>% #' step_lag_difference(case_rate, death_rate, horizon = c(7, 14)) %>% #' step_epi_naomit() #' r #' #' r %>% -#' prep(case_death_rate_subset) %>% -#' bake(case_death_rate_subset) +#' prep(covid_case_death_rates) %>% +#' bake(new_data = NULL) step_lag_difference <- function(recipe, ..., diff --git a/R/tidy.R b/R/tidy.R index 8fc06398a..47c6efa68 100644 --- a/R/tidy.R +++ b/R/tidy.R @@ -27,7 +27,7 @@ #' #' @examples #' library(dplyr) -#' jhu <- case_death_rate_subset %>% +#' jhu <- covid_case_death_rates %>% #' filter(time_value > "2021-11-01", geo_value %in% c("ak", "ca", "ny")) #' #' r <- epi_recipe(jhu) %>% diff --git a/R/weighted_interval_score.R b/R/weighted_interval_score.R index cd67bbee9..48741de7d 100644 --- a/R/weighted_interval_score.R +++ b/R/weighted_interval_score.R @@ -44,13 +44,13 @@ #' #' # Using some actual forecasts -------- #' library(dplyr) -#' jhu <- case_death_rate_subset %>% +#' jhu <- covid_case_death_rates %>% #' filter(time_value >= "2021-10-01", time_value <= "2021-12-01") #' preds <- flatline_forecaster( #' jhu, "death_rate", #' flatline_args_list(quantile_levels = c(.01, .025, 1:19 / 20, .975, .99)) #' )$predictions -#' actuals <- case_death_rate_subset %>% +#' actuals <- covid_case_death_rates %>% #' filter(time_value == as.Date("2021-12-01") + 7) %>% #' select(geo_value, time_value, actual = death_rate) #' preds <- left_join(preds, actuals, diff --git a/README.Rmd b/README.Rmd index 36af14cd9..73cedbeaa 100644 --- a/README.Rmd +++ b/README.Rmd @@ -81,14 +81,14 @@ interfaces directly to Delphi's ```{r epidf, message=FALSE} library(epipredict) -case_death_rate_subset +covid_case_death_rates ``` To create and train a simple auto-regressive forecaster to predict the death rate two weeks into the future using past (lagged) deaths and cases, we 
could use the following function. ```{r make-forecasts, warning=FALSE} two_week_ahead <- arx_forecaster( - case_death_rate_subset, + covid_case_death_rates, outcome = "death_rate", predictors = c("case_rate", "death_rate"), args_list = arx_args_list( diff --git a/README.md b/README.md index 9d912f7e6..561d00a1e 100644 --- a/README.md +++ b/README.md @@ -75,7 +75,7 @@ processed using ``` r library(epipredict) -case_death_rate_subset +covid_case_death_rates #> An `epi_df` object, 20,496 x 4 with metadata: #> * geo_type = state #> * time_type = day @@ -103,7 +103,7 @@ cases, we could use the following function. ``` r two_week_ahead <- arx_forecaster( - case_death_rate_subset, + covid_case_death_rates, outcome = "death_rate", predictors = c("case_rate", "death_rate"), args_list = arx_args_list( diff --git a/_pkgdown.yml b/_pkgdown.yml index 5222999b6..dbe188f4c 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -121,8 +121,3 @@ reference: - title: Other utilities contents: - clean_f_name - - title: Included datasets - contents: - - case_death_rate_subset - - state_census - - grad_employ_subset diff --git a/data-raw/case_death_rate_subset.R b/data-raw/case_death_rate_subset.R deleted file mode 100644 index 4fa3e64a7..000000000 --- a/data-raw/case_death_rate_subset.R +++ /dev/null @@ -1,29 +0,0 @@ -library(tidyverse) -library(epidatr) -library(epiprocess) - -x <- pub_covidcast( - data_source = "jhu-csse", - signals = "confirmed_7dav_incidence_prop", - time_type = "day", - geo_type = "state", - time_values = epirange(20201231, 20211231), - geo_values = "*" -) %>% - select(geo_value, time_value, case_rate = value) - -y <- pub_covidcast( - data_source = "jhu-csse", - signals = "deaths_7dav_incidence_prop", - time_type = "day", - geo_type = "state", - time_values = epirange(20201231, 20211231), - geo_values = "*" -) %>% - select(geo_value, time_value, death_rate = value) - -case_death_rate_subset <- x %>% - full_join(y, by = c("geo_value", "time_value")) %>% - as_epi_df() - 
-usethis::use_data(case_death_rate_subset, overwrite = TRUE) diff --git a/data-raw/grad_employ_subset.R b/data-raw/grad_employ_subset.R deleted file mode 100644 index 38719a02e..000000000 --- a/data-raw/grad_employ_subset.R +++ /dev/null @@ -1,106 +0,0 @@ -library(epipredict) -library(epiprocess) -library(cansim) -library(dplyr) -library(stringr) -library(tidyr) - -# https://www150.statcan.gc.ca/t1/tbl1/en/tv.action?pid=3710011501 -statcan_grad_employ <- get_cansim("37-10-0115-01") - -gemploy <- statcan_grad_employ %>% - select(c( - "REF_DATE", - "GEO", - # "DGUID", - # "UOM", - # "UOM_ID", - # "SCALAR_FACTOR", - # "SCALAR_ID", - # "VECTOR", - # "COORDINATE", - "VALUE", - "STATUS", - # "SYMBOL", - # "TERMINATED", - # "DECIMALS", - # "GeoUID", - # "Hierarchy for GEO", - # "Classification Code for Educational qualification", - # "Hierarchy for Educational qualification", - # "Classification Code for Field of study", - # "Hierarchy for Field of study", - # "Classification Code for Gender", - # "Hierarchy for Gender", - # "Classification Code for Age group", - # "Hierarchy for Age group", - # "Classification Code for Status of student in Canada", - # "Hierarchy for Status of student in Canada", - # "Classification Code for Characteristics after graduation", - # "Hierarchy for Characteristics after graduation", - # "Classification Code for Graduate statistics", - # "Hierarchy for Graduate statistics", - # "val_norm", - # "Date", - "Educational qualification", - "Field of study", - "Gender", - "Age group", - "Status of student in Canada", - "Characteristics after graduation", - "Graduate statistics" - )) %>% - rename( - "geo_value" = "GEO", - "time_value" = "REF_DATE", - "value" = "VALUE", - "status" = "STATUS", - "edu_qual" = "Educational qualification", - "fos" = "Field of study", - "gender" = "Gender", - "age_group" = "Age group", - "student_status" = "Status of student in Canada", - "grad_charac" = "Characteristics after graduation", - "grad_stat" = "Graduate 
statistics" - ) %>% - mutate( - grad_stat = recode_factor( - grad_stat, - `Number of graduates` = "num_graduates", - `Median employment income two years after graduation` = "med_income_2y", - `Median employment income five years after graduation` = "med_income_5y" - ), - time_value = as.integer(time_value) - ) %>% - pivot_wider(names_from = grad_stat, values_from = value) %>% - filter( - # Drop aggregates for some columns - geo_value != "Canada" & - age_group != "15 to 64 years" & - edu_qual != "Total, educational qualification" & - # Keep aggregates for keys we don't want to keep - fos == "Total, field of study" & - gender == "Total, gender" & - student_status == "Canadian and international students" & - # Since we're looking at 2y and 5y employment income, the only - # characteristics remaining are: - # - Graduates reporting employment income - # - Graduates reporting wages, salaries, and commissions only - # For simplicity, keep the first one only - grad_charac == "Graduates reporting employment income" & - # Only keep "good" data - is.na(status) & - # Drop NA value rows - !is.na(num_graduates) & !is.na(med_income_2y) & !is.na(med_income_5y) - ) %>% - select(-c(status, gender, student_status, grad_charac, fos)) - -nrow(gemploy) -ncol(gemploy) - -grad_employ_subset <- gemploy %>% - as_epi_df( - as_of = "2022-07-19", - other_keys = c("age_group", "edu_qual") - ) -usethis::use_data(grad_employ_subset, overwrite = TRUE) diff --git a/data-raw/state_census.R b/data-raw/state_census.R deleted file mode 100644 index cfa74d38b..000000000 --- a/data-raw/state_census.R +++ /dev/null @@ -1,10 +0,0 @@ -library(dplyr) -library(tidyr) - -state_census <- readr::read_csv("https://github.com/cmu-delphi/covidcast/raw/c89e4d295550ba1540d64d2cc991badf63ad04e5/Python-packages/covidcast-py/covidcast/geo_mappings/state_census.csv") %>% # nolint: line_length_linter - select(STATE, NAME, POPESTIMATE2019, ABBR) %>% - rename(abbr = ABBR, name = NAME, pop = POPESTIMATE2019, fips = STATE) 
%>% - mutate(abbr = tolower(abbr)) %>% - as_tibble() - -usethis::use_data(state_census, overwrite = TRUE) diff --git a/data/case_death_rate_subset.rda b/data/case_death_rate_subset.rda deleted file mode 100644 index 2e5ced29e..000000000 Binary files a/data/case_death_rate_subset.rda and /dev/null differ diff --git a/data/grad_employ_subset.rda b/data/grad_employ_subset.rda deleted file mode 100644 index 9380b43b5..000000000 Binary files a/data/grad_employ_subset.rda and /dev/null differ diff --git a/data/state_census.rda b/data/state_census.rda deleted file mode 100644 index 1118db0d0..000000000 Binary files a/data/state_census.rda and /dev/null differ diff --git a/inst/extdata/can_prov_cases.rds b/inst/extdata/can_prov_cases.rds deleted file mode 100644 index b6a10a422..000000000 Binary files a/inst/extdata/can_prov_cases.rds and /dev/null differ diff --git a/inst/extdata/canada-case-rates.R b/inst/extdata/canada-case-rates.R deleted file mode 100644 index 7cf88d602..000000000 --- a/inst/extdata/canada-case-rates.R +++ /dev/null @@ -1,23 +0,0 @@ -path_to_csvs <- here::here("../../COVID-BC/Covid19Canada/updates.nosync/") -files <- list.files(path_to_csvs) -ca_as_ofs <- as.Date(substr(files, 1, 10)) %>% - intersect(fc_time_values) %>% - as.Date(origin = "1970-01-01") - -can <- purrr::map(ca_as_ofs, ~ { - readr::read_csv(here::here(path_to_csvs, paste0(.x, ".csv"))) %>% - left_join(ca_pop) %>% - mutate(time_value = lubridate::dmy(date_report)) %>% - filter(province %in% ca_pop$province, time_value > "2020-04-01") %>% - mutate( - geo_value = province, - case_rate = cases / population * 1e5 - ) %>% - select(geo_value, time_value, case_rate) %>% - as_epi_df(geo_type = "province", as_of = .x) -}) -names(can) <- ca_as_ofs -can <- can %>% - bind_rows(.id = "version") %>% - mutate(version = lubridate::ymd(version)) -saveRDS(can, "inst/extdata/can_prov_cases.rds") diff --git a/inst/extdata/epi_archive.rds b/inst/extdata/epi_archive.rds deleted file mode 100644 index 
8ca52be76..000000000 Binary files a/inst/extdata/epi_archive.rds and /dev/null differ diff --git a/man/Add_model.Rd b/man/Add_model.Rd index 17b65793c..641bd7676 100644 --- a/man/Add_model.Rd +++ b/man/Add_model.Rd @@ -72,7 +72,7 @@ properly. } \examples{ library(dplyr) -jhu <- case_death_rate_subset \%>\% +jhu <- covid_case_death_rates \%>\% filter(time_value > "2021-11-01", geo_value \%in\% c("ak", "ca", "ny")) r <- epi_recipe(jhu) \%>\% diff --git a/man/add_epi_recipe.Rd b/man/add_epi_recipe.Rd index 0da2d55b3..b74267524 100644 --- a/man/add_epi_recipe.Rd +++ b/man/add_epi_recipe.Rd @@ -41,7 +41,7 @@ default blueprint to automatically handle \link[epiprocess:epi_df]{epiprocess::e library(dplyr) library(recipes) -jhu <- case_death_rate_subset \%>\% +jhu <- covid_case_death_rates \%>\% filter(time_value > "2021-08-01") \%>\% arrange(geo_value, time_value) diff --git a/man/add_frosting.Rd b/man/add_frosting.Rd index 94812cbe2..00b899b7c 100644 --- a/man/add_frosting.Rd +++ b/man/add_frosting.Rd @@ -27,7 +27,7 @@ Add frosting to a workflow } \examples{ library(dplyr) -jhu <- case_death_rate_subset \%>\% +jhu <- covid_case_death_rates \%>\% filter(time_value > "2021-11-01", geo_value \%in\% c("ak", "ca", "ny")) r <- epi_recipe(jhu) \%>\% step_epi_lag(death_rate, lag = c(0, 7, 14)) \%>\% diff --git a/man/adjust_epi_recipe.Rd b/man/adjust_epi_recipe.Rd index 7468c4ce2..0ed7148a3 100644 --- a/man/adjust_epi_recipe.Rd +++ b/man/adjust_epi_recipe.Rd @@ -55,7 +55,7 @@ illustrations of the different types of updates. 
library(dplyr) library(workflows) -jhu <- case_death_rate_subset \%>\% +jhu <- covid_case_death_rates \%>\% filter(time_value > "2021-11-01", geo_value \%in\% c("ak", "ca", "ny")) r <- epi_recipe(jhu) \%>\% step_epi_lag(death_rate, lag = c(0, 7, 14)) \%>\% diff --git a/man/adjust_frosting.Rd b/man/adjust_frosting.Rd index c089b3443..3b855a9af 100644 --- a/man/adjust_frosting.Rd +++ b/man/adjust_frosting.Rd @@ -36,7 +36,7 @@ illustrations of the different types of updates. } \examples{ library(dplyr) -jhu <- case_death_rate_subset \%>\% +jhu <- covid_case_death_rates \%>\% filter(time_value > "2021-11-01", geo_value \%in\% c("ak", "ca", "ny")) r <- epi_recipe(jhu) \%>\% step_epi_lag(death_rate, lag = c(0, 7, 14)) \%>\% diff --git a/man/arx_class_epi_workflow.Rd b/man/arx_class_epi_workflow.Rd index 713365f17..9f0aae6a1 100644 --- a/man/arx_class_epi_workflow.Rd +++ b/man/arx_class_epi_workflow.Rd @@ -48,7 +48,7 @@ may alter the returned \code{epi_workflow} object but can be omitted. } \examples{ library(dplyr) -jhu <- case_death_rate_subset \%>\% +jhu <- covid_case_death_rates \%>\% filter(time_value >= as.Date("2021-11-01")) arx_class_epi_workflow(jhu, "death_rate", c("case_rate", "death_rate")) diff --git a/man/arx_classifier.Rd b/man/arx_classifier.Rd index c7c2cf059..94503f3d3 100644 --- a/man/arx_classifier.Rd +++ b/man/arx_classifier.Rd @@ -49,7 +49,7 @@ that it estimates a class at a particular target horizon. } \examples{ library(dplyr) -jhu <- case_death_rate_subset \%>\% +jhu <- covid_case_death_rates \%>\% filter(time_value >= as.Date("2021-11-01")) out <- arx_classifier(jhu, "death_rate", c("case_rate", "death_rate")) diff --git a/man/arx_fcast_epi_workflow.Rd b/man/arx_fcast_epi_workflow.Rd index 4070a3337..c2e38218f 100644 --- a/man/arx_fcast_epi_workflow.Rd +++ b/man/arx_fcast_epi_workflow.Rd @@ -43,7 +43,7 @@ use \code{\link[=quantile_reg]{quantile_reg()}}) but can be omitted. 
} \examples{ library(dplyr) -jhu <- case_death_rate_subset \%>\% +jhu <- covid_case_death_rates \%>\% filter(time_value >= as.Date("2021-12-01")) arx_fcast_epi_workflow( diff --git a/man/arx_forecaster.Rd b/man/arx_forecaster.Rd index d8c7671dc..ff820b8c8 100644 --- a/man/arx_forecaster.Rd +++ b/man/arx_forecaster.Rd @@ -41,7 +41,7 @@ This is an autoregressive forecasting model for that it estimates a model for a particular target horizon. } \examples{ -jhu <- case_death_rate_subset \%>\% +jhu <- covid_case_death_rates \%>\% dplyr::filter(time_value >= as.Date("2021-12-01")) out <- arx_forecaster( diff --git a/man/autoplot-epipred.Rd b/man/autoplot-epipred.Rd index 27bfdf5f7..1025759b3 100644 --- a/man/autoplot-epipred.Rd +++ b/man/autoplot-epipred.Rd @@ -71,7 +71,7 @@ can simply call \code{autoplot()} on the original \code{epi_df}). } \examples{ library(dplyr) -jhu <- case_death_rate_subset \%>\% +jhu <- covid_case_death_rates \%>\% filter(time_value >= as.Date("2021-11-01")) r <- epi_recipe(jhu) \%>\% @@ -112,7 +112,7 @@ autoplot(wf, p, .max_facets = 4) # ------- Plotting canned forecaster output -jhu <- case_death_rate_subset \%>\% +jhu <- covid_case_death_rates \%>\% filter(time_value >= as.Date("2021-11-01")) flat <- flatline_forecaster(jhu, "death_rate") autoplot(flat, .max_facets = 4) diff --git a/man/case_death_rate_subset.Rd b/man/case_death_rate_subset.Rd deleted file mode 100644 index 119c8ee26..000000000 --- a/man/case_death_rate_subset.Rd +++ /dev/null @@ -1,49 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/data.R -\docType{data} -\name{case_death_rate_subset} -\alias{case_death_rate_subset} -\title{Subset of JHU daily state cases and deaths} -\format{ -A tibble with 20,496 rows and 4 variables: -\describe{ -\item{geo_value}{the geographic value associated with each row -of measurements.} -\item{time_value}{the time value associated with each row of measurements.} -\item{case_rate}{7-day average signal of number 
of new -confirmed COVID-19 cases per 100,000 population, daily} -\item{death_rate}{7-day average signal of number of new confirmed -deaths due to COVID-19 per 100,000 population, daily} -} -} -\source{ -This object contains a modified part of the -\href{https://github.com/CSSEGISandData/COVID-19}{COVID-19 Data Repository by the Center for Systems Science and Engineering (CSSE) at Johns Hopkins University} -as \href{https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html}{republished in the COVIDcast Epidata API}. -This data set is licensed under the terms of the -\href{https://creativecommons.org/licenses/by/4.0/}{Creative Commons Attribution 4.0 International license} -by the Johns Hopkins University on behalf of its Center for Systems Science -in Engineering. Copyright Johns Hopkins University 2020. - -Modifications: -\itemize{ -\item \href{https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html}{From the COVIDcast Epidata API}: -These signals are taken directly from the JHU CSSE -\href{https://github.com/CSSEGISandData/COVID-19}{COVID-19 GitHub repository} -without changes. The 7-day average signals are computed by Delphi by -calculating moving averages of the preceding 7 days, so the signal for -June 7 is the average of the underlying data for June 1 through 7, -inclusive. -} -} -\usage{ -case_death_rate_subset -} -\description{ -This data source of confirmed COVID-19 cases and deaths -is based on reports made available by the Center for -Systems Science and Engineering at Johns Hopkins University. -This example data ranges from Dec 31, 2020 to Dec 31, 2021, -and includes all states. 
-} -\keyword{datasets} diff --git a/man/cdc_baseline_forecaster.Rd b/man/cdc_baseline_forecaster.Rd index 0c7f1e436..e7cefda2d 100644 --- a/man/cdc_baseline_forecaster.Rd +++ b/man/cdc_baseline_forecaster.Rd @@ -38,7 +38,7 @@ This forecaster is meant to produce exactly the CDC Baseline used for } \examples{ library(dplyr) -weekly_deaths <- case_death_rate_subset \%>\% +weekly_deaths <- covid_case_death_rates \%>\% select(geo_value, time_value, death_rate) \%>\% left_join(state_census \%>\% select(pop, abbr), by = c("geo_value" = "abbr")) \%>\% mutate(deaths = pmax(death_rate / 1e5 * pop * 7, 0)) \%>\% diff --git a/man/epi_recipe.Rd b/man/epi_recipe.Rd index d0105d1ec..9ef5eb288 100644 --- a/man/epi_recipe.Rd +++ b/man/epi_recipe.Rd @@ -59,7 +59,7 @@ columns present in an \code{epi_df} \examples{ library(dplyr) library(recipes) -jhu <- case_death_rate_subset \%>\% +jhu <- covid_case_death_rates \%>\% filter(time_value > "2021-08-01") \%>\% arrange(geo_value, time_value) diff --git a/man/epi_workflow.Rd b/man/epi_workflow.Rd index b29078d52..59e3d5c8f 100644 --- a/man/epi_workflow.Rd +++ b/man/epi_workflow.Rd @@ -33,7 +33,7 @@ this operates exactly like a \code{\link[workflows:workflow]{workflows::workflow and numerous examples, see there. } \examples{ -jhu <- case_death_rate_subset +jhu <- covid_case_death_rates r <- epi_recipe(jhu) \%>\% step_epi_lag(death_rate, lag = c(0, 7, 14)) \%>\% diff --git a/man/fit-epi_workflow.Rd b/man/fit-epi_workflow.Rd index 3dfa0029a..83b3b9f51 100644 --- a/man/fit-epi_workflow.Rd +++ b/man/fit-epi_workflow.Rd @@ -28,7 +28,7 @@ Fitting an \code{epi_workflow} involves two main steps, which are preprocessing the data and fitting the underlying parsnip model. 
} \examples{ -jhu <- case_death_rate_subset \%>\% +jhu <- covid_case_death_rates \%>\% filter(time_value > "2021-11-01", geo_value \%in\% c("ak", "ca", "ny")) r <- epi_recipe(jhu) \%>\% diff --git a/man/flatline_forecaster.Rd b/man/flatline_forecaster.Rd index 1803f1078..f70c05e0f 100644 --- a/man/flatline_forecaster.Rd +++ b/man/flatline_forecaster.Rd @@ -35,7 +35,7 @@ This forecaster is very similar to that used by the \href{https://covid19forecasthub.org}{COVID19ForecastHub} } \examples{ -jhu <- case_death_rate_subset \%>\% +jhu <- covid_case_death_rates \%>\% dplyr::filter(time_value >= as.Date("2021-12-01")) out <- flatline_forecaster(jhu, "death_rate") diff --git a/man/flusight_hub_formatter.Rd b/man/flusight_hub_formatter.Rd index b2be9b4fe..f48d33d65 100644 --- a/man/flusight_hub_formatter.Rd +++ b/man/flusight_hub_formatter.Rd @@ -42,7 +42,7 @@ format for this forecast task is \href{https://github.com/cdcepi/FluSight-foreca } \examples{ library(dplyr) -weekly_deaths <- case_death_rate_subset \%>\% +weekly_deaths <- covid_case_death_rates \%>\% filter( time_value >= as.Date("2021-09-01"), geo_value \%in\% c("ca", "ny", "dc", "ga", "vt") diff --git a/man/frosting.Rd b/man/frosting.Rd index a75f21b61..8534bc6d6 100644 --- a/man/frosting.Rd +++ b/man/frosting.Rd @@ -28,7 +28,7 @@ f <- frosting() wf <- epi_workflow() \%>\% add_frosting(f) # A more realistic example -jhu <- case_death_rate_subset \%>\% +jhu <- covid_case_death_rates \%>\% filter(time_value > "2021-11-01", geo_value \%in\% c("ak", "ca", "ny")) r <- epi_recipe(jhu) \%>\% diff --git a/man/get_test_data.Rd b/man/get_test_data.Rd index 81649452a..16359b9c3 100644 --- a/man/get_test_data.Rd +++ b/man/get_test_data.Rd @@ -31,9 +31,9 @@ calculated internally. 
} \examples{ # create recipe -rec <- epi_recipe(case_death_rate_subset) \%>\% +rec <- epi_recipe(covid_case_death_rates) \%>\% step_epi_ahead(death_rate, ahead = 7) \%>\% step_epi_lag(death_rate, lag = c(0, 7, 14)) \%>\% step_epi_lag(case_rate, lag = c(0, 7, 14)) -get_test_data(recipe = rec, x = case_death_rate_subset) +get_test_data(recipe = rec, x = covid_case_death_rates) } diff --git a/man/grad_employ_subset.Rd b/man/grad_employ_subset.Rd deleted file mode 100644 index 46ba36913..000000000 --- a/man/grad_employ_subset.Rd +++ /dev/null @@ -1,44 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/data.R -\docType{data} -\name{grad_employ_subset} -\alias{grad_employ_subset} -\title{Subset of Statistics Canada median employment income for postsecondary graduates} -\format{ -An \link[epiprocess:epi_df]{epiprocess::epi_df} with 10193 rows and 8 variables: -\describe{ -\item{geo_value}{The province in Canada associated with each -row of measurements.} -\item{time_value}{The time value, a year integer in YYYY format} -\item{edu_qual}{The education qualification} -\item{fos}{The field of study} -\item{age_group}{The age group; either 15 to 34 or 35 to 64} -\item{num_graduates}{The number of graduates for the given row of characteristics} -\item{med_income_2y}{The median employment income two years after graduation} -\item{med_income_5y}{The median employment income five years after graduation} -} -} -\source{ -This object contains modified data from the following Statistics Canada -data table: \href{https://www150.statcan.gc.ca/t1/tbl1/en/tv.action?pid=3710011501}{ -Characteristics and median employment income of longitudinal cohorts of postsecondary -graduates two and five years after graduation, by educational qualification and -field of study (primary groupings) -} - -Modifications: -\itemize{ -\item Only provincial-level geo_values are kept -\item Only age group, field of study, and educational qualification are kept as 
-covariates. For the remaining covariates, we keep aggregated values and -drop the level-specific rows. -\item No modifications were made to the time range of the data -} -} -\usage{ -grad_employ_subset -} -\description{ -Subset of Statistics Canada median employment income for postsecondary graduates -} -\keyword{datasets} diff --git a/man/grf_quantiles.Rd b/man/grf_quantiles.Rd index e6852a55b..f6400edcf 100644 --- a/man/grf_quantiles.Rd +++ b/man/grf_quantiles.Rd @@ -85,7 +85,7 @@ predict(out, new_data = tib[1:5, ]) \%>\% # -- a more complicated task library(dplyr) -dat <- case_death_rate_subset \%>\% +dat <- covid_case_death_rates \%>\% filter(time_value > as.Date("2021-10-01")) rec <- epi_recipe(dat) \%>\% step_epi_lag(case_rate, death_rate, lag = c(0, 7, 14)) \%>\% diff --git a/man/layer_add_forecast_date.Rd b/man/layer_add_forecast_date.Rd index aa224013f..cc92e9a71 100644 --- a/man/layer_add_forecast_date.Rd +++ b/man/layer_add_forecast_date.Rd @@ -39,7 +39,7 @@ model fitting, and postprocessing), an appropriate warning will be thrown. } \examples{ library(dplyr) -jhu <- case_death_rate_subset \%>\% +jhu <- covid_case_death_rates \%>\% filter(time_value > "2021-11-01", geo_value \%in\% c("ak", "ca", "ny")) r <- epi_recipe(jhu) \%>\% step_epi_lag(death_rate, lag = c(0, 7, 14)) \%>\% diff --git a/man/layer_add_target_date.Rd b/man/layer_add_target_date.Rd index e522cd6da..37416d24e 100644 --- a/man/layer_add_target_date.Rd +++ b/man/layer_add_target_date.Rd @@ -44,7 +44,7 @@ every dataset used (prep, training, and prediction). 
} \examples{ library(dplyr) -jhu <- case_death_rate_subset \%>\% +jhu <- covid_case_death_rates \%>\% filter(time_value > "2021-11-01", geo_value \%in\% c("ak", "ca", "ny")) r <- epi_recipe(jhu) \%>\% step_epi_lag(death_rate, lag = c(0, 7, 14)) \%>\% diff --git a/man/layer_cdc_flatline_quantiles.Rd b/man/layer_cdc_flatline_quantiles.Rd index c3bc4f257..632fdb65e 100644 --- a/man/layer_cdc_flatline_quantiles.Rd +++ b/man/layer_cdc_flatline_quantiles.Rd @@ -85,14 +85,14 @@ adds them on to produce wider intervals as \code{ahead} increases. } \examples{ library(dplyr) -r <- epi_recipe(case_death_rate_subset) \%>\% +r <- epi_recipe(covid_case_death_rates) \%>\% # data is "daily", so we fit this to 1 ahead, the result will contain # 1 day ahead residuals step_epi_ahead(death_rate, ahead = 1L, skip = TRUE) \%>\% recipes::update_role(death_rate, new_role = "predictor") \%>\% recipes::add_role(time_value, geo_value, new_role = "predictor") -forecast_date <- max(case_death_rate_subset$time_value) +forecast_date <- max(covid_case_death_rates$time_value) f <- frosting() \%>\% layer_predict() \%>\% @@ -100,7 +100,7 @@ f <- frosting() \%>\% eng <- linear_reg(engine = "flatline") -wf <- epi_workflow(r, eng, f) \%>\% fit(case_death_rate_subset) +wf <- epi_workflow(r, eng, f) \%>\% fit(covid_case_death_rates) preds <- forecast(wf) \%>\% select(-time_value) \%>\% mutate(forecast_date = forecast_date) @@ -120,7 +120,7 @@ if (require("ggplot2")) { geom_ribbon(aes(ymin = `0.25`, ymax = `0.75`), fill = blues9[6]) + geom_line(aes(y = .pred), color = "orange") + geom_line( - data = case_death_rate_subset \%>\% filter(geo_value \%in\% four_states), + data = covid_case_death_rates \%>\% filter(geo_value \%in\% four_states), aes(x = time_value, y = death_rate) ) + scale_x_date(limits = c(forecast_date - 90, forecast_date + 30)) + diff --git a/man/layer_naomit.Rd b/man/layer_naomit.Rd index d77112f95..06e09d4b0 100644 --- a/man/layer_naomit.Rd +++ b/man/layer_naomit.Rd @@ -25,7 +25,7 @@ Omit 
\code{NA}s from predictions or other columns } \examples{ library(dplyr) -jhu <- case_death_rate_subset \%>\% +jhu <- covid_case_death_rates \%>\% filter(time_value > "2021-11-01", geo_value \%in\% c("ak", "ca", "ny")) r <- epi_recipe(jhu) \%>\% diff --git a/man/layer_point_from_distn.Rd b/man/layer_point_from_distn.Rd index 276f7cb17..bde2323b1 100644 --- a/man/layer_point_from_distn.Rd +++ b/man/layer_point_from_distn.Rd @@ -35,7 +35,7 @@ or set the \code{name} argument to something specific. } \examples{ library(dplyr) -jhu <- case_death_rate_subset \%>\% +jhu <- covid_case_death_rates \%>\% filter(time_value > "2021-11-01", geo_value \%in\% c("ak", "ca", "ny")) r <- epi_recipe(jhu) \%>\% diff --git a/man/layer_predict.Rd b/man/layer_predict.Rd index 8ae92f4c8..db771b882 100644 --- a/man/layer_predict.Rd +++ b/man/layer_predict.Rd @@ -59,7 +59,7 @@ postprocessor. } \examples{ library(dplyr) -jhu <- case_death_rate_subset \%>\% +jhu <- covid_case_death_rates \%>\% filter(time_value > "2021-11-01", geo_value \%in\% c("ak", "ca", "ny")) r <- epi_recipe(jhu) \%>\% diff --git a/man/layer_predictive_distn.Rd b/man/layer_predictive_distn.Rd index 240db5f5b..3bd95425b 100644 --- a/man/layer_predictive_distn.Rd +++ b/man/layer_predictive_distn.Rd @@ -40,7 +40,7 @@ should be reasonably accurate for models fit using \code{lm} when the new point } \examples{ library(dplyr) -jhu <- case_death_rate_subset \%>\% +jhu <- covid_case_death_rates \%>\% filter(time_value > "2021-11-01", geo_value \%in\% c("ak", "ca", "ny")) r <- epi_recipe(jhu) \%>\% diff --git a/man/layer_quantile_distn.Rd b/man/layer_quantile_distn.Rd index 68192deee..3a5cb60e2 100644 --- a/man/layer_quantile_distn.Rd +++ b/man/layer_quantile_distn.Rd @@ -46,7 +46,7 @@ If these engines were used, then this layer will grab out estimated } \examples{ library(dplyr) -jhu <- case_death_rate_subset \%>\% +jhu <- covid_case_death_rates \%>\% filter(time_value > "2021-11-01", geo_value \%in\% c("ak", "ca", "ny")) r <- 
epi_recipe(jhu) \%>\% diff --git a/man/layer_residual_quantiles.Rd b/man/layer_residual_quantiles.Rd index 39e1ecfbe..a7deded71 100644 --- a/man/layer_residual_quantiles.Rd +++ b/man/layer_residual_quantiles.Rd @@ -40,7 +40,7 @@ Creates predictions based on residual quantiles } \examples{ library(dplyr) -jhu <- case_death_rate_subset \%>\% +jhu <- covid_case_death_rates \%>\% filter(time_value > "2021-11-01", geo_value \%in\% c("ak", "ca", "ny")) r <- epi_recipe(jhu) \%>\% diff --git a/man/layer_threshold.Rd b/man/layer_threshold.Rd index 0f4b1dfb7..702c5d713 100644 --- a/man/layer_threshold.Rd +++ b/man/layer_threshold.Rd @@ -41,7 +41,7 @@ to the threshold values. } \examples{ library(dplyr) -jhu <- case_death_rate_subset \%>\% +jhu <- covid_case_death_rates \%>\% filter(time_value < "2021-03-08", geo_value \%in\% c("ak", "ca", "ar")) r <- epi_recipe(jhu) \%>\% step_epi_lag(death_rate, lag = c(0, 7, 14)) \%>\% diff --git a/man/nested_quantiles.Rd b/man/nested_quantiles.Rd index b34b718ca..0fa0fe8cc 100644 --- a/man/nested_quantiles.Rd +++ b/man/nested_quantiles.Rd @@ -18,7 +18,7 @@ Turn a vector of quantile distributions into a list-col \examples{ library(dplyr) library(tidyr) -edf <- case_death_rate_subset[1:3, ] +edf <- covid_case_death_rates[1:3, ] edf$q <- dist_quantiles(list(1:5, 2:4, 3:10), list(1:5 / 6, 2:4 / 5, 3:10 / 11)) edf_nested <- edf \%>\% mutate(q = nested_quantiles(q)) diff --git a/man/predict-epi_workflow.Rd b/man/predict-epi_workflow.Rd index 130279249..0b605d556 100644 --- a/man/predict-epi_workflow.Rd +++ b/man/predict-epi_workflow.Rd @@ -66,7 +66,7 @@ possible. 
Specifically, the output will have \code{time_value} and } } \examples{ -jhu <- case_death_rate_subset +jhu <- covid_case_death_rates r <- epi_recipe(jhu) \%>\% step_epi_lag(death_rate, lag = c(0, 7, 14)) \%>\% diff --git a/man/state_census.Rd b/man/state_census.Rd deleted file mode 100644 index eec13eb53..000000000 --- a/man/state_census.Rd +++ /dev/null @@ -1,33 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/data.R -\docType{data} -\name{state_census} -\alias{state_census} -\title{State population data} -\format{ -Data frame with 57 rows (including one for the United States as a -whole, plus the District of Columbia, Puerto Rico Commonwealth, -American Samoa, Guam, the U.S. Virgin Islands, and the Northern Mariana, -Islands). - -\describe{ -\item{fips}{FIPS code} -\item{name}{Full name of the state or territory} -\item{pop}{Estimate of the location's resident population in -2019.} -\item{abbr}{Postal abbreviation for the location} -} -} -\source{ -United States Census Bureau, at -\url{https://www2.census.gov/programs-surveys/popest/datasets/2010-2019/counties/totals/co-est2019-alldata.pdf}, -\url{https://www.census.gov/data/tables/time-series/demo/popest/2010s-total-puerto-rico-municipios.html}, -and \url{https://www.census.gov/data/tables/2010/dec/2010-island-areas.html} -} -\usage{ -state_census -} -\description{ -Data set on state populations, from the 2019 US Census. -} -\keyword{datasets} diff --git a/man/step_adjust_latency.Rd b/man/step_adjust_latency.Rd index af733fcce..1a6770428 100644 --- a/man/step_adjust_latency.Rd +++ b/man/step_adjust_latency.Rd @@ -267,8 +267,8 @@ while this will not: \if{html}{\out{
}}\preformatted{toy_recipe <- epi_recipe(toy_df) \%>\% step_epi_lag(a, lag=0) \%>\% step_adjust_latency(a, method = "extend_lags") -#> Warning: If `method` is "extend_lags" or "locf", then the previous `step_epi_lag`s won't -#> work with modified data. +#> Warning: If `method` is "extend_lags" or "locf", then the previous +#> `step_epi_lag`s won't work with modified data. }\if{html}{\out{
}} If you create columns that you then apply lags to (such as @@ -277,12 +277,12 @@ If you create columns that you then apply lags to (such as } \examples{ -jhu <- case_death_rate_subset \%>\% +jhu <- covid_case_death_rates \%>\% dplyr::filter(time_value > "2021-11-01", geo_value \%in\% c("ak", "ca", "ny")) # setting the `as_of` to something realistic attributes(jhu)$metadata$as_of <- max(jhu$time_value) + 3 -r <- epi_recipe(case_death_rate_subset) \%>\% +r <- epi_recipe(covid_case_death_rates) \%>\% step_adjust_latency(method = "extend_ahead") \%>\% step_epi_ahead(death_rate, ahead = 7) \%>\% step_epi_lag(death_rate, lag = c(0, 7, 14)) diff --git a/man/step_epi_naomit.Rd b/man/step_epi_naomit.Rd index b579dd6d6..faf7484da 100644 --- a/man/step_epi_naomit.Rd +++ b/man/step_epi_naomit.Rd @@ -19,7 +19,7 @@ of data loss. Unified NA omission wrapper function for recipes } \examples{ -case_death_rate_subset \%>\% +covid_case_death_rates \%>\% epi_recipe() \%>\% step_epi_naomit() } diff --git a/man/step_epi_shift.Rd b/man/step_epi_shift.Rd index 30ac05d16..867410360 100644 --- a/man/step_epi_shift.Rd +++ b/man/step_epi_shift.Rd @@ -78,7 +78,7 @@ are always set to \code{"ahead_"} and \code{"epi_ahead"} respectively, while for \code{step_epi_lag}, they are set to \code{"lag_"} and \verb{"epi_lag}, respectively. } \examples{ -r <- epi_recipe(case_death_rate_subset) \%>\% +r <- epi_recipe(covid_case_death_rates) \%>\% step_epi_ahead(death_rate, ahead = 7) \%>\% step_epi_lag(death_rate, lag = c(0, 7, 14)) r diff --git a/man/step_epi_slide.Rd b/man/step_epi_slide.Rd index 242f8e312..b8e4cedb1 100644 --- a/man/step_epi_slide.Rd +++ b/man/step_epi_slide.Rd @@ -81,7 +81,7 @@ a computation along existing data. 
} \examples{ library(dplyr) -jhu <- case_death_rate_subset \%>\% +jhu <- covid_case_death_rates \%>\% filter(time_value >= as.Date("2021-01-01"), geo_value \%in\% c("ca", "ny")) rec <- epi_recipe(jhu) \%>\% step_epi_slide(case_rate, death_rate, diff --git a/man/step_growth_rate.Rd b/man/step_growth_rate.Rd index 752b38dbe..12963f8da 100644 --- a/man/step_growth_rate.Rd +++ b/man/step_growth_rate.Rd @@ -73,13 +73,13 @@ sequence of any existing operations. that will generate one or more new columns of derived data. } \examples{ -r <- epi_recipe(case_death_rate_subset) \%>\% +r <- epi_recipe(covid_case_death_rates) \%>\% step_growth_rate(case_rate, death_rate) r r \%>\% - prep(case_death_rate_subset) \%>\% - bake(case_death_rate_subset) + prep(covid_case_death_rates) \%>\% + bake(new_data = NULL) } \seealso{ Other row operation steps: diff --git a/man/step_lag_difference.Rd b/man/step_lag_difference.Rd index e8ec2101a..6151bee84 100644 --- a/man/step_lag_difference.Rd +++ b/man/step_lag_difference.Rd @@ -47,14 +47,14 @@ sequence of any existing operations. that will generate one or more new columns of derived data. } \examples{ -r <- epi_recipe(case_death_rate_subset) \%>\% +r <- epi_recipe(covid_case_death_rates) \%>\% step_lag_difference(case_rate, death_rate, horizon = c(7, 14)) \%>\% step_epi_naomit() r r \%>\% - prep(case_death_rate_subset) \%>\% - bake(case_death_rate_subset) + prep(covid_case_death_rates) \%>\% + bake(new_data = NULL) } \seealso{ Other row operation steps: diff --git a/man/tidy.frosting.Rd b/man/tidy.frosting.Rd index ba3c0f3d5..8152b1440 100644 --- a/man/tidy.frosting.Rd +++ b/man/tidy.frosting.Rd @@ -38,7 +38,7 @@ version of the \code{tidy} method for a recipe. 
} \examples{ library(dplyr) -jhu <- case_death_rate_subset \%>\% +jhu <- covid_case_death_rates \%>\% filter(time_value > "2021-11-01", geo_value \%in\% c("ak", "ca", "ny")) r <- epi_recipe(jhu) \%>\% diff --git a/man/update.layer.Rd b/man/update.layer.Rd index 9604992e1..f151beea9 100644 --- a/man/update.layer.Rd +++ b/man/update.layer.Rd @@ -19,7 +19,7 @@ Analogous to \code{update.step()} from the \code{recipes} package. } \examples{ library(dplyr) -jhu <- case_death_rate_subset \%>\% +jhu <- covid_case_death_rates \%>\% filter(time_value > "2021-11-01", geo_value \%in\% c("ak", "ca", "ny")) r <- epi_recipe(jhu) \%>\% step_epi_lag(death_rate, lag = c(0, 7, 14)) \%>\% diff --git a/man/weighted_interval_score.Rd b/man/weighted_interval_score.Rd index 4907e2724..4aac20e7d 100644 --- a/man/weighted_interval_score.Rd +++ b/man/weighted_interval_score.Rd @@ -80,13 +80,13 @@ weighted_interval_score(dist_quantiles(1:4, 1:4 / 5), 2.5, 1:9 / 10, # Using some actual forecasts -------- library(dplyr) -jhu <- case_death_rate_subset \%>\% +jhu <- covid_case_death_rates \%>\% filter(time_value >= "2021-10-01", time_value <= "2021-12-01") preds <- flatline_forecaster( jhu, "death_rate", flatline_args_list(quantile_levels = c(.01, .025, 1:19 / 20, .975, .99)) )$predictions -actuals <- case_death_rate_subset \%>\% +actuals <- covid_case_death_rates \%>\% filter(time_value == as.Date("2021-12-01") + 7) \%>\% select(geo_value, time_value, actual = death_rate) preds <- left_join(preds, actuals, diff --git a/tests/testthat/_snaps/arg_is_.md b/tests/testthat/_snaps/arg_is_.md index 9250f1707..f05ca780d 100644 --- a/tests/testthat/_snaps/arg_is_.md +++ b/tests/testthat/_snaps/arg_is_.md @@ -377,7 +377,7 @@ # simple surface step test Code - epi_recipe(case_death_rate_subset) %>% step_epi_lag(death_rate, lag = "hello") + epi_recipe(covid_case_death_rates) %>% step_epi_lag(death_rate, lag = "hello") Condition Error in `step_epi_lag()`: ! `lag` must be a non-negative integer. 
diff --git a/tests/testthat/_snaps/get_test_data.md b/tests/testthat/_snaps/get_test_data.md index e65b0715c..22d0c942a 100644 --- a/tests/testthat/_snaps/get_test_data.md +++ b/tests/testthat/_snaps/get_test_data.md @@ -1,7 +1,7 @@ # expect insufficient training data error Code - get_test_data(recipe = r, x = case_death_rate_subset) + get_test_data(recipe = r, x = covid_case_death_rates) Condition Error in `get_test_data()`: ! You supplied insufficient recent data for this recipe. diff --git a/tests/testthat/_snaps/snapshots.md b/tests/testthat/_snaps/snapshots.md index f3e7e5737..a03a8dd43 100644 --- a/tests/testthat/_snaps/snapshots.md +++ b/tests/testthat/_snaps/snapshots.md @@ -1093,6 +1093,7 @@ Training data was an with: * Geography: state, + * Other keys: , * Time type: day, * Using data up-to-date as of: 2022-05-31. * With the last data available on 2021-12-31 @@ -1116,6 +1117,7 @@ Training data was an with: * Geography: state, + * Other keys: , * Time type: day, * Using data up-to-date as of: 2022-05-31. * With the last data available on 2021-12-31 @@ -1140,6 +1142,7 @@ Training data was an with: * Geography: state, + * Other keys: , * Time type: day, * Using data up-to-date as of: 2022-05-31. 
* With the last data available on 2021-12-31 diff --git a/tests/testthat/_snaps/step_adjust_latency.md b/tests/testthat/_snaps/step_adjust_latency.md index e37ae07ea..8d09248f3 100644 --- a/tests/testthat/_snaps/step_adjust_latency.md +++ b/tests/testthat/_snaps/step_adjust_latency.md @@ -63,7 +63,7 @@ --- Code - prep(r6, case_death_rate_subset) + prep(r6, covid_case_death_rates) Message -- Epi Recipe ------------------------------------------------------------------ diff --git a/tests/testthat/test-arg_is_.R b/tests/testthat/test-arg_is_.R index a1606f021..f043328c7 100644 --- a/tests/testthat/test-arg_is_.R +++ b/tests/testthat/test-arg_is_.R @@ -149,7 +149,7 @@ test_that("coerce scalar to date", { test_that("simple surface step test", { expect_snapshot( error = TRUE, - epi_recipe(case_death_rate_subset) %>% + epi_recipe(covid_case_death_rates) %>% step_epi_lag(death_rate, lag = "hello") ) }) diff --git a/tests/testthat/test-bake-method.R b/tests/testthat/test-bake-method.R index 06f861012..8e118a18d 100644 --- a/tests/testthat/test-bake-method.R +++ b/tests/testthat/test-bake-method.R @@ -1,5 +1,5 @@ test_that("bake method works in all cases", { - edf <- case_death_rate_subset %>% + edf <- covid_case_death_rates %>% filter(time_value > "2021-11-01", geo_value %in% c("ak", "ca", "ny")) r <- epi_recipe(edf) %>% step_epi_lag(death_rate, lag = c(0, 7, 14)) %>% diff --git a/tests/testthat/test-blueprint.R b/tests/testthat/test-blueprint.R index 2d22aff6e..b37bd5e4a 100644 --- a/tests/testthat/test-blueprint.R +++ b/tests/testthat/test-blueprint.R @@ -4,7 +4,7 @@ test_that("epi_recipe blueprint keeps the class, mold works", { expect_s3_class(bp, "default_epi_recipe_blueprint") expect_s3_class(refresh_blueprint(bp), "default_epi_recipe_blueprint") - jhu <- case_death_rate_subset + jhu <- covid_case_death_rates # expect_s3_class(er_check_is_data_like(jhu), "epi_df") r <- epi_recipe(jhu) %>% diff --git a/tests/testthat/test-epi_recipe.R b/tests/testthat/test-epi_recipe.R 
index 1b06cf24c..b4c59c0e5 100644 --- a/tests/testthat/test-epi_recipe.R +++ b/tests/testthat/test-epi_recipe.R @@ -103,7 +103,7 @@ test_that("epi_recipe epi_df works", { test_that("add/update/adjust/remove epi_recipe works as intended", { - jhu <- case_death_rate_subset + jhu <- covid_case_death_rates r <- epi_recipe(jhu) %>% step_epi_lag(death_rate, lag = c(0, 7, 14)) %>% diff --git a/tests/testthat/test-epi_workflow.R b/tests/testthat/test-epi_workflow.R index af6ef39ca..cce68a80f 100644 --- a/tests/testthat/test-epi_workflow.R +++ b/tests/testthat/test-epi_workflow.R @@ -1,5 +1,5 @@ test_that("postprocesser was evaluated", { - r <- epi_recipe(case_death_rate_subset) + r <- epi_recipe(covid_case_death_rates) s <- parsnip::linear_reg() f <- frosting() @@ -12,7 +12,7 @@ test_that("postprocesser was evaluated", { test_that("outcome of the two methods are the same", { - jhu <- case_death_rate_subset + jhu <- covid_case_death_rates r <- epi_recipe(jhu) %>% step_epi_lag(death_rate, lag = c(0, 7)) %>% @@ -33,7 +33,7 @@ test_that("outcome of the two methods are the same", { }) test_that("model can be added/updated/removed from epi_workflow", { - jhu <- case_death_rate_subset %>% + jhu <- covid_case_death_rates %>% dplyr::filter(time_value > "2021-11-01", geo_value %in% c("ak", "ca", "ny")) r <- epi_recipe(jhu) %>% @@ -64,7 +64,7 @@ test_that("model can be added/updated/removed from epi_workflow", { }) test_that("forecast method works", { - jhu <- case_death_rate_subset %>% + jhu <- covid_case_death_rates %>% filter(time_value > "2021-11-01", geo_value %in% c("ak", "ca", "ny")) r <- epi_recipe(jhu) %>% step_epi_lag(death_rate, lag = c(0, 7, 14)) %>% @@ -89,7 +89,7 @@ test_that("forecast method works", { }) test_that("forecast method errors when workflow not fit", { - jhu <- case_death_rate_subset %>% + jhu <- covid_case_death_rates %>% filter(time_value > "2021-11-01", geo_value %in% c("ak", "ca", "ny")) r <- epi_recipe(jhu) %>% step_epi_lag(death_rate, lag = c(0, 7, 
14)) %>% diff --git a/tests/testthat/test-extract_argument.R b/tests/testthat/test-extract_argument.R index 7434763e7..7ac160e67 100644 --- a/tests/testthat/test-extract_argument.R +++ b/tests/testthat/test-extract_argument.R @@ -28,7 +28,7 @@ test_that("layer argument extractor works", { }) test_that("recipe argument extractor works", { - jhu <- case_death_rate_subset %>% + jhu <- covid_case_death_rates %>% dplyr::filter(time_value > "2021-08-01") %>% dplyr::arrange(geo_value, time_value) diff --git a/tests/testthat/test-frosting.R b/tests/testthat/test-frosting.R index 1bdce3b5a..cd153b200 100644 --- a/tests/testthat/test-frosting.R +++ b/tests/testthat/test-frosting.R @@ -40,7 +40,7 @@ test_that("frosting can be created/added/updated/adjusted/removed", { test_that("prediction works without any postprocessor", { - jhu <- case_death_rate_subset %>% + jhu <- covid_case_death_rates %>% dplyr::filter(time_value > "2021-11-01", geo_value %in% c("ak", "ca", "ny")) r <- epi_recipe(jhu) %>% step_epi_lag(death_rate, lag = c(0, 7, 14)) %>% @@ -62,7 +62,7 @@ test_that("prediction works without any postprocessor", { test_that("layer_predict is added by default if missing", { - jhu <- case_death_rate_subset %>% + jhu <- covid_case_death_rates %>% dplyr::filter(time_value > "2021-11-01", geo_value %in% c("ak", "ca", "ny")) r <- epi_recipe(jhu) %>% @@ -89,7 +89,7 @@ test_that("layer_predict is added by default if missing", { test_that("parsnip settings can be passed through predict.epi_workflow", { - jhu <- case_death_rate_subset %>% + jhu <- covid_case_death_rates %>% dplyr::filter(time_value > "2021-11-01", geo_value %in% c("ak", "ca", "ny")) r <- epi_recipe(jhu) %>% diff --git a/tests/testthat/test-get_test_data.R b/tests/testthat/test-get_test_data.R index 5f315c499..7822f5433 100644 --- a/tests/testthat/test-get_test_data.R +++ b/tests/testthat/test-get_test_data.R @@ -1,17 +1,17 @@ suppressPackageStartupMessages(library(dplyr)) test_that("return expected number of rows 
and returned dataset is ungrouped", { - r <- epi_recipe(case_death_rate_subset) %>% + r <- epi_recipe(covid_case_death_rates) %>% step_epi_ahead(death_rate, ahead = 7) %>% step_epi_lag(death_rate, lag = c(0, 7, 14, 21, 28)) %>% step_epi_lag(case_rate, lag = c(0, 7, 14)) %>% step_naomit(all_predictors()) %>% step_naomit(all_outcomes(), skip = TRUE) - test <- get_test_data(recipe = r, x = case_death_rate_subset) + test <- get_test_data(recipe = r, x = covid_case_death_rates) expect_equal( nrow(test), - dplyr::n_distinct(case_death_rate_subset$geo_value) * 29 + dplyr::n_distinct(covid_case_death_rates$geo_value) * 29 ) expect_false(dplyr::is.grouped_df(test)) @@ -19,25 +19,25 @@ test_that("return expected number of rows and returned dataset is ungrouped", { test_that("expect insufficient training data error", { - r <- epi_recipe(case_death_rate_subset) %>% + r <- epi_recipe(covid_case_death_rates) %>% step_epi_ahead(death_rate, ahead = 7) %>% step_epi_lag(death_rate, lag = c(0, 367)) %>% step_naomit(all_predictors()) %>% step_naomit(all_outcomes(), skip = TRUE) - expect_snapshot(error = TRUE, get_test_data(recipe = r, x = case_death_rate_subset)) + expect_snapshot(error = TRUE, get_test_data(recipe = r, x = covid_case_death_rates)) }) test_that("expect error that geo_value or time_value does not exist", { - r <- epi_recipe(case_death_rate_subset) %>% + r <- epi_recipe(covid_case_death_rates) %>% step_epi_ahead(death_rate, ahead = 7) %>% step_epi_lag(death_rate, lag = c(0, 7, 14)) %>% step_epi_lag(case_rate, lag = c(0, 7, 14)) %>% step_naomit(all_predictors()) %>% step_naomit(all_outcomes(), skip = TRUE) - wrong_epi_df <- case_death_rate_subset %>% dplyr::select(-geo_value) + wrong_epi_df <- covid_case_death_rates %>% dplyr::select(-geo_value) expect_snapshot(error = TRUE, get_test_data(recipe = r, x = wrong_epi_df)) }) @@ -139,7 +139,7 @@ test_that("Omit end rows according to minimum lag when that’s not lag 0", { # Ex. 
using real built-in data - ca <- case_death_rate_subset %>% + ca <- covid_case_death_rates %>% filter(geo_value == "ca") rec <- epi_recipe(ca) %>% diff --git a/tests/testthat/test-key_colnames.R b/tests/testthat/test-key_colnames.R index 3b3118740..d94daaec4 100644 --- a/tests/testthat/test-key_colnames.R +++ b/tests/testthat/test-key_colnames.R @@ -1,9 +1,9 @@ test_that("Extracts keys from a recipe; roles are NA, giving an empty vector", { - expect_equal(key_colnames(recipe(case_death_rate_subset)), character(0L)) + expect_equal(key_colnames(recipe(covid_case_death_rates)), character(0L)) }) test_that("key_colnames extracts time_value and geo_value, but not raw", { - my_recipe <- epi_recipe(case_death_rate_subset) %>% + my_recipe <- epi_recipe(covid_case_death_rates) %>% step_epi_ahead(death_rate, ahead = 7) %>% step_epi_lag(death_rate, lag = c(0, 7, 14)) %>% step_epi_lag(case_rate, lag = c(0, 7, 14)) %>% @@ -14,7 +14,7 @@ test_that("key_colnames extracts time_value and geo_value, but not raw", { my_workflow <- epi_workflow() %>% add_epi_recipe(my_recipe) %>% add_model(linear_reg()) %>% - fit(data = case_death_rate_subset) + fit(data = covid_case_death_rates) expect_identical(key_colnames(my_workflow), c("geo_value", "time_value")) }) diff --git a/tests/testthat/test-layer_add_forecast_date.R b/tests/testthat/test-layer_add_forecast_date.R index 491bf5e20..8bf452a81 100644 --- a/tests/testthat/test-layer_add_forecast_date.R +++ b/tests/testthat/test-layer_add_forecast_date.R @@ -1,4 +1,4 @@ -jhu <- case_death_rate_subset %>% +jhu <- covid_case_death_rates %>% dplyr::filter(time_value > "2021-11-01", geo_value %in% c("ak", "ca", "ny")) attributes(jhu)$metadata$as_of <- max(jhu$time_value) + 3 diff --git a/tests/testthat/test-layer_add_target_date.R b/tests/testthat/test-layer_add_target_date.R index 8bdb3a76b..7cd164960 100644 --- a/tests/testthat/test-layer_add_target_date.R +++ b/tests/testthat/test-layer_add_target_date.R @@ -1,4 +1,4 @@ -jhu <- 
case_death_rate_subset %>% +jhu <- covid_case_death_rates %>% dplyr::filter(time_value > "2021-11-01", geo_value %in% c("ak", "ca", "ny")) r <- epi_recipe(jhu) %>% step_epi_lag(death_rate, lag = c(0, 7, 14)) %>% diff --git a/tests/testthat/test-layer_naomit.R b/tests/testthat/test-layer_naomit.R index 1d5b4ee25..8eb597f41 100644 --- a/tests/testthat/test-layer_naomit.R +++ b/tests/testthat/test-layer_naomit.R @@ -1,4 +1,4 @@ -jhu <- case_death_rate_subset %>% +jhu <- covid_case_death_rates %>% dplyr::filter(time_value > "2021-11-01", geo_value %in% c("ak", "ca", "ny")) r <- epi_recipe(jhu) %>% diff --git a/tests/testthat/test-layer_predict.R b/tests/testthat/test-layer_predict.R index 041516b29..ae51a5ec6 100644 --- a/tests/testthat/test-layer_predict.R +++ b/tests/testthat/test-layer_predict.R @@ -1,4 +1,4 @@ -jhu <- case_death_rate_subset %>% +jhu <- covid_case_death_rates %>% dplyr::filter(time_value > "2021-11-01", geo_value %in% c("ak", "ca", "ny")) r <- epi_recipe(jhu) %>% step_epi_lag(death_rate, lag = c(0, 7, 14)) %>% diff --git a/tests/testthat/test-layer_residual_quantiles.R b/tests/testthat/test-layer_residual_quantiles.R index 09ef7c9d3..12e44809e 100644 --- a/tests/testthat/test-layer_residual_quantiles.R +++ b/tests/testthat/test-layer_residual_quantiles.R @@ -1,4 +1,4 @@ -jhu <- case_death_rate_subset %>% +jhu <- covid_case_death_rates %>% dplyr::filter(time_value > "2021-11-01", geo_value %in% c("ak", "ca", "ny")) r <- epi_recipe(jhu) %>% diff --git a/tests/testthat/test-layer_threshold_preds.R b/tests/testthat/test-layer_threshold_preds.R index 9df7e64ab..324f60a1b 100644 --- a/tests/testthat/test-layer_threshold_preds.R +++ b/tests/testthat/test-layer_threshold_preds.R @@ -1,4 +1,4 @@ -jhu <- case_death_rate_subset %>% +jhu <- covid_case_death_rates %>% dplyr::filter(time_value < "2021-03-08", geo_value %in% c("ak", "ca", "ar")) r <- epi_recipe(jhu) %>% step_epi_lag(death_rate, lag = c(0, 7, 14)) %>% diff --git 
a/tests/testthat/test-population_scaling.R b/tests/testthat/test-population_scaling.R index a1ccba4a1..1f356c0cc 100644 --- a/tests/testthat/test-population_scaling.R +++ b/tests/testthat/test-population_scaling.R @@ -7,7 +7,7 @@ test_that("Column names can be passed with and without the tidy way", { pop_data2 <- pop_data %>% dplyr::rename(geo_value = states) - newdata <- case_death_rate_subset %>% + newdata <- covid_case_death_rates %>% filter(geo_value %in% c("ak", "al", "ar", "as", "az", "ca")) r1 <- epi_recipe(newdata) %>% @@ -150,7 +150,7 @@ test_that("Postprocessing workflow works and values correct", { }) test_that("Postprocessing to get cases from case rate", { - jhu <- case_death_rate_subset %>% + jhu <- covid_case_death_rates %>% dplyr::filter(time_value > "2021-11-01", geo_value %in% c("ca", "ny")) %>% dplyr::select(geo_value, time_value, case_rate) @@ -193,7 +193,7 @@ test_that("Postprocessing to get cases from case rate", { test_that("test joining by default columns", { - jhu <- case_death_rate_subset %>% + jhu <- covid_case_death_rates %>% dplyr::filter(time_value > "2021-11-01", geo_value %in% c("ca", "ny")) %>% dplyr::select(geo_value, time_value, case_rate) @@ -237,7 +237,7 @@ test_that("test joining by default columns", { latest <- get_test_data( recipe = r, - x = case_death_rate_subset %>% + x = covid_case_death_rates %>% dplyr::filter( time_value > "2021-11-01", geo_value %in% c("ca", "ny") @@ -250,7 +250,7 @@ test_that("test joining by default columns", { - jhu <- case_death_rate_subset %>% + jhu <- covid_case_death_rates %>% dplyr::filter(time_value > "2021-11-01", geo_value %in% c("ca", "ny")) %>% dplyr::select(geo_value, time_value, case_rate) @@ -305,7 +305,7 @@ test_that("test joining by default columns", { test_that("expect error if `by` selector does not match", { - jhu <- case_death_rate_subset %>% + jhu <- covid_case_death_rates %>% dplyr::filter(time_value > "2021-11-01", geo_value %in% c("ca", "ny")) %>% dplyr::select(geo_value, 
time_value, case_rate) diff --git a/tests/testthat/test-snapshots.R b/tests/testthat/test-snapshots.R index 956580e9f..3aecfd6b8 100644 --- a/tests/testthat/test-snapshots.R +++ b/tests/testthat/test-snapshots.R @@ -110,7 +110,7 @@ test_that("arx_forecaster snapshots", { }) test_that("arx_forecaster output format snapshots", { - jhu <- case_death_rate_subset %>% + jhu <- covid_case_death_rates %>% dplyr::filter(time_value >= as.Date("2021-12-01")) attributes(jhu)$metadata$as_of <- as.Date(attributes(jhu)$metadata$as_of) out1 <- arx_forecaster( @@ -147,15 +147,15 @@ test_that("arx_forecaster output format snapshots", { test_that("arx_classifier snapshots", { arc1 <- arx_classifier( - case_death_rate_subset %>% + covid_case_death_rates %>% dplyr::filter(time_value >= as.Date("2021-11-01")), "death_rate", c("case_rate", "death_rate") ) expect_snapshot_tibble(arc1$predictions) - max_date <- case_death_rate_subset$time_value %>% max() + max_date <- covid_case_death_rates$time_value %>% max() arc2 <- arx_classifier( - case_death_rate_subset %>% + covid_case_death_rates %>% dplyr::filter(time_value >= as.Date("2021-11-01")), "death_rate", c("case_rate", "death_rate"), @@ -164,7 +164,7 @@ test_that("arx_classifier snapshots", { expect_snapshot_tibble(arc2$predictions) expect_error( arc3 <- arx_classifier( - case_death_rate_subset %>% + covid_case_death_rates %>% dplyr::filter(time_value >= as.Date("2021-11-01")), "death_rate", c("case_rate", "death_rate"), @@ -174,7 +174,7 @@ test_that("arx_classifier snapshots", { ) expect_error( arc4 <- arx_classifier( - case_death_rate_subset %>% + covid_case_death_rates %>% dplyr::filter(time_value >= as.Date("2021-11-01")), "death_rate", c("case_rate", "death_rate"), diff --git a/tests/testthat/test-step_adjust_latency.R b/tests/testthat/test-step_adjust_latency.R index 0c292ed6f..7b1f320e4 100644 --- a/tests/testthat/test-step_adjust_latency.R +++ b/tests/testthat/test-step_adjust_latency.R @@ -508,12 +508,12 @@ test_that("printing 
step_adjust_latency results in expected output", { step_epi_ahead(death_rate, ahead = ahead) expect_snapshot(r5) expect_snapshot(prep(r5, real_x)) - r6 <- epi_recipe(case_death_rate_subset) %>% + r6 <- epi_recipe(covid_case_death_rates) %>% step_epi_lag(death_rate, lag = c(0, 7, 14)) %>% step_adjust_latency(method = "extend_ahead") %>% step_epi_ahead(death_rate, ahead = 7) expect_snapshot(r6) - expect_snapshot(prep(r6, case_death_rate_subset)) + expect_snapshot(prep(r6, covid_case_death_rates)) }) test_that("locf works as intended", { diff --git a/tests/testthat/test-utils_latency.R b/tests/testthat/test-utils_latency.R index 7bf808835..2ac32fc9f 100644 --- a/tests/testthat/test-utils_latency.R +++ b/tests/testthat/test-utils_latency.R @@ -8,7 +8,10 @@ old_data <- tibble( tmp_death_rate = atan(0.1 * 1:200) + cos(5 * 1:200) + 1 ) %>% # place2 is slightly more recent than place1 - mutate(time_value = as.Date(ifelse(geo_value == "place2", time_value + 1, time_value))) %>% + mutate(time_value = case_when( + geo_value == "place2" ~ time_value + 1, + TRUE ~ time_value + )) %>% as_epi_df(as_of = as_of) old_data keys <- c("time_value", "geo_value") diff --git a/vignettes/articles/all_states_covidcast_signals.rds b/vignettes/articles/all_states_covidcast_signals.rds deleted file mode 100644 index e4ad60153..000000000 Binary files a/vignettes/articles/all_states_covidcast_signals.rds and /dev/null differ diff --git a/vignettes/articles/case_death_rate_archive.rds b/vignettes/articles/case_death_rate_archive.rds deleted file mode 100644 index b5209fb1d..000000000 Binary files a/vignettes/articles/case_death_rate_archive.rds and /dev/null differ diff --git a/vignettes/articles/smooth-qr.Rmd b/vignettes/articles/smooth-qr.Rmd index b93c726f6..801934e8f 100644 --- a/vignettes/articles/smooth-qr.Rmd +++ b/vignettes/articles/smooth-qr.Rmd @@ -97,7 +97,7 @@ state cases and deaths. This sample data ranges from Dec. 31, 2020 to Dec. 31, 2021. 
```{r} -edf <- case_death_rate_subset +edf <- covid_case_death_rates ``` We will set the forecast date to be November 30, 2021 so that we can produce diff --git a/vignettes/articles/symptom-surveys.Rmd b/vignettes/articles/symptom-surveys.Rmd index 1e51a9963..af692726e 100644 --- a/vignettes/articles/symptom-surveys.Rmd +++ b/vignettes/articles/symptom-surveys.Rmd @@ -145,22 +145,39 @@ own forecaster under the `epipredict` framework, we could easily add steps to re-scale and transform the signals to our `epi_recipe`. This would make the code more succinct and self-contained. +We will compare two CLI-in-community indicators from +different sources. The data are available in the [`epidatasets` package](https://cmu-delphi.github.io/epidatasets/), +and can be loaded with: + ```{r, message = FALSE, warning = FALSE} -library(epidatr) library(dplyr) library(purrr) library(epipredict) library(recipes) +z <- epidatasets::county_smoothed_cli_comparison +``` + +The data can also be constructed from the Delphi API with the following code: + +```{r, message = FALSE, warning = FALSE, eval = FALSE} +library(epidatr) + +d <- "2020-09-21" + case_num <- 200 -as_of_date <- "2020-05-14" -geo_values <- pub_covidcast( +geos_date <- "2020-05-14" + +# Find counties that on 2020-05-14 had >= 200 cases reported. +# For later datasets, we will only keep data for these geos. 
+geo_values_initial <- pub_covidcast( source = "jhu-csse", signals = "confirmed_cumulative_num", geo_type = "county", time_type = "day", geo_values = "*", - time_values = epirange(20200514, 20200514) + time_values = epirange(geos_date, geos_date), + as_of = d ) %>% filter(value >= case_num) %>% pull(geo_value) %>% @@ -177,9 +194,10 @@ goog_sm_cli <- pub_covidcast( geo_type = "county", time_type = "day", geo_values = "*", - time_values = epirange(start_day, end_day) + time_values = epirange(start_day, end_day), + as_of = d ) %>% - filter(geo_value %in% geo_values) %>% + filter(geo_value %in% geo_values_initial) %>% select(geo_value, time_value, value) %>% rename(goog = value) @@ -189,9 +207,10 @@ fb_survey <- pub_covidcast( geo_type = "county", time_type = "day", geo_values = "*", - time_values = epirange(start_day, end_day) + time_values = epirange(start_day, end_day), + as_of = d ) %>% - filter(geo_value %in% geo_values) %>% + filter(geo_value %in% geo_values_initial) %>% select(geo_value, time_value, value) %>% rename(fb = value) @@ -201,26 +220,31 @@ jhu_7dav_incid <- pub_covidcast( geo_type = "county", time_type = "day", geo_values = "*", - time_values = epirange(start_day, end_day) + time_values = epirange(start_day, end_day), + as_of = d ) %>% - filter(geo_value %in% geo_values) %>% + filter(geo_value %in% geo_values_initial) %>% select(geo_value, time_value, value) %>% rename(case = value) -# Find "complete" counties, present in all three data signals at all times +# Find "complete" counties, present in all three data signals, and also +# present in the `geo_values_initial` object. 
geo_values_complete <- intersect( intersect(goog_sm_cli$geo_value, fb_survey$geo_value), jhu_7dav_incid$geo_value ) -# Make one big matrix by joining these three data frames -z <- full_join(full_join(goog_sm_cli, fb_survey, by = c("geo_value", "time_value")), +# Join the three data frames together +z <- full_join( + full_join(goog_sm_cli, fb_survey, by = c("geo_value", "time_value")), jhu_7dav_incid, by = c("geo_value", "time_value") ) %>% filter(geo_value %in% geo_values_complete) %>% - as_epi_df() + as_epi_df(as_of = d) +``` +```{r, message = FALSE, warning = FALSE} Logit <- function(x, a = 0.01) log((x + a) / (1 - x + a)) Sigmd <- function(y, a = 0.01) (exp(y) * (1 + a) - a) / (1 + exp(y)) diff --git a/vignettes/arx-classifier.Rmd b/vignettes/arx-classifier.Rmd index 3813e7d13..1e2a6949a 100644 --- a/vignettes/arx-classifier.Rmd +++ b/vignettes/arx-classifier.Rmd @@ -29,14 +29,14 @@ or ahead value. To get a sense of how the `arx_classifier()` works, let's consider a simple example with minimal inputs. For this, we will use the built-in -`case_death_rate_subset` that contains confirmed COVID-19 cases and deaths from +`covid_case_death_rates` that contains confirmed COVID-19 cases and deaths from JHU CSSE for all states over Dec 31, 2020 to Dec 31, 2021. From this, we'll take a subset of data for five states over June 4, 2021 to December 31, 2021. Our objective is to predict whether the case rates are increasing when considering the 0, 7 and 14 day case rates: ```{r} -jhu <- case_death_rate_subset %>% +jhu <- covid_case_death_rates %>% filter( time_value >= "2021-06-04", time_value <= "2021-12-31", @@ -163,7 +163,7 @@ $$ g_{\text{up}}(x) = \log\left ( \frac{\Pr(Z_{l, t} = \text{up} \vert x)}{\Pr(Z_{l, t} = \text{not up} \vert x)} \right ) = \beta_{10} + \beta_{11}Y_{l,t}^\Delta + \beta_{12}Y_{l,t-7}^\Delta + \beta_{13}Y_{l,t-14}^\Delta. 
$$ -Now then, we will operate on the same subset of the `case_death_rate_subset` +Now then, we will operate on the same subset of the `covid_case_death_rates` that we used in our above example. This time, we will use it to investigate whether the number of newly reported cases over the past 7 days has increased by at least 25% compared to the preceding week for our sample of states. diff --git a/vignettes/epipredict.Rmd b/vignettes/epipredict.Rmd index 32e0d7d16..ae4f7671a 100644 --- a/vignettes/epipredict.Rmd +++ b/vignettes/epipredict.Rmd @@ -85,7 +85,7 @@ package. There is much more to see there, but for the moment, it's enough to look at a simple one: ```{r epidf} -jhu <- case_death_rate_subset +jhu <- covid_case_death_rates jhu ``` diff --git a/vignettes/panel-data.Rmd b/vignettes/panel-data.Rmd index 79186a6c2..1faf5b56f 100644 --- a/vignettes/panel-data.Rmd +++ b/vignettes/panel-data.Rmd @@ -18,6 +18,7 @@ library(parsnip) library(recipes) library(epiprocess) library(epipredict) +library(epidatasets) library(ggplot2) theme_set(theme_bw()) ``` @@ -25,13 +26,13 @@ theme_set(theme_bw()) [Panel data](https://en.wikipedia.org/wiki/Panel_data), or longitudinal data, contain cross-sectional measurements of subjects over time. The `epipredict` package is most suitable for running forecasters on epidemiological panel data. 
-A built-in example of this is the [`case_death_rate_subset`]( - https://cmu-delphi.github.io/epipredict/reference/case_death_rate_subset.html) -dataset, which contains daily state-wise measures of `case_rate` and +An example of this is the [`covid_case_death_rates`]( + https://cmu-delphi.github.io/epidatasets/reference/covid_case_death_rates.html) +dataset, which contains daily state-wise measures of `case_rate` and `death_rate` for COVID-19 in 2021: ```{r epi-panel-ex, include=T} -head(case_death_rate_subset, 3) +head(covid_case_death_rates, 3) ``` `epipredict` functions work with data in @@ -41,7 +42,6 @@ are also valid candidates for `epipredict` functionality, as long as they are in `epi_df` format. ```{r employ-stats, include=F} -data("grad_employ_subset") year_start <- min(grad_employ_subset$time_value) year_end <- max(grad_employ_subset$time_value) ``` diff --git a/vignettes/preprocessing-and-models.Rmd b/vignettes/preprocessing-and-models.Rmd index 987ecdef4..8d1d2f19f 100644 --- a/vignettes/preprocessing-and-models.Rmd +++ b/vignettes/preprocessing-and-models.Rmd @@ -38,7 +38,6 @@ will create a classification model for hotspot predictions. ```{r, warning=FALSE, message=FALSE} library(tidyr) library(dplyr) -library(epidatr) library(epipredict) library(recipes) library(workflows) @@ -59,14 +58,28 @@ Although there are many state-of-the-art models, we choose to use Poisson regression, the textbook example for modeling count data, as an illustration for using the `epipredict` package with other existing tidymodels packages. +The `counts_subset` dataset is available in the [`epidatasets` package](https://cmu-delphi.github.io/epidatasets/), +and contains the number of confirmed cases and deaths from June 4, 2021 to Dec +31, 2021 in some U.S. states. 
It can be loaded with: + ```{r poisson-reg-data} +x <- epidatasets::counts_subset +``` + +The data can also be fetched from the Delphi API with the following query: +```{r, eval = FALSE} +library(epidatr) + +d <- as.Date("2024-03-20") + x <- pub_covidcast( source = "jhu-csse", signals = "confirmed_incidence_num", time_type = "day", geo_type = "state", time_values = epirange(20210604, 20211231), - geo_values = "ca,fl,tx,ny,nj" + geo_values = "ca,fl,tx,ny,nj", + as_of = d ) %>% select(geo_value, time_value, cases = value) @@ -76,18 +89,15 @@ y <- pub_covidcast( time_type = "day", geo_type = "state", time_values = epirange(20210604, 20211231), - geo_values = "ca,fl,tx,ny,nj" + geo_values = "ca,fl,tx,ny,nj", + as_of = d ) %>% select(geo_value, time_value, deaths = value) -counts_subset <- full_join(x, y, by = c("geo_value", "time_value")) %>% - as_epi_df() +x <- full_join(x, y, by = c("geo_value", "time_value")) %>% + as_epi_df(as_of = d) ``` -The `counts_subset` dataset comes from the `epidatr` package, and -contains the number of confirmed cases and deaths from June 4, 2021 to -Dec 31, 2021 in some U.S. states. - We wish to predict the 7-day ahead death counts with lagged cases and deaths. Furthermore, we will let each state be a dummy variable. Using differential intercept coefficients, we can allow for an intercept shift between states. @@ -242,17 +252,31 @@ most or all of the time while in public in the past 7 days and the estimated percentage of respondents who reported that all or most people they encountered in public in the past 7 days maintained a distance of at least 6 feet. -State-wise population data from the 2019 U.S. Census is included in this package -and will be used in `layer_population_scaling()`. +State-wise population data from the 2019 U.S. Census will be used in +`layer_population_scaling()`. 
+ +Both datasets are available in the [`epidatasets` package](https://cmu-delphi.github.io/epidatasets/), +and can be loaded with: ```{r} +behav_ind <- epidatasets::ctis_covid_behaviours +pop_dat <- epidatasets::state_census %>% select(abbr, pop) +``` + +The data can also be fetched from the Delphi API with the following query: +```{r, eval = FALSE} +library(epidatr) + +d <- as.Date("2024-03-20") + behav_ind_mask <- pub_covidcast( source = "fb-survey", signals = "smoothed_wwearing_mask_7d", time_type = "day", geo_type = "state", time_values = epirange(20210604, 20211231), - geo_values = "ca,fl,tx,ny,nj" + geo_values = "ca,fl,tx,ny,nj", + as_of = d ) %>% select(geo_value, time_value, masking = value) @@ -262,14 +286,16 @@ behav_ind_distancing <- pub_covidcast( time_type = "day", geo_type = "state", time_values = epirange(20210604, 20211231), - geo_values = "ca,fl,tx,ny,nj" + geo_values = "ca,fl,tx,ny,nj", + as_of = d ) %>% select(geo_value, time_value, distancing = value) -pop_dat <- state_census %>% select(abbr, pop) - behav_ind <- behav_ind_mask %>% - full_join(behav_ind_distancing, by = c("geo_value", "time_value")) + full_join(behav_ind_distancing, by = c("geo_value", "time_value")) %>% + as_epi_df(as_of = d) + +pop_dat <- state_census %>% select(abbr, pop) ``` Rather than using raw mask-wearing / social-distancing metrics, for the sake @@ -290,11 +316,11 @@ behav_ind %>% ``` We will take a subset of death rate and case rate data from the built-in dataset -`case_death_rate_subset`. +`covid_case_death_rates`. ```{r} jhu <- filter( - case_death_rate_subset, + covid_case_death_rates, time_value >= "2021-06-04", time_value <= "2021-12-31", geo_value %in% c("ca", "fl", "tx", "ny", "nj") @@ -440,10 +466,10 @@ g_{\text{up}}(x) &= \log\left(\frac{Pr(Z_{\ell,t}=\text{up}\mid x)}{Pr(Z_{\ell,t Preprocessing steps are similar to the previous models with an additional step of categorizing the response variables. 
Again, we will use a subset of death rate and case rate data from our built-in dataset -`case_death_rate_subset`. +`covid_case_death_rates`. ```{r} -jhu <- case_death_rate_subset %>% +jhu <- covid_case_death_rates %>% dplyr::filter( time_value >= "2021-06-04", time_value <= "2021-12-31", @@ -512,7 +538,7 @@ Let's start with a simple dataset and preprocessing: ```{r} ex <- filter( - case_death_rate_subset, + covid_case_death_rates, time_value >= "2021-12-01", time_value <= "2021-12-31", geo_value == "ca" diff --git a/vignettes/update.Rmd b/vignettes/update.Rmd index 6e9e8745c..3d97da049 100644 --- a/vignettes/update.Rmd +++ b/vignettes/update.Rmd @@ -61,13 +61,13 @@ vignette and only briefly go through some examples for a `frosting` object. ## Add/update/remove an `epi_recipe` in an `epi_workflow` -We start with the built-in `case_death_rate_subset` dataset that contains JHU +We start with the built-in `covid_case_death_rates` dataset that contains JHU daily COVID-19 cases and deaths by state and take a subset of it from Nov. 1, 2021 to Dec. 31, 2021 for the four states of Alaska, California, New York, and South Carolina. ```{r} -jhu <- case_death_rate_subset %>% +jhu <- covid_case_death_rates %>% dplyr::filter(time_value >= as.Date("2021-11-01"), geo_value %in% c("ak", "ca", "ny", "sc")) jhu