Skip to content

133 fix residuals #134

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Sep 15, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -21,13 +21,13 @@ URL: https://github.com/cmu-delphi/epipredict/,
https://cmu-delphi.github.io/epipredict
BugReports: https://github.com/cmu-delphi/epipredict/issues/
Depends:
R (>= 3.5.0)
R (>= 3.5.0),
epiprocess
Imports:
assertthat,
cli,
distributional,
dplyr,
epiprocess,
fs,
generics,
glue,
Expand Down
1 change: 0 additions & 1 deletion NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,5 @@ importFrom(stats,predict)
importFrom(stats,qnorm)
importFrom(stats,quantile)
importFrom(stats,residuals)
importFrom(stats,setNames)
importFrom(tibble,is_tibble)
importFrom(tibble,tibble)
10 changes: 5 additions & 5 deletions R/arx_forecaster.R
Original file line number Diff line number Diff line change
Expand Up @@ -27,10 +27,10 @@
#' out <- arx_forecaster(jhu, "death_rate",
#' c("case_rate", "death_rate"))
arx_forecaster <- function(epi_data,
outcome,
predictors,
trainer = parsnip::linear_reg(),
args_list = arx_args_list()) {
outcome,
predictors,
trainer = parsnip::linear_reg(),
args_list = arx_args_list()) {

validate_forecaster_inputs(epi_data, outcome, predictors)
if (!is.list(trainer) || trainer$mode != "regression")
Expand Down Expand Up @@ -60,7 +60,7 @@ arx_forecaster <- function(epi_data,
layer_add_target_date(target_date = target_date)
if (args_list$nonneg) f <- layer_threshold(f, dplyr::starts_with(".pred"))

latest <- get_test_data(r, epi_data)
latest <- get_test_data(r, epi_data, TRUE)

wf <- epi_workflow(r, trainer, f) %>% generics::fit(epi_data)
list(
Expand Down
57 changes: 45 additions & 12 deletions R/get_test_data.R
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,25 @@
#' and other variables in the original dataset,
#' which will be used to create test data.
#'
#' It also optionally fills missing values
#' using the last-observation-carried-forward (LOCF) method. If this
#' is not possible (say because there would be only `NA`'s in some location),
#' it will produce an error suggesting alternative options to handle missing
#' values with more advanced techniques.
#'
#' @param recipe A recipe object. The step will be added to the
#' sequence of operations for this recipe.
#' @param x A data frame, tibble, or epi_df data set.
#' @param fill_locf Logical. Should we use `locf` to fill in missing data?
#' @param n_recent Integer or NULL. If filling missing data with `locf=TRUE`,
#' how far back are we willing to tolerate missing data? Larger values allow
#' more filling. The default `NULL` will determine this from the maximum
#' lags used in the `recipe`. For example, suppose n_recent = 3, then if the
#' 3 most recent observations in some region are all `NA`’s, we won’t be able
#' to fill anything, and an error message will be thrown.
#'
#' @return A tibble with columns `geo_value`, `time_value`
#' and other variables in the original dataset.
#' @return A tibble with columns `geo_value`, `time_value`, any additional
#' keys, as well other variables in the original dataset.
#' @examples
#' # create recipe
#' rec <- epi_recipe(case_death_rate_subset) %>%
Expand All @@ -21,11 +34,17 @@
#' get_test_data(recipe = rec, x = case_death_rate_subset)
#' @export

get_test_data <- function(recipe, x) {
get_test_data <- function(recipe, x, fill_locf = FALSE, n_recent = NULL) {
stopifnot(is.data.frame(x))
if (! all(colnames(x) %in% colnames(recipe$template)))
arg_is_lgl(fill_locf)
arg_is_scalar(fill_locf)
arg_is_pos_int(n_recent, allow_null = TRUE)
arg_is_scalar(n_recent, allow_null = TRUE)

if (!all(colnames(x) %in% colnames(recipe$template)))
cli_stop("some variables used for training are not available in `x`.")
max_lags <- max(map_dbl(recipe$steps, ~ max(.x$lag %||% 0)), 0)
if (is.null(n_recent)) n_recent <- max_lags + 1

# CHECK: Return NA if insufficient training data
if (dplyr::n_distinct(x$time_value) < max_lags) {
Expand All @@ -36,15 +55,29 @@ get_test_data <- function(recipe, x) {
groups <- epi_keys(recipe)
groups <- groups[groups != "time_value"]

x %>%
dplyr::filter(
dplyr::if_any(
.cols = recipe$term_info$variable[which(recipe$var_info$role == 'raw')],
.fns = ~ !is.na(.x)
)
) %>%
x <- x %>%
epiprocess::group_by(dplyr::across(dplyr::all_of(groups))) %>%
dplyr::slice_tail(n = max(n_recent, max_lags + 1))

if (fill_locf) {
cannot_be_used <- x %>%
dplyr::slice_tail(n = n_recent) %>%
dplyr::summarize(dplyr::across(
!time_value, ~ !is.na(.x[1])), .groups = "drop") %>%
dplyr::summarise(dplyr::across(-dplyr::all_of(groups), ~ any(!.x))) %>%
unlist()
if (any(cannot_be_used)) {
bad_vars <- names(cannot_be_used)[cannot_be_used]
cli_stop("The variables {bad_vars} have ",
"too many recent missing values to be filled automatically. ",
"You should either choose `n_recent` larger than its current ",
"value {n_recent}, or perform NA imputation manually, perhaps with ",
"{.code recipes::step_impute_*()} or with {.code tidyr::fill()}.")
}
x <- x %>% tidyr::fill(!time_value)
}

x %>%
dplyr::slice_tail(n = max_lags + 1) %>%
epiprocess::ungroup()

}
4 changes: 2 additions & 2 deletions R/layer_residual_quantiles.R
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
#' @param probs numeric vector of probabilities with values in (0,1)
#' referring to the desired quantile.
#' @param symmetrize logical. If `TRUE` then interval will be symmetric.
#' @param by_key A character vector of keys to group the residuls by before
#' @param by_key A character vector of keys to group the residuals by before
#' calculating quantiles. The default, `c()` performs no grouping.
#' @param name character. The name for the output column.
#' @param .flag a logical to determine if the layer is added. Passed on to
Expand Down Expand Up @@ -45,7 +45,7 @@
#'
#' p2 <- predict(wf2, latest)
layer_residual_quantiles <- function(frosting, ...,
probs = c(0.0275, 0.975),
probs = c(0.05, 0.95),
symmetrize = TRUE,
by_key = character(0L),
name = ".pred_distn",
Expand Down
2 changes: 1 addition & 1 deletion R/utils_arg.R
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ arg_is_pos_int = function(..., allow_null = FALSE) {
...,
tests = function(name, value) {
if (!((is.numeric(value) && all(value > 0) && all(value%%1 == 0)) |
(is.null(value) & !allow_null)))
(is.null(value) & allow_null)))
cli_stop("All {.val {name}} must be whole positive number(s).")
}
)
Expand Down
11 changes: 6 additions & 5 deletions man/bake.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 3 additions & 2 deletions man/epi_juice.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion man/epi_workflow.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 3 additions & 2 deletions man/flatline.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

22 changes: 19 additions & 3 deletions man/get_test_data.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 2 additions & 2 deletions man/layer_residual_quantiles.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

40 changes: 39 additions & 1 deletion tests/testthat/test-get_test_data.R
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ test_that("return expected number of rows and returned dataset is ungrouped", {
test <- get_test_data(recipe = r, x = case_death_rate_subset)

expect_equal(nrow(test),
dplyr::n_distinct(case_death_rate_subset$geo_value)* 29)
dplyr::n_distinct(case_death_rate_subset$geo_value) * 29)

expect_false(dplyr::is.grouped_df(test))
})
Expand Down Expand Up @@ -39,3 +39,41 @@ test_that("expect error that geo_value or time_value does not exist", {
expect_error(get_test_data(recipe = r, x = wrong_epi_df))
})


test_that("NA fill behaves as desired", {
df <- tibble::tibble(
geo_value = rep(c("ca", "ny"), each = 10),
time_value = rep(1:10, times = 2),
x1 = rnorm(20),
x2 = rnorm(20)) %>%
epiprocess::as_epi_df()

r <- epi_recipe(df) %>%
step_epi_ahead(x1, ahead = 3) %>%
step_epi_lag(x1, x2, lag = c(1,3)) %>%
step_epi_naomit()

expect_silent(tt <- get_test_data(r, df))
expect_s3_class(tt, "epi_df")

expect_error(get_test_data(r, df, "A"))
expect_error(get_test_data(r, df, TRUE, -3))

df2 <- df
df2$x1[df2$geo_value == "ca"] <- NA

td <- get_test_data(r, df2)
expect_true(any(is.na(td)))
expect_error(get_test_data(r, df2, TRUE))

df1 <- df2
df1$x1[1:4] <- 1:4
td1 <- get_test_data(r, df1, TRUE, n_recent = 7)
expect_true(!any(is.na(td1)))

df2$x1[7:8] <- 1:2
td2 <- get_test_data(r, df2, TRUE)
expect_true(!any(is.na(td2)))


})