diff --git a/DESCRIPTION b/DESCRIPTION index 42738f5ee..a1d79e7a1 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -37,6 +37,7 @@ Imports: quantreg, recipes (>= 1.0.4), rlang, + smoothqr, stats, tibble, tidyr, @@ -61,7 +62,8 @@ VignetteBuilder: knitr Remotes: cmu-delphi/epidatr, - cmu-delphi/epiprocess@dev + cmu-delphi/epiprocess@dev, + dajmcdon/smoothqr Config/testthat/edition: 3 Encoding: UTF-8 LazyData: true diff --git a/NAMESPACE b/NAMESPACE index 1dda742a8..7da9f84a2 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -33,6 +33,8 @@ S3method(extrapolate_quantiles,dist_default) S3method(extrapolate_quantiles,dist_quantiles) S3method(extrapolate_quantiles,distribution) S3method(format,dist_quantiles) +S3method(is.na,dist_quantiles) +S3method(is.na,distribution) S3method(mean,dist_quantiles) S3method(median,dist_quantiles) S3method(predict,epi_workflow) @@ -61,6 +63,7 @@ S3method(print,layer_predictive_distn) S3method(print,layer_quantile_distn) S3method(print,layer_residual_quantiles) S3method(print,layer_threshold) +S3method(print,layer_unnest) S3method(print,step_epi_ahead) S3method(print,step_epi_lag) S3method(print,step_growth_rate) @@ -81,6 +84,7 @@ S3method(slather,layer_predictive_distn) S3method(slather,layer_quantile_distn) S3method(slather,layer_residual_quantiles) S3method(slather,layer_threshold) +S3method(slather,layer_unnest) S3method(snap,default) S3method(snap,dist_default) S3method(snap,dist_quantiles) @@ -132,6 +136,7 @@ export(layer_predictive_distn) export(layer_quantile_distn) export(layer_residual_quantiles) export(layer_threshold) +export(layer_unnest) export(nested_quantiles) export(new_default_epi_recipe_blueprint) export(new_epi_recipe_blueprint) @@ -140,6 +145,7 @@ export(prep) export(quantile_reg) export(remove_frosting) export(slather) +export(smooth_quantile_reg) export(step_epi_ahead) export(step_epi_lag) export(step_epi_naomit) diff --git a/R/create-layer.R b/R/create-layer.R index d26402c18..6e30dc606 100644 --- a/R/create-layer.R +++ b/R/create-layer.R @@ -20,6 +20,14 @@ #' create_layer <- function(name = NULL, open = rlang::is_interactive()) { name <- name %||% usethis:::get_active_r_file(path = "R") + if (substr(name, 1, 5) == "layer") { + nn <- substring(name, 6) + if (substr(nn, 1, 1) == "_") nn <- substring(nn, 2) + cli::cli_abort( + c('`name` should not begin with "layer" or "layer_".', + i = 'Did you mean to use `create_layer("{ nn }")`?') + ) + } layer_name <- name name <- paste0("layer_", name) name <- usethis:::slug(name, "R") diff --git a/R/dist_quantiles.R b/R/dist_quantiles.R index 833f30c1c..62bc1dc95 100644 --- a/R/dist_quantiles.R +++ b/R/dist_quantiles.R @@ -360,3 +360,15 @@ Ops.dist_quantiles <- function(e1, e2) { new_quantiles(q = q, tau = tau) } +#' @method is.na distribution +#' @export +is.na.distribution <- function(x) { + sapply(vctrs::vec_data(x), is.na) +} + +#' @method is.na dist_quantiles +#' @export +is.na.dist_quantiles <- function(x) { + q <- field(x, "q") + all(is.na(q)) +} diff --git a/R/layer_unnest.R b/R/layer_unnest.R new file mode 100644 index 000000000..ef82e1308 --- /dev/null +++ b/R/layer_unnest.R @@ -0,0 +1,46 @@ +#' Unnest prediction list-cols +#' +#' @param frosting a `frosting` postprocessor +#' @param ... <[`tidy-select`][dplyr::dplyr_tidy_select]> One or more unquoted +#' expressions separated by commas. Variable names can be used as if they +#' were positions in the data frame, so expressions like `x:y` can +#' be used to select a range of variables. +#' @param id a random id string +#' +#' @return an updated `frosting` postprocessor +#' @export +layer_unnest <- function(frosting, ..., id = rand_id("unnest")) { + arg_is_chr_scalar(id) + + add_layer( + frosting, + layer_unnest_new( + terms = dplyr::enquos(...), + id = id + ) + ) +} + +layer_unnest_new <- function(terms, id) { + layer("unnest", terms = terms, id = id) +} + +#' @export +slather.layer_unnest <- + function(object, components, the_fit, the_recipe, ...) { + exprs <- rlang::expr(c(!!!object$terms)) + pos <- tidyselect::eval_select(exprs, components$predictions) + col_names <- names(pos) + components$predictions <- components$predictions %>% + tidyr::unnest(col_names) + + components + } + +#' @export +print.layer_unnest <- function( + x, width = max(20, options()$width - 30), ...) { + + title <- "Unnesting prediction list-cols" + print_layer(x$terms, title = title, width = width) +} diff --git a/R/make_smooth_quantile_reg.R b/R/make_smooth_quantile_reg.R new file mode 100644 index 000000000..9ada3b1dc --- /dev/null +++ b/R/make_smooth_quantile_reg.R @@ -0,0 +1,201 @@ + +#' Smooth quantile regression +#' +#' @description +#' `smooth_quantile_reg()` generates a quantile regression model _specification_ for +#' the [tidymodels](https://www.tidymodels.org/) framework. Currently, the +#' only supported engine is [smoothqr::smooth_qr()]. +#' +#' @param mode A single character string for the type of model. +#' The only possible value for this model is "regression". +#' @param engine Character string naming the fitting function. Currently, only +#' "smooth_qr" is supported. +#' @param tau A scalar or vector of values in (0, 1) to determine which +#' quantiles to estimate (default is 0.5). +#' @param outcome_locations Defaults to the vector `1:ncol(y)` but if the +#' responses are observed at a different spacing (or appear in a different +#' order), that information should be used here. This +#' argument will be mapped to the `ahead` argument of [smoothqr::smooth_qr()]. +#' @param degree the number of polynomials used for response smoothing. Must +#' be no more than the number of responses. +#' @export +#' +#' @seealso [fit.model_spec()], [set_engine()] +#' +#' @importFrom quantreg rq +#' @examples +#' tib <- data.frame( +#' y1 = rnorm(100), y2 = rnorm(100), y3 = rnorm(100), +#' y4 = rnorm(100), y5 = rnorm(100), y6 = rnorm(100), +#' x1 = rnorm(100), x2 = rnorm(100)) +#' qr_spec <- smooth_quantile_reg(tau = c(.2, .5, .8), outcome_locations = 1:6) +#' ff <- qr_spec %>% fit(cbind(y1, y2 , y3 , y4 , y5 , y6) ~ ., data = tib) +#' p <- predict(ff, new_data = tib) +#' +#' x <- -99:99 / 100 * 2 * pi +#' y <- sin(x) + rnorm(length(x), sd = .1) +#' fd <- x[length(x) - 20] +#' XY <- smoothqr::lagmat(y[1:(length(y) - 20)], c(-20:20)) +#' XY <- tibble::as_tibble(XY) +#' qr_spec <- smooth_quantile_reg(tau = c(.2, .5, .8), outcome_locations = 20:1) +#' tt <- qr_spec %>% fit_xy(x = XY[,21:41], y = XY[,1:20]) +#' +#' library(tidyr) +#' library(dplyr) +#' pl <- predict( +#' object = tt, +#' new_data = XY[max(which(complete.cases(XY[,21:41]))), 21:41] +#' ) +#' pl <- pl %>% +#' unnest(.pred) %>% +#' mutate(distn = nested_quantiles(distn)) %>% +#' unnest(distn) %>% +#' mutate(x = x[length(x) - 20] + ahead / 100 * 2 * pi, +#' ahead = NULL) %>% +#' pivot_wider(names_from = tau, values_from = q) +#' plot(x, y, pch = 16, xlim = c(pi, 2 * pi), col = "lightgrey") +#' curve(sin(x), add = TRUE) +#' abline(v = fd, lty = 2) +#' lines(pl$x, pl$`0.2`, col = "blue") +#' lines(pl$x, pl$`0.8`, col = "blue") +#' lines(pl$x, pl$`0.5`, col = "red") +#' \dontrun{ +#' ggplot(data.frame(x = x, y = y), aes(x)) + +#' geom_ribbon(data = pl, aes(ymin = `0.2`, ymax = `0.8`), fill = "lightblue") + +#' geom_point(aes(y = y), colour = "grey") + # observed data +#' geom_function(fun = sin, colour = "black") + # truth +#' geom_vline(xintercept = fd, linetype = "dashed") + # end of training data +#' geom_line(data = pl, aes(y = `0.5`), colour = "red") + # median prediction +#' theme_bw() + +#' coord_cartesian(xlim = c(0, NA)) + +#' ylab("y") +#' } +smooth_quantile_reg <- function( + mode = "regression", + engine = "smoothqr", + outcome_locations = NULL, + tau = 0.5, + degree = 3L) { + + # Check for correct mode + if (mode != "regression") rlang::abort("`mode` must be 'regression'") + if (engine != "smoothqr") rlang::abort("`engine` must be 'smoothqr'") + + arg_is_probabilities(tau) + arg_is_pos_int(degree) + arg_is_scalar(degree) + arg_is_numeric(outcome_locations, allow_null = TRUE) + if (is.unsorted(tau)) { + rlang::warn("Sorting tau to increasing order.") + tau <- sort(tau) + } + + args <- list(tau = rlang::enquo(tau), degree = rlang::enquo(degree), + outcome_locations = rlang::enquo(outcome_locations)) + + # Save some empty slots for future parts of the specification + parsnip::new_model_spec( + "smooth_quantile_reg", + args = args, + eng_args = NULL, + mode = mode, + method = NULL, + engine = engine + ) +} + + +make_smooth_quantile_reg <- function() { + parsnip::set_new_model("smooth_quantile_reg") + parsnip::set_model_mode("smooth_quantile_reg", "regression") + parsnip::set_model_engine("smooth_quantile_reg", "regression", eng = "smoothqr") + parsnip::set_dependency( + "smooth_quantile_reg", + eng = "smoothqr", + pkg = "smoothqr", + mode = "regression" + ) + + parsnip::set_model_arg( + model = "smooth_quantile_reg", + eng = "smoothqr", + parsnip = "tau", + original = "tau", + func = list(pkg = "smoothqr", fun = "smooth_qr"), + has_submodel = FALSE + ) + parsnip::set_model_arg( + model = "smooth_quantile_reg", + eng = "smoothqr", + parsnip = "degree", + original = "degree", + func = list(pkg = "smoothqr", fun = "smooth_qr"), + has_submodel = FALSE + ) + parsnip::set_model_arg( + model = "smooth_quantile_reg", + eng = "smoothqr", + parsnip = "outcome_locations", + original = "aheads", + func = list(pkg = "smoothqr", fun = "smooth_qr"), + has_submodel = FALSE + ) + + parsnip::set_fit( + model = "smooth_quantile_reg", + eng = "smoothqr", + mode = "regression", + value = list( + interface = "data.frame", + protect = c("x", "y"), # prevent user from touching these + func = c(pkg = "smoothqr", fun = "smooth_qr"), + defaults = list(intercept = TRUE) + ) + ) + + parsnip::set_encoding( + model = "smooth_quantile_reg", + eng = "smoothqr", + mode = "regression", + options = list( + predictor_indicators = "traditional", # factor -> dummy conversion w/ baseline + compute_intercept = TRUE, # put an intercept into the design matrix + remove_intercept = TRUE, # but then remove it, we'll put it back in the function + allow_sparse_x = FALSE # quantgen::rq can't handle sparse x, unfortunately + ) + ) + + process_smooth_qr_preds <- function(x, object) { + object <- parsnip::extract_fit_engine(object) + list_of_pred_distns <- lapply(x, function(p) { + x <- unname(apply( + p, 1, function(q) unname(sort(q, na.last = TRUE)), simplify = FALSE + )) + dist_quantiles(x, list(object$tau)) + }) + n_preds <- length(list_of_pred_distns[[1]]) + nout <- length(list_of_pred_distns) + tib <- tibble::tibble( + ids = rep(seq(n_preds), times = nout), + ahead = rep(object$aheads, each = n_preds), + distn = do.call(c, unname(list_of_pred_distns))) %>% + tidyr::nest(.pred = c(ahead, distn)) + + return(tib[".pred"]) + } + + + parsnip::set_pred( + model = "smooth_quantile_reg", + eng = "smoothqr", + mode = "regression", + type = "numeric", + value = list( + pre = NULL, + post = process_smooth_qr_preds, + func = c(fun = "predict"), + args = list(object = quote(object$fit), newdata = quote(new_data)) + ) + ) +} + diff --git a/R/utils-arg.R b/R/utils-arg.R index 81100591e..2ac97675b 100644 --- a/R/utils-arg.R +++ b/R/utils-arg.R @@ -32,7 +32,7 @@ arg_is_lgl = function(..., allow_null = FALSE, allow_na = FALSE, allow_empty = T handle_arg_list( ..., tests = function(name, value) { - if (!(is.logical(value) | (is.null(value) & !allow_null))) { + if (!(is.logical(value) | (is.null(value) & allow_null))) { cli::cli_abort("Argument {.val {name}} must be of logical type.") } if (any(is.na(value)) & !allow_na) { @@ -62,7 +62,7 @@ arg_is_nonneg_int = function(..., allow_null = FALSE) { ..., tests = function(name, value) { if (!((is.numeric(value) && all(value >= 0) && all(value %% 1 == 0)) | - (is.null(value) & !allow_null))) + (is.null(value) & allow_null))) cli::cli_abort("All {.val {name}} must be whole positive number(s).") } ) @@ -96,7 +96,7 @@ arg_is_int = function(..., allow_null = FALSE) { ..., tests = function(name, value) { if (!((is.numeric(value) && all(value %% 1 == 0)) | - (is.null(value) & !allow_null))) + (is.null(value) & allow_null))) cli::cli_abort("All {.val {name}} must be whole positive number(s).") } ) @@ -106,7 +106,7 @@ arg_is_numeric = function(..., allow_null = FALSE) { handle_arg_list( ..., tests = function(name, value) { - if (!(is.numeric(value) | (is.null(value) & !allow_null))) + if (!(is.numeric(value) | (is.null(value) & allow_null))) cli::cli_abort("All {.val {name}} must numeric.") } ) @@ -117,7 +117,7 @@ arg_is_lgl = function(..., allow_null = FALSE, allow_na = FALSE, allow_empty = T handle_arg_list( ..., tests = function(name, value) { - if (!(is.logical(value) | (is.null(value) & !allow_null))) + if (!(is.logical(value) | (is.null(value) & allow_null))) cli::cli_abort("Argument {.val {name}} must be of logical type.") if (any(is.na(value)) & !allow_na) @@ -173,8 +173,8 @@ arg_is_function = function(..., allow_null = FALSE) { handle_arg_list( ..., tests = function(name, value) { - if (!is.function(value) | (is.null(value) & !allow_null)) - cli::cli_abort("{value} must be a `parsnip` function.") + if (!is.function(value) | (is.null(value) & allow_null)) + cli::cli_abort("{value} must be a function.") } ) } @@ -198,7 +198,7 @@ arg_is_sorted = function(..., allow_null = FALSE) { arg_to_date <- function(x, allow_null = FALSE, allow_na = FALSE) { arg_is_scalar(x, allow_null = allow_null, allow_na = allow_na) - if (allow_null && !is.null(x)) { + if (!is.null(x)) { x <- tryCatch(as.Date(x), error = function(e) NA) } arg_is_date(x, allow_null = allow_null, allow_na = allow_na) diff --git a/R/zzz.R b/R/zzz.R index 666963028..77fedad9f 100644 --- a/R/zzz.R +++ b/R/zzz.R @@ -7,5 +7,6 @@ .onLoad <- function(libname, pkgname) { make_flatline_reg() make_quantile_reg() + make_smooth_quantile_reg() } diff --git a/inst/templates/layer.R b/inst/templates/layer.R index cf7ebc8cc..14dc4a82b 100644 --- a/inst/templates/layer.R +++ b/inst/templates/layer.R @@ -1,31 +1,43 @@ layer_{{ name }} <- function(frosting, # mandatory - ..., - args, # add as many as you need - .flag = TRUE, # mandatory - id = rand_id("{{{ name }}}")) { + ..., + args, # add as many as you need + more_args, + id = rand_id("{{{ name }}}")) { - # if you don't need ... then uncomment the below + # validate any additional arguments here + + # if you don't need ... then uncomment the line below ## rlang::check_dots_empty() add_layer( frosting, layer_{{{ name }}}_new( + terms = dplyr::enquos(...), # remove if ... should be empty args, id = id - ), - flag = .flag + ) ) } -layer_{{{ name }}}_new <- function(args, id) { - layer("{{{ name }}}", args, id = id) +layer_{{{ name }}}_new <- function(terms, args, more_args, id) { + layer("{{{ name }}}", + terms = terms, + args = args, + more_args = more_args, + id = id) } #' @export slather.layer_{{{ name }}} <- function(object, components, the_fit, the_recipe, ...) { - # add necessary processing steps here + # if layer_ used ... in tidyselect, we need to evaluate it now + exprs <- rlang::expr(c(!!!object$terms)) + pos <- tidyselect::eval_select(exprs, components$predictions) + col_names <- names(pos) + # now can select with `tidyselect::all_of(col_names)` - # always return components - components -} + # add additional necessary processing steps here + + # always return components + components + } diff --git a/man/layer_unnest.Rd b/man/layer_unnest.Rd new file mode 100644 index 000000000..3c25608a6 --- /dev/null +++ b/man/layer_unnest.Rd @@ -0,0 +1,24 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/layer_unnest.R +\name{layer_unnest} +\alias{layer_unnest} +\title{Unnest prediction list-cols} +\usage{ +layer_unnest(frosting, ..., id = rand_id("unnest")) +} +\arguments{ +\item{frosting}{a \code{frosting} postprocessor} + +\item{...}{<\code{\link[dplyr:dplyr_tidy_select]{tidy-select}}> One or more unquoted +expressions separated by commas. Variable names can be used as if they +were positions in the data frame, so expressions like \code{x:y} can +be used to select a range of variables.} + +\item{id}{a random id string} +} +\value{ +an updated \code{frosting} postprocessor +} +\description{ +Unnest prediction list-cols +} diff --git a/man/smooth_quantile_reg.Rd b/man/smooth_quantile_reg.Rd new file mode 100644 index 000000000..293999876 --- /dev/null +++ b/man/smooth_quantile_reg.Rd @@ -0,0 +1,88 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/make_smooth_quantile_reg.R +\name{smooth_quantile_reg} +\alias{smooth_quantile_reg} +\title{Smooth quantile regression} +\usage{ +smooth_quantile_reg( + mode = "regression", + engine = "smoothqr", + outcome_locations = NULL, + tau = 0.5, + degree = 3L +) +} +\arguments{ +\item{mode}{A single character string for the type of model. +The only possible value for this model is "regression".} + +\item{engine}{Character string naming the fitting function. Currently, only +"smooth_qr" is supported.} + +\item{outcome_locations}{Defaults to the vector \code{1:ncol(y)} but if the +responses are observed at a different spacing (or appear in a different +order), that information should be used here. This +argument will be mapped to the \code{ahead} argument of \code{\link[smoothqr:smooth_qr]{smoothqr::smooth_qr()}}.} + +\item{tau}{A scalar or vector of values in (0, 1) to determine which +quantiles to estimate (default is 0.5).} + +\item{degree}{the number of polynomials used for response smoothing. Must +be no more than the number of responses.} +} +\description{ +\code{smooth_quantile_reg()} generates a quantile regression model \emph{specification} for +the \href{https://www.tidymodels.org/}{tidymodels} framework. Currently, the +only supported engine is \code{\link[smoothqr:smooth_qr]{smoothqr::smooth_qr()}}. +} +\examples{ +tib <- data.frame( + y1 = rnorm(100), y2 = rnorm(100), y3 = rnorm(100), + y4 = rnorm(100), y5 = rnorm(100), y6 = rnorm(100), + x1 = rnorm(100), x2 = rnorm(100)) +qr_spec <- smooth_quantile_reg(tau = c(.2, .5, .8), outcome_locations = 1:6) +ff <- qr_spec \%>\% fit(cbind(y1, y2 , y3 , y4 , y5 , y6) ~ ., data = tib) +p <- predict(ff, new_data = tib) + +x <- -99:99 / 100 * 2 * pi +y <- sin(x) + rnorm(length(x), sd = .1) +fd <- x[length(x) - 20] +XY <- smoothqr::lagmat(y[1:(length(y) - 20)], c(-20:20)) +XY <- tibble::as_tibble(XY) +qr_spec <- smooth_quantile_reg(tau = c(.2, .5, .8), outcome_locations = 20:1) +tt <- qr_spec \%>\% fit_xy(x = XY[,21:41], y = XY[,1:20]) + +library(tidyr) +library(dplyr) +pl <- predict( + object = tt, + new_data = XY[max(which(complete.cases(XY[,21:41]))), 21:41] + ) +pl <- pl \%>\% + unnest(.pred) \%>\% + mutate(distn = nested_quantiles(distn)) \%>\% + unnest(distn) \%>\% + mutate(x = x[length(x) - 20] + ahead / 100 * 2 * pi, + ahead = NULL) \%>\% + pivot_wider(names_from = tau, values_from = q) +plot(x, y, pch = 16, xlim = c(pi, 2 * pi), col = "lightgrey") +curve(sin(x), add = TRUE) +abline(v = fd, lty = 2) +lines(pl$x, pl$`0.2`, col = "blue") +lines(pl$x, pl$`0.8`, col = "blue") +lines(pl$x, pl$`0.5`, col = "red") +\dontrun{ +ggplot(data.frame(x = x, y = y), aes(x)) + + geom_ribbon(data = pl, aes(ymin = `0.2`, ymax = `0.8`), fill = "lightblue") + + geom_point(aes(y = y), colour = "grey") + # observed data + geom_function(fun = sin, colour = "black") + # truth + geom_vline(xintercept = fd, linetype = "dashed") + # end of training data + geom_line(data = pl, aes(y = `0.5`), colour = "red") + # median prediction + theme_bw() + + coord_cartesian(xlim = c(0, NA)) + + ylab("y") +} +} +\seealso{ +\code{\link[=fit.model_spec]{fit.model_spec()}}, \code{\link[=set_engine]{set_engine()}} +}