diff --git a/DESCRIPTION b/DESCRIPTION index d58872c5..52426f7a 100755 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ -Package: epiprocess Type: Package +Package: epiprocess Title: Tools for basic signal processing in epidemiology -Version: 0.10.3 +Version: 0.10.4 Authors@R: c( person("Jacob", "Bien", role = "ctb"), person("Logan", "Brooks", , "lcbrooks+github@andrew.cmu.edu", role = c("aut", "cre")), @@ -23,17 +23,17 @@ Authors@R: c( person("Posit", role = "cph", comment = "Copyright holder of included rlang fragments"), person("Johns Hopkins University Center for Systems Science and Engineering", role = "dtc", - comment = "Owner of COVID-19 cases and deaths data from the COVID-19 Data Repository"), + comment = "Owner of COVID-19 cases and deaths data from the COVID-19 Data Repository"), person("Johns Hopkins University", role = "cph", - comment = "Copyright holder of COVID-19 cases and deaths data from the COVID-19 Data Repository"), + comment = "Copyright holder of COVID-19 cases and deaths data from the COVID-19 Data Repository"), person("Carnegie Mellon University Delphi Group", role = "dtc", - comment = "Owner of claims-based CLI data from the Delphi Epidata API") + comment = "Owner of claims-based CLI data from the Delphi Epidata API") ) -Description: This package introduces common data structures for working with - epidemiological data reported by location and time and offers associated - utilities to perform basic signal processing tasks. The package is designed - to be used in conjunction with `epipredict` for building and evaluating - epidemiological models. +Description: This package introduces common data structures for working + with epidemiological data reported by location and time and offers + associated utilities to perform basic signal processing tasks. The + package is designed to be used in conjunction with `epipredict` for + building and evaluating epidemiological models. License: MIT + file LICENSE URL: https://cmu-delphi.github.io/epiprocess/ Depends: @@ -44,7 +44,6 @@ Imports: cli, data.table, dplyr (>= 1.1.0), - genlasso, ggplot2, glue, lifecycle (>= 1.0.1), @@ -63,6 +62,7 @@ Imports: waldo Suggests: devtools, + distributional, epidatr, epipredict, here, @@ -71,6 +71,7 @@ Suggests: readr, rmarkdown, testthat (>= 3.1.5), + trendfilter, withr VignetteBuilder: knitr @@ -79,7 +80,7 @@ Remotes: cmu-delphi/epidatasets, cmu-delphi/epidatr, cmu-delphi/epipredict, - glmgen/genlasso, + glmgen/trendfilter, reconverse/outbreaks Config/Needs/website: cmu-delphi/delphidocs Config/testthat/edition: 3 diff --git a/NAMESPACE b/NAMESPACE index a5aefadd..1a09d541 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -46,6 +46,7 @@ S3method(mean,epi_df) S3method(print,epi_archive) S3method(print,epi_df) S3method(print,grouped_epi_archive) +S3method(print,growth_rate_params) S3method(summary,epi_df) S3method(ungroup,epi_df) S3method(ungroup,grouped_epi_archive) @@ -80,6 +81,7 @@ export(group_by) export(group_epi_df) export(group_modify) export(growth_rate) +export(growth_rate_params) export(guess_period) export(is_epi_df) export(is_grouped_epi_archive) @@ -109,6 +111,7 @@ importFrom(checkmate,assert_function) importFrom(checkmate,assert_int) importFrom(checkmate,assert_list) importFrom(checkmate,assert_logical) +importFrom(checkmate,assert_number) importFrom(checkmate,assert_numeric) importFrom(checkmate,assert_scalar) importFrom(checkmate,assert_string) diff --git a/NEWS.md b/NEWS.md index 8c4254bc..bd0d518d 100644 --- a/NEWS.md +++ b/NEWS.md @@ -5,6 +5,16 @@ Pre-1.0.0 numbering scheme: 0.x will indicate releases, while 0.x.y will indicat # epiprocess 0.11 ## Breaking changes + +- `growth_rate()` argument order and names have changed. You will need to + rewrite `growth_rate(x, y)` as `growth_rate(y, x)`. The interface for passing + arguments to the `"smooth_spline"` and `"trend_filter"` methods has also + changed. Finally, `growth_rate()` with `method = "trendfilter"` now uses the + `{trendfilter}` package rather than `{genlasso}`; results for this method will + be different than before. In order to make `{epiprocess}` installation easier + for users without a compiler, we have placed `{trendfilter}` in Suggests:; if + you want to use `method = "trendfilter"` you will need to manually install + this dependency (e.g., with `remotes::install_github("glmgen/trendfilter")`). - In `revision_summary()`: - Output now uses the name `lag_near_latest` instead of `time_near_latest`. To migrate, update references to `time_near_latest` to `lag_near_latest`. diff --git a/R/growth_rate.R b/R/growth_rate.R index 307309b5..7eac4440 100644 --- a/R/growth_rate.R +++ b/R/growth_rate.R @@ -5,12 +5,12 @@ #' vignette](https://cmu-delphi.github.io/epiprocess/articles/growth_rate.html) #' for examples. #' +#' @param y Signal values. #' @param x Design points corresponding to the signal values `y`. Default is #' `seq_along(y)` (that is, equally-spaced points from 1 to the length of #' `y`). -#' @param y Signal values. #' @param x0 Points at which we should estimate the growth rate. Must be a -#' subset of `x` (no extrapolation allowed). Default is `x`. +#' contained in the range of `x` (no extrapolation allowed). Default is `x`. #' @param method Either "rel_change", "linear_reg", "smooth_spline", or #' "trend_filter", indicating the method to use for the growth rate #' calculation. The first two are local methods: they are run in a sliding @@ -21,14 +21,10 @@ #' "linear_reg". See details for more explanation. #' @param log_scale Should growth rates be estimated using the parametrization #' on the log scale? See details for an explanation. Default is `FALSE`. -#' @param dup_rm Should we check and remove duplicates in `x` (and corresponding -#' elements of `y`) before the computation? Some methods might handle -#' duplicate `x` values gracefully, whereas others might fail (either quietly -#' or loudly). Default is `FALSE`. #' @param na_rm Should missing values be removed before the computation? Default #' is `FALSE`. -#' @param ... Additional arguments to pass to the method used to estimate the -#' derivative. +#' @param params Additional arguments to pass to the method used to estimate the +#' derivative. This should be created with `growth_rate_params()`. #' @return Vector of growth rate estimates at the specified points `x0`. #' #' @details The growth rate of a function f defined over a continuously-valued @@ -49,12 +45,14 @@ #' sliding window centered at the reference point `x0`, divided by the fitted #' value from this linear regression at `x0`. #' * "smooth_spline": uses the estimated derivative at `x0` from a smoothing -#' spline fit to `x` and `y`, via `stats::smooth.spline()`, divided by the +#' spline fit to `x` and `y`, via [stats::smooth.spline()], divided by the #' fitted value of the spline at `x0`. #' * "trend_filter": uses the estimated derivative at `x0` from polynomial trend #' filtering (a discrete spline) fit to `x` and `y`, via -#' `genlasso::trendfilter()`, divided by the fitted value of the discrete -#' spline at `x0`. +#' [trendfilter::trendfilter()], divided by the fitted value of the discrete +#' spline at `x0`. This method requires the +#' [`{trendfilter}` package](https://github.com/glmgen/trendfilter) +#' to be installed. #' #' ## Log Scale #' @@ -80,27 +78,30 @@ #' ## Additional Arguments #' #' For the global methods, "smooth_spline" and "trend_filter", additional -#' arguments can be specified via `...` for the underlying estimation -#' function. For the smoothing spline case, these additional arguments are -#' passed directly to `stats::smooth.spline()` (and the defaults are exactly -#' as in this function). The trend filtering case works a bit differently: -#' here, a custom set of arguments is allowed (which are distributed -#' internally to `genlasso::trendfilter()` and `genlasso::cv.trendfilter()`): +#' arguments can be specified via `params` for the underlying estimation +#' function. These additional arguments are +#' passed to [stats::smooth.spline()], [trendfilter::trendfilter()], or +#' [trendfilter::cv_trendfilter()]. The defaults are exactly +#' as specified in those functions, except when those defaults conflict +#' among these functions. These cases are as follows: #' -#' * `ord`: order of piecewise polynomial for the trend filtering fit. Default -#' is 3. -#' * `maxsteps`: maximum number of steps to take in the solution path before -#' terminating. Default is 1000. -#' * `cv`: should cross-validation be used to choose an effective degrees of -#' freedom for the fit? Default is `TRUE`. -#' * `k`: number of folds if cross-validation is to be used. Default is 3. -#' * `df`: desired effective degrees of freedom for the trend filtering fit. If -#' `cv = FALSE`, then `df` must be a positive integer; if `cv = TRUE`, then -#' `df` must be one of "min" or "1se" indicating the selection rule to use +#' * `df`: desired effective degrees of freedom. For "smooth_spline", this must be numeric (or `NULL`) and will +#' be passed along to the underlying function. For "trend_filter", if +#' `cv = FALSE`, then `df` must be a positive number (integer is most sensible); +#' if `cv = TRUE`, then `df` must be one of "min" or "1se" indicating the +#' selection rule to use #' based on the cross-validation error curve: minimum or 1-standard-error -#' rule, respectively. Default is "min" (going along with the default `cv = -#' TRUE`). Note that if `cv = FALSE`, then we require `df` to be set by the -#' user. +#' rule, respectively. The default is "min" (going along with the default +#' `cv = TRUE`). +#' * `lambda`: For "smooth_spline", this should be a scalar value or `NULL`. +#' For "trend_filter", this is allowed to also be a vector, as long as either +#' `cv = TRUE` or `df` is specified. +#' * `cv`: should cross-validation be used to choose an effective degrees of +#' freedom for the fit? The default is `FALSE` to match [stats::smooth.spline()]. +#' In that case, as in that function, GCV is used instead. +#' For "trend_filter", this will be coerced to `TRUE` if neither +#' `df` nor `lambda` are specified (the default). +#' Note that passing both `df` and a scalar `lambda` will always be an error. #' #' @export #' @examples @@ -109,54 +110,67 @@ #' group_by(geo_value) %>% #' mutate(cases_gr = growth_rate(x = time_value, y = cases)) #' -#' # Log scale, degree 4 polynomial and 6-fold cross validation +#' # Degree 3 polynomial and 5-fold cross validation on the log scale +#' # some locations report 0 cases, so we replace these with 1 #' cases_deaths_subset %>% #' group_by(geo_value) %>% -#' mutate(gr_poly = growth_rate(x = time_value, y = cases, log_scale = TRUE, ord = 4, k = 6)) -growth_rate <- function(x = seq_along(y), y, x0 = x, - method = c( - "rel_change", "linear_reg", - "smooth_spline", "trend_filter" - ), - h = 7, log_scale = FALSE, - dup_rm = FALSE, na_rm = FALSE, ...) { +#' mutate(gr_poly = growth_rate( +#' x = time_value, y = pmax(cases, 1), method = "trend_filter", +#' log_scale = TRUE, na_rm = TRUE +#' )) +growth_rate <- function( + y, x = seq_along(y), x0 = x, + method = c("rel_change", "linear_reg", "smooth_spline", "trend_filter"), + h = 7, log_scale = FALSE, na_rm = FALSE, + params = growth_rate_params()) { # Check x, y, x0 if (length(x) != length(y)) cli_abort("`x` and `y` must have the same length.") - if (!all(x0 %in% x)) cli_abort("`x0` must be a subset of `x`.") method <- rlang::arg_match(method) + assert_class(params, "growth_rate_params") + if (anyNA(x) || anyNA(x0)) { + cli_abort("Neither `x` nor `x0` may contain `NA`s.") + } + if (vctrs::vec_duplicate_any(x)) { + cli_abort( + "`x` contains duplicate values. (If being run on a + column in an `epi_df`, did you group by relevant key variables?)" + ) + } + if (method == "trend_filter" && !requireNamespace("trendfilter", quietly = TRUE)) { + cli_abort(c( + "The {.pkg trendfilter} package must be installed to use this option.", + i = "It is available at {.url https://github.com/glmgen/trendfilter}." + )) + } # Arrange in increasing order of x + if (min(x0) < min(x) || max(x0) > max(x)) { + cli_abort("`x0` must be contained in `[min(x), max(x)]`.") + } o <- order(x) x <- x[o] y <- y[o] + n <- length(y) # Convert to log(y) if we need to y <- as.numeric(y) - if (log_scale) y <- log(y) - - # Remove duplicates if we need to - if (dup_rm) { - o <- !duplicated(x) - if (any(!o)) { - cli_warn( - "`x` contains duplicate values. (If being run on a - column in an `epi_df`, did you group by relevant key variables?)" - ) + if (log_scale) { + if (any(y <= 0)) { + cli_warn("`y` contains 0 or negative values. Taking logs may produce + strange results.") } - x <- x[o] - y <- y[o] + y <- suppressWarnings(log(y)) } - - # Remove NAs if we need to - if (na_rm) { - o <- !(is.na(x) & is.na(y)) - x <- x[o] - y <- y[o] + if (!is.finite(y[1]) || !is.finite(y[n])) { + cli_abort("Either the first or last `y` values are not finite. This may be + due to `log_scale = TRUE`.") } - - # Useful indices for later - i0 <- x %in% x0 + good_obs <- (!is.na(y) | !na_rm) & is.finite(y) + x <- x[good_obs] + y <- y[good_obs] + x <- as.numeric(x) + x0 <- as.numeric(x0) # Local methods if (method == "rel_change" || method == "linear_reg") { @@ -195,24 +209,27 @@ growth_rate <- function(x = seq_along(y), y, x0 = x, } } }) - - return(g[i0]) + return(stats::approx(x, g, x0)$y) } # Global methods if (method == "smooth_spline" || method == "trend_filter") { - # Convert to numerics - x <- as.numeric(x) - x0 <- as.numeric(x0) - - # Collect parameters - params <- list(...) + if (any(is.na(x) | is.na(y) | !is.finite(x) | !is.finite(y))) { + cli_abort(c( + "{.val {method}} requires all real values without missingness.", + i = "Set `na_rm = TRUE` and / or check for infinite values.", + i = "Using `log_scale = TRUE` may induce either case." + )) + } - # Smoothing spline if (method == "smooth_spline") { - params$x <- x - params$y <- y - obj <- do.call(stats::smooth.spline, params) + if (is.character(params$df)) params$df <- NULL + if (length(params$lambda) > 1L) { + cli_abort("{.val smooth_spline} requires 1 `lambda` but more were used.") + } + params <- params[c("df", "spar", "lambda", "cv", "all.knots", "df.offset", "penalty")] + params <- params[!sapply(params, is.null)] + obj <- rlang::inject(stats::smooth.spline(x = x, y = y, !!!params)) f0 <- stats::predict(obj, x = x0)$y d0 <- stats::predict(obj, x = x0, deriv = 1)$y if (log_scale) { @@ -220,46 +237,149 @@ growth_rate <- function(x = seq_along(y), y, x0 = x, } else { return(d0 / f0) } - } else { - # Trend filtering - ord <- params$ord - maxsteps <- params$maxsteps - cv <- params$cv - df <- params$df - k <- params$k - - # Default parameters - ord <- ord %||% 3 - maxsteps <- maxsteps %||% 1000 - cv <- cv %||% TRUE - df <- df %||% "min" - k <- k %||% 3 - - # Check cv and df combo - if (is.numeric(df)) cv <- FALSE - if (!cv && !(is.numeric(df) && df == round(df))) { - cli_abort("If `cv = FALSE`, then `df` must be an integer.") - } - - # Compute trend filtering path - obj <- genlasso::trendfilter(y = y, pos = x, ord = ord, max = maxsteps) - - # Use CV to find df, if we need to - if (cv) { - cv_obj <- quiet(genlasso::cv.trendfilter(obj, k = k, mode = "df")) - df <- ifelse(df == "min", cv_obj$df.min, cv_obj$df.1se) + } else { # Trend filtering + params <- parse_trendfilter_params(params) + if (params$cv) { + obj <- trendfilter::cv_trendfilter( + y, x, + k = params$k, error_measure = params$error_measure, + nfolds = params$nfolds, family = params$family, lambda = params$lambda, + nlambda = params$nlambda, lambda_max = params$lambda_max, + lambda_min = params$lambda_min, lambda_min_ratio = params$lambda_min_ratio + ) + lam <- params$df + which_lambda <- paste0("lambda_", lam) + f <- stats::predict(obj, newx = x0, which_lambda = which_lambda) + } else { + obj <- trendfilter::trendfilter( + y, x, + k = params$k, family = params$family, lambda = params$lambda, + nlambda = params$nlambda, lambda_max = params$lambda_max, + lambda_min = params$lambda_min, lambda_min_ratio = params$lambda_min_ratio + ) + single_lambda <- length(obj$lambda) == 1L + lam <- ifelse(single_lambda, obj$lambda, obj$lambda[which.min(abs(params$df - obj$dof))]) + f <- stats::predict(obj, newx = x0, lambda = lam) } - # Estimate growth rate and return - f <- genlasso::coef.genlasso(obj, df = df)$beta - d <- diff(f) / diff(x) + d <- diff(f) / diff(x0) # Extend by one element d <- c(d, d[length(d)]) if (log_scale) { - return(d[i0]) + return(d) } else { - return((d / f)[i0]) + return(d / f) + } + } + } +} + +#' Optional parameters for growth rate methods +#' +#' Construct an object containing non-standard arguments for [growth_rate()]. +#' +#' @param df Numeric or NULL for "smooth_spline". May also be one of "min" or +#' "max" in the case of "trend_filter". The desired equivalent number of +#' degrees of freedom of the fit. Lower values give smoother estimates. +#' @param lambda The desired smoothing parameter. For "smooth_spline", this +#' can be specified instead of `spar`. For "trend_filter", this sequence +#' determines the balance between data fidelity and smoothness of the +#' estimated curve; larger `lambda` results in a smoother estimate. The +#' default, `NULL` results in an automatic computation based on `nlambda`, +#' the largest value of `lambda` that would result in a maximally smooth +#' estimate, and `lambda_min_ratio`. Supplying a value of `lambda` overrides +#' this behaviour. +#' @param cv For "smooth_spline", ordinary leave-one-out (`TRUE`) or ‘generalized’ +#' cross-validation (GCV) when `FALSE`; is used for smoothing parameter computation +#' only when both `spar` and `df` are not specified. For "trend_filter", +#' `cv` determines whether or not cross-validation is used to choose the +#' tuning parameter. If `FALSE`, then the user must specify either `lambda` +#' or `df`. +#' @inheritParams stats::smooth.spline +#' @inheritParams trendfilter::trendfilter +#' @inheritParams trendfilter::cv_trendfilter +#' +#' @return A list of parameter configurations. +#' @importFrom checkmate assert_number +#' @export +growth_rate_params <- function( + df = NULL, + lambda = NULL, + cv = FALSE, + spar = NULL, + all.knots = FALSE, # nolint + df.offset = 0, # nolint + penalty = 1, + k = 3L, + family = c("gaussian", "logistic", "poisson"), + nlambda = 50L, + lambda_max = NULL, + lambda_min = NULL, + lambda_min_ratio = 1e-5, + error_measure = c("deviance", "mse", "mae"), + nfolds = 3L) { + if (is.character(df)) { + df <- rlang::arg_match0(df, c("min", "1se")) + } else { + assert_number(df, lower = 0, null.ok = TRUE, finite = TRUE) + } + assert_number(spar, null.ok = TRUE, finite = TRUE) + assert_numeric(lambda, lower = 0, null.ok = TRUE, finite = TRUE) + assert_logical(cv, len = 1) + assert_logical(all.knots, len = 1) + assert_number(df.offset, lower = 0, finite = TRUE) + assert_number(penalty, lower = 0, finite = TRUE) + checkmate::assert_integerish(k, lower = 0, len = 1) + family <- arg_match(family) + assert_number(nlambda, lower = 0, finite = TRUE) + assert_number(lambda_max, lower = 0, finite = TRUE, null.ok = TRUE) + assert_number(lambda_min, lower = 0, finite = TRUE, null.ok = TRUE) + assert_number(lambda_min_ratio, lower = 0, upper = 1) + error_measure <- arg_match(error_measure) + checkmate::assert_integerish(nfolds, lower = 2, len = 1) + + structure(enlist( + df, lambda, cv, # shared by all + spar, all.knots, df.offset, penalty, # smooth.spline + k, family, nlambda, lambda_max, lambda_min, lambda_min_ratio, # all TF + error_measure, nfolds # cv_trendfilter + ), class = "growth_rate_params") +} + +#' @export +print.growth_rate_params <- function(x, ...) { + utils::str(x, give.attr = FALSE) +} + +parse_trendfilter_params <- function(params) { + assert_class(params, "growth_rate_params") + vec_lambda <- checkmate::test_numeric(params$lambda, min.len = 2L, null.ok = TRUE) + df_cv <- checkmate::test_character(params$df, null.ok = TRUE) + if (df_cv && vec_lambda) { + params$cv <- TRUE # Turn CV on (or leave it on) + params$df <- params$df %||% "min" # use the original arg or provide the minimizer + return(params) + } + if (params$cv) { # CV = TRUE on input but conflicts with other custom args + cli_abort( + "When `cv = TRUE`, `df` must be `NULL` or character and `lambda` must be + `NULL` or a vector." + ) + } else { # CV should stay FALSE + if (!vec_lambda) { + if (is.character(params$df)) { + cli_abort( + "`df` a character implies using CV, but also setting `lambda` to a + single value implies no CV." + ) + } + if (is.numeric(params$df)) { + cli_abort("`df` and `lambda` cannot both be scalars.") } } } + # If we got here, we fit TF. There are two possibilities: + # 1. df is NULL and lambda is a scalar + # 2. df is numeric and lambda is either NULL or a vector (vec_lambda = TRUE) + return(params) } diff --git a/R/time-utils.R b/R/time-utils.R index 28b7182a..73fbc8a5 100644 --- a/R/time-utils.R +++ b/R/time-utils.R @@ -265,8 +265,8 @@ time_type_unit_pluralizer <- c( #' Format a length-1 time delta to a character to assist messaging #' #' This is meant to address the following: -#' - glue::glue("{as.difftime(1, units = 'days')}") is "1" -#' - glue::glue("{format(as.difftime(1, units = 'days'))}") is "1 days" +#' - `glue::glue("{as.difftime(1, units = 'days')}")` is "1" +#' - `glue::glue("{format(as.difftime(1, units = 'days'))}")` is "1 days" #' - time deltas for yearmonths and integers don't have units attached at all #' #' @keywords internal diff --git a/_pkgdown.yml b/_pkgdown.yml index e8a2c8c3..3742bd41 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -63,7 +63,7 @@ reference: - sum_groups_epi_df - epi_cor - detect_outlr - - growth_rate + - starts_with("growth_rate") - as_tibble.epi_df - as_tsibble.epi_df diff --git a/man/format_time_delta.Rd b/man/format_time_delta.Rd index 88ba581a..3658ff27 100644 --- a/man/format_time_delta.Rd +++ b/man/format_time_delta.Rd @@ -9,8 +9,8 @@ format_time_delta(x, time_type) \description{ This is meant to address the following: \itemize{ -\item glue::glue("{as.difftime(1, units = 'days')}") is "1" -\item glue::glue("{format(as.difftime(1, units = 'days'))}") is "1 days" +\item \code{glue::glue("{as.difftime(1, units = 'days')}")} is "1" +\item \code{glue::glue("{format(as.difftime(1, units = 'days'))}")} is "1 days" \item time deltas for yearmonths and integers don't have units attached at all } } diff --git a/man/growth_rate.Rd b/man/growth_rate.Rd index 0e22508c..1529a836 100644 --- a/man/growth_rate.Rd +++ b/man/growth_rate.Rd @@ -5,26 +5,25 @@ \title{Estimate growth rate} \usage{ growth_rate( - x = seq_along(y), y, + x = seq_along(y), x0 = x, method = c("rel_change", "linear_reg", "smooth_spline", "trend_filter"), h = 7, log_scale = FALSE, - dup_rm = FALSE, na_rm = FALSE, - ... + params = growth_rate_params() ) } \arguments{ +\item{y}{Signal values.} + \item{x}{Design points corresponding to the signal values \code{y}. Default is \code{seq_along(y)} (that is, equally-spaced points from 1 to the length of \code{y}).} -\item{y}{Signal values.} - \item{x0}{Points at which we should estimate the growth rate. Must be a -subset of \code{x} (no extrapolation allowed). Default is \code{x}.} +contained in the range of \code{x} (no extrapolation allowed). Default is \code{x}.} \item{method}{Either "rel_change", "linear_reg", "smooth_spline", or "trend_filter", indicating the method to use for the growth rate @@ -39,16 +38,11 @@ the entire sequence. See details for more explanation.} \item{log_scale}{Should growth rates be estimated using the parametrization on the log scale? See details for an explanation. Default is \code{FALSE}.} -\item{dup_rm}{Should we check and remove duplicates in \code{x} (and corresponding -elements of \code{y}) before the computation? Some methods might handle -duplicate \code{x} values gracefully, whereas others might fail (either quietly -or loudly). Default is \code{FALSE}.} - \item{na_rm}{Should missing values be removed before the computation? Default is \code{FALSE}.} -\item{...}{Additional arguments to pass to the method used to estimate the -derivative.} +\item{params}{Additional arguments to pass to the method used to estimate the +derivative. This should be created with \code{growth_rate_params()}.} } \value{ Vector of growth rate estimates at the specified points \code{x0}. @@ -77,12 +71,14 @@ using a first-difference approximation to the derivative. sliding window centered at the reference point \code{x0}, divided by the fitted value from this linear regression at \code{x0}. \item "smooth_spline": uses the estimated derivative at \code{x0} from a smoothing -spline fit to \code{x} and \code{y}, via \code{stats::smooth.spline()}, divided by the +spline fit to \code{x} and \code{y}, via \code{\link[stats:smooth.spline]{stats::smooth.spline()}}, divided by the fitted value of the spline at \code{x0}. \item "trend_filter": uses the estimated derivative at \code{x0} from polynomial trend filtering (a discrete spline) fit to \code{x} and \code{y}, via -\code{genlasso::trendfilter()}, divided by the fitted value of the discrete -spline at \code{x0}. +\code{\link[trendfilter:trendfilter]{trendfilter::trendfilter()}}, divided by the fitted value of the discrete +spline at \code{x0}. This method requires the +\href{https://github.com/glmgen/trendfilter}{\code{{trendfilter}} package} +to be installed. } \subsection{Log Scale}{ @@ -110,26 +106,30 @@ behavior of \code{epi_slide()} with \code{before = h - 1} and \code{after = h}). \subsection{Additional Arguments}{ For the global methods, "smooth_spline" and "trend_filter", additional -arguments can be specified via \code{...} for the underlying estimation -function. For the smoothing spline case, these additional arguments are -passed directly to \code{stats::smooth.spline()} (and the defaults are exactly -as in this function). The trend filtering case works a bit differently: -here, a custom set of arguments is allowed (which are distributed -internally to \code{genlasso::trendfilter()} and \code{genlasso::cv.trendfilter()}): +arguments can be specified via \code{params} for the underlying estimation +function. These additional arguments are +passed to \code{\link[stats:smooth.spline]{stats::smooth.spline()}}, \code{\link[trendfilter:trendfilter]{trendfilter::trendfilter()}}, or +\code{\link[trendfilter:cv_trendfilter]{trendfilter::cv_trendfilter()}}. The defaults are exactly +as specified in those functions, except when those defaults conflict +among these functions. These cases are as follows: \itemize{ -\item \code{ord}: order of piecewise polynomial for the trend filtering fit. Default -is 3. -\item \code{maxsteps}: maximum number of steps to take in the solution path before -terminating. Default is 1000. -\item \code{cv}: should cross-validation be used to choose an effective degrees of -freedom for the fit? Default is \code{TRUE}. -\item \code{k}: number of folds if cross-validation is to be used. Default is 3. -\item \code{df}: desired effective degrees of freedom for the trend filtering fit. If -\code{cv = FALSE}, then \code{df} must be a positive integer; if \code{cv = TRUE}, then -\code{df} must be one of "min" or "1se" indicating the selection rule to use +\item \code{df}: desired effective degrees of freedom. For "smooth_spline", this must be numeric (or \code{NULL}) and will +be passed along to the underlying function. For "trend_filter", if +\code{cv = FALSE}, then \code{df} must be a positive number (integer is most sensible); +if \code{cv = TRUE}, then \code{df} must be one of "min" or "1se" indicating the +selection rule to use based on the cross-validation error curve: minimum or 1-standard-error -rule, respectively. Default is "min" (going along with the default \code{cv = TRUE}). Note that if \code{cv = FALSE}, then we require \code{df} to be set by the -user. +rule, respectively. The default is "min" (going along with the default +\code{cv = TRUE}). +\item \code{lambda}: For "smooth_spline", this should be a scalar value or \code{NULL}. +For "trend_filter", this is allowed to also be a vector, as long as either +\code{cv = TRUE} or \code{df} is specified. +\item \code{cv}: should cross-validation be used to choose an effective degrees of +freedom for the fit? The default is \code{FALSE} to match \code{\link[stats:smooth.spline]{stats::smooth.spline()}}. +In that case, as in that function, GCV is used instead. +For "trend_filter", this will be coerced to \code{TRUE} if neither +\code{df} nor \code{lambda} are specified (the default). +Note that passing both \code{df} and a scalar \code{lambda} will always be an error. } } } @@ -139,8 +139,12 @@ cases_deaths_subset \%>\% group_by(geo_value) \%>\% mutate(cases_gr = growth_rate(x = time_value, y = cases)) -# Log scale, degree 4 polynomial and 6-fold cross validation +# Degree 3 polynomial and 5-fold cross validation on the log scale +# some locations report 0 cases, so we replace these with 1 cases_deaths_subset \%>\% group_by(geo_value) \%>\% - mutate(gr_poly = growth_rate(x = time_value, y = cases, log_scale = TRUE, ord = 4, k = 6)) + mutate(gr_poly = growth_rate( + x = time_value, y = pmax(cases, 1), method = "trend_filter", + log_scale = TRUE, na_rm = TRUE + )) } diff --git a/man/growth_rate_params.Rd b/man/growth_rate_params.Rd new file mode 100644 index 00000000..b0bb00c8 --- /dev/null +++ b/man/growth_rate_params.Rd @@ -0,0 +1,120 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/growth_rate.R +\name{growth_rate_params} +\alias{growth_rate_params} +\title{Optional parameters for growth rate methods} +\usage{ +growth_rate_params( + df = NULL, + lambda = NULL, + cv = FALSE, + spar = NULL, + all.knots = FALSE, + df.offset = 0, + penalty = 1, + k = 3L, + family = c("gaussian", "logistic", "poisson"), + nlambda = 50L, + lambda_max = NULL, + lambda_min = NULL, + lambda_min_ratio = 1e-05, + error_measure = c("deviance", "mse", "mae"), + nfolds = 3L +) +} +\arguments{ +\item{df}{Numeric or NULL for "smooth_spline". May also be one of "min" or +"max" in the case of "trend_filter". The desired equivalent number of +degrees of freedom of the fit. Lower values give smoother estimates.} + +\item{lambda}{The desired smoothing parameter. For "smooth_spline", this +can be specified instead of \code{spar}. For "trend_filter", this sequence +determines the balance between data fidelity and smoothness of the +estimated curve; larger \code{lambda} results in a smoother estimate. The +default, \code{NULL} results in an automatic computation based on \code{nlambda}, +the largest value of \code{lambda} that would result in a maximally smooth +estimate, and \code{lambda_min_ratio}. Supplying a value of \code{lambda} overrides +this behaviour.} + +\item{cv}{For "smooth_spline", ordinary leave-one-out (\code{TRUE}) or ‘generalized’ +cross-validation (GCV) when \code{FALSE}; is used for smoothing parameter computation +only when both \code{spar} and \code{df} are not specified. For "trend_filter", +\code{cv} determines whether or not cross-validation is used to choose the +tuning parameter. If \code{FALSE}, then the user must specify either \code{lambda} +or \code{df}.} + +\item{spar}{smoothing parameter, typically (but not necessarily) in + \eqn{(0,1]}. When \code{spar} is specified, the coefficient + \eqn{\lambda} of the integral of the squared second derivative in the + fit (penalized log likelihood) criterion is a monotone function of + \code{spar}, see the details below. Alternatively \code{lambda} may + be specified instead of the \emph{scale free} \code{spar}=\eqn{s}.} + +\item{all.knots}{if \code{TRUE}, all distinct points in \code{x} are used + as knots. If \code{FALSE} (default), a subset of \code{x[]} is used, + specifically \code{x[j]} where the \code{nknots} indices are evenly + spaced in \code{1:n}, see also the next argument \code{nknots}. + + Alternatively, a strictly increasing \code{\link{numeric}} vector + specifying \dQuote{all the knots} to be used; must be rescaled + to \eqn{[0, 1]} already such that it corresponds to the + \code{ans $ fit$knots} sequence returned, not repeating the boundary + knots.} + +\item{df.offset}{allows the degrees of freedom to be increased by + \code{df.offset} in the GCV criterion.} + +\item{penalty}{the coefficient of the penalty for degrees of freedom + in the GCV criterion.} + +\item{k}{Integer. Degree of the piecewise polynomial curve to be +estimated. For example, \code{k = 0} corresponds to a piecewise constant +curve.} + +\item{family}{Character or function. Specifies the loss function +to use. Valid options are: +\itemize{ +\item \code{"gaussian"} - least squares loss (the default), +\item \code{"binomial"} - logistic loss (classification), +\item \code{"poisson"} - Poisson loss for count data +} + +For any other type, a valid \code{\link[stats:family]{stats::family()}} object may be passed. Note +that these will generally be much slower to estimate than the built-in +options passed as strings. So for example, \code{family = "gaussian"} and +\code{family = gaussian()} will produce the same results, but the first +will be much faster.character.} + +\item{nlambda}{Integer. Number of lambda values to use in the sequence.} + +\item{lambda_max}{Optional value for the largest \code{lambda} to use.} + +\item{lambda_min}{Optional value for the smallest \code{lambda} to use (> 0).} + +\item{lambda_min_ratio}{If neither \code{lambda} nor \code{lambda_min} is specified, +\code{lambda_min = lambda_max * lambda_min_ratio}. +A very small value will lead to the solution \code{theta = y} (for the Gaussian +loss). This argument has no effect if there is a user-defined \code{lambda} +sequence.} + +\item{error_measure}{Metric used to calculate cross validation scores. May +be \code{mse}, \code{mae}, or \code{deviance}.} + +\item{nfolds}{Integer. The number of folds to use. For leave-vth-out cross +validation, every vth \code{y} value and its corresponding position (and weight) +are placed into the same fold. The first and last observations are not +assigned to any folds. This value must be at least 2. As an example, with +15 data points and \code{nfolds = 4}, the points are assigned to folds in the +following way: +\deqn{ + 0 \; 1 \; 2 \; 3 \; 4 \; 1 \; 2 \; 3 \; 4 \; 1 \; 2 \; 3 \; 4 \; 1 \; 0 + }{0 1 2 3 4 1 2 3 4 1 2 3 4 1 0} where 0 indicates no assignment. +Therefore, the folds are not random and running \code{cv_trendfilter()} twice +will give the same result.} +} +\value{ +A list of parameter configurations. +} +\description{ +Construct an object containing non-standard arguments for \code{\link[=growth_rate]{growth_rate()}}. +} diff --git a/tests/testthat/_snaps/growth_rate.md b/tests/testthat/_snaps/growth_rate.md new file mode 100644 index 00000000..213c2756 --- /dev/null +++ b/tests/testthat/_snaps/growth_rate.md @@ -0,0 +1,109 @@ +# global param constructor errors when required + + Code + growth_rate_params(df = -5) + Condition + Error in `growth_rate_params()`: + ! Assertion on 'df' failed: Element 1 is not >= 0. + +--- + + Code + growth_rate_params(nlambda = 5:8) + Condition + Error in `growth_rate_params()`: + ! Assertion on 'nlambda' failed: Must have length 1. + +# new setup args and warnings are as expected + + Code + growth_rate(y = -10:10, log_scale = TRUE) + Condition + Warning: + `y` contains 0 or negative values. Taking logs may produce strange results. + Error in `growth_rate()`: + ! Either the first or last `y` values are not finite. This may be due to `log_scale = TRUE`. + +--- + + Code + growth_rate(y = -10:10, log_scale = TRUE, method = "smooth_spline") + Condition + Warning: + `y` contains 0 or negative values. Taking logs may produce strange results. + Error in `growth_rate()`: + ! Either the first or last `y` values are not finite. This may be due to `log_scale = TRUE`. + +--- + + Code + growth_rate(y = 1:30, x = c(1:20, NA, 22:30), na_rm = TRUE) + Condition + Error in `growth_rate()`: + ! Neither `x` nor `x0` may contain `NA`s. + +--- + + Code + growth_rate(y = 1:20, method = "smooth_spline", params = growth_rate_params( + lambda = 1:20)) + Condition + Error in `growth_rate()`: + ! "smooth_spline" requires 1 `lambda` but more were used. + +# parser sees all cases + + Code + parse_trendfilter_params(l) + Condition + Error in `parse_trendfilter_params()`: + ! When `cv = TRUE`, `df` must be `NULL` or character and `lambda` must be `NULL` or a vector. + +--- + + Code + parse_trendfilter_params(l) + Condition + Error in `parse_trendfilter_params()`: + ! When `cv = TRUE`, `df` must be `NULL` or character and `lambda` must be `NULL` or a vector. + +--- + + Code + parse_trendfilter_params(l) + Condition + Error in `parse_trendfilter_params()`: + ! When `cv = TRUE`, `df` must be `NULL` or character and `lambda` must be `NULL` or a vector. + +--- + + Code + parse_trendfilter_params(l) + Condition + Error in `parse_trendfilter_params()`: + ! When `cv = TRUE`, `df` must be `NULL` or character and `lambda` must be `NULL` or a vector. + +--- + + Code + parse_trendfilter_params(l) + Condition + Error in `parse_trendfilter_params()`: + ! When `cv = TRUE`, `df` must be `NULL` or character and `lambda` must be `NULL` or a vector. + +--- + + Code + parse_trendfilter_params(l) + Condition + Error in `parse_trendfilter_params()`: + ! `df` a character implies using CV, but also setting `lambda` to a single value implies no CV. + +--- + + Code + parse_trendfilter_params(l) + Condition + Error in `parse_trendfilter_params()`: + ! `df` and `lambda` cannot both be scalars. + diff --git a/tests/testthat/test-growth_rate.R b/tests/testthat/test-growth_rate.R new file mode 100644 index 00000000..9aa9936d --- /dev/null +++ b/tests/testthat/test-growth_rate.R @@ -0,0 +1,191 @@ +test_that("global param constructor errors when required", { + # Check the tree when there is parameter dependency + expect_identical(growth_rate_params(df = "1se")$df, "1se") + expect_false(growth_rate_params(df = 10, cv = FALSE)$cv) + expect_identical(growth_rate_params(df = 10L)$df, 10L) + expect_snapshot(error = TRUE, growth_rate_params(df = -5)) + + # Make sure that assert_number is len 1 + expect_identical(growth_rate_params(nlambda = 5L)$nlambda, 5L) + expect_snapshot(error = TRUE, growth_rate_params(nlambda = 5:8)) +}) + +test_that("new setup args and warnings are as expected", { + # NaN in log calculation + expect_snapshot(error = TRUE, growth_rate(y = -10:10, log_scale = TRUE)) + expect_snapshot(error = TRUE, growth_rate(y = -10:10, log_scale = TRUE, method = "smooth_spline")) + + # NAs in x or y are removed + expect_length(growth_rate(y = c(1:20, NA, 22:30)), 30L) + expect_length(growth_rate(y = c(1:20, NA, 22:30), na_rm = TRUE), 30L) + expect_snapshot(error = TRUE, growth_rate(y = 1:30, x = c(1:20, NA, 22:30), na_rm = TRUE)) + + # splines and trendfilter error on NAs + expect_length(growth_rate(y = c(1:20, NA, 22:30), method = "smooth_spline"), 30L) + expect_length(growth_rate(y = c(1:20, NA, 22:30), method = "trend_filter"), 30L) + expect_warning(growth_rate(y = c(1:20, -5, 22:30), log_scale = TRUE, method = "smooth_spline")) + expect_warning(growth_rate(y = c(1:20, -5, 22:30), log_scale = TRUE, method = "trend_filter")) + + # splines with multiple lambdas + expect_snapshot( + error = TRUE, + growth_rate( + y = 1:20, method = "smooth_spline", + params = growth_rate_params(lambda = 1:20) + ) + ) + + # other spline args give output (correctness not checked) + z <- rnorm(30) + expect_length(growth_rate(y = z, method = "smooth_spline"), 30L) + expect_length(growth_rate( + y = z, + method = "smooth_spline", params = growth_rate_params(spar = .5) + ), 30L) + expect_length(growth_rate( + y = z, + method = "smooth_spline", params = growth_rate_params(lambda = 10) + ), 30L) + expect_length(growth_rate( + y = z, + method = "smooth_spline", params = growth_rate_params(df = 14) + ), 30L) + expect_length(growth_rate( + y = z, + method = "smooth_spline", params = growth_rate_params(cv = TRUE) + ), 30L) +}) + +test_that("parser sees all cases", { + skip_if_not_installed("trendfilter", "0.0.2") + # 18 total cases + # lambda in {NULL, scalar, vector} + # df in {NULL, character, numeric} + # cv in {T/F} + + grab_l <- function(l) list(cv = l$cv, df = l$df, lambda = l$lambda) + + # CV TRUE + l <- growth_rate_params(cv = TRUE) + expect_identical( + grab_l(parse_trendfilter_params(l)), + list(cv = TRUE, df = "min", lambda = NULL) + ) + l <- growth_rate_params(cv = TRUE, df = "1se") + expect_identical( + grab_l(parse_trendfilter_params(l)), + list(cv = TRUE, df = "1se", lambda = NULL) + ) + l <- growth_rate_params(cv = TRUE, df = "min", lambda = 1:5) + expect_identical( + grab_l(parse_trendfilter_params(l)), + list(cv = TRUE, df = "min", lambda = 1:5) + ) + l <- growth_rate_params(cv = TRUE, lambda = 1:5) + expect_identical( + grab_l(parse_trendfilter_params(l)), + list(cv = TRUE, df = "min", lambda = 1:5) + ) + l <- growth_rate_params(cv = TRUE, lambda = 1) + expect_snapshot(error = TRUE, parse_trendfilter_params(l)) + l <- growth_rate_params(cv = TRUE, df = 1) + expect_snapshot(error = TRUE, parse_trendfilter_params(l)) + l <- growth_rate_params(cv = TRUE, df = 1, lambda = 1) + expect_snapshot(error = TRUE, parse_trendfilter_params(l)) + l <- growth_rate_params(cv = TRUE, df = "min", lambda = 1) + expect_snapshot(error = TRUE, parse_trendfilter_params(l)) + l <- growth_rate_params(cv = TRUE, df = 1, lambda = 1:5) + expect_snapshot(error = TRUE, parse_trendfilter_params(l)) + + # CV = FALSE (the default) + # 5 Cases where we turn CV on + l <- growth_rate_params(df = "1se") + expect_identical( + grab_l(parse_trendfilter_params(l)), + list(cv = TRUE, df = "1se", lambda = NULL) + ) + l <- growth_rate_params(df = "1se", lambda = 1:5) + expect_identical( + grab_l(parse_trendfilter_params(l)), + list(cv = TRUE, df = "1se", lambda = 1:5) + ) + l <- growth_rate_params(lambda = 1:5) + expect_identical( + grab_l(parse_trendfilter_params(l)), + list(cv = TRUE, df = "min", lambda = 1:5) + ) + expect_identical( + grab_l(parse_trendfilter_params(growth_rate_params())), + list(cv = TRUE, df = "min", lambda = NULL) + ) + # 3 cases where CV stays False + l <- growth_rate_params(lambda = 1) + expect_identical( + grab_l(parse_trendfilter_params(l)), + list(cv = FALSE, df = NULL, lambda = 1) + ) + l <- growth_rate_params(df = 5) + expect_identical( + grab_l(parse_trendfilter_params(l)), + list(cv = FALSE, df = 5, lambda = NULL) + ) + l <- growth_rate_params(df = 5, lambda = 1:5) + expect_identical( + grab_l(parse_trendfilter_params(l)), + list(cv = FALSE, df = 5, lambda = 1:5) + ) + + # 2 error cases + l <- growth_rate_params(df = "min", lambda = 1) + expect_snapshot(error = TRUE, parse_trendfilter_params(l)) + l <- growth_rate_params(df = 1, lambda = 1) + expect_snapshot(error = TRUE, parse_trendfilter_params(l)) +}) + +test_that("trendfilter growth_rate implementation", { + skip_if_not_installed("trendfilter", "0.0.2") + + # various tf args give output (correctness not checked) + z <- rnorm(30) + expect_length(growth_rate(y = z, method = "trend_filter"), 30L) + expect_length(growth_rate( + y = z, + method = "trend_filter", params = growth_rate_params(lambda = 10) + ), 30L) + expect_length(growth_rate( + y = z, + method = "trend_filter", params = growth_rate_params(df = 14) + ), 30L) + expect_length(growth_rate( + y = z, + method = "trend_filter", params = growth_rate_params(cv = TRUE) + ), 30L) + expect_length(growth_rate( + y = z, + method = "trend_filter", params = growth_rate_params(k = 3) + ), 30L) + expect_length(growth_rate( + y = z, + method = "trend_filter", params = growth_rate_params(nlambda = 10) + ), 30L) + expect_length(growth_rate( + y = z, + method = "trend_filter", params = growth_rate_params(lambda_max = 10) + ), 30L) + expect_length(growth_rate( + y = z, + method = "trend_filter", params = growth_rate_params(lambda_min = 10) + ), 30L) + expect_length(growth_rate( + y = z, + method = "trend_filter", params = growth_rate_params(lambda_min_ratio = .1) + ), 30L) + expect_length(growth_rate( + y = z, + method = "trend_filter", params = growth_rate_params(error_measure = "mse") + ), 30L) + expect_length(growth_rate( + y = z, + method = "trend_filter", params = growth_rate_params(nfolds = 3) + ), 30L) +}) diff --git a/tests/testthat/test-key_colnames.R b/tests/testthat/test-key_colnames.R index f112de8c..bd873ecc 100644 --- a/tests/testthat/test-key_colnames.R +++ b/tests/testthat/test-key_colnames.R @@ -85,11 +85,19 @@ test_that("`key_colnames` on `epi_df`s and similar tibbles works as expected", { # We can exclude keys: expect_equal( - key_colnames(gat_tbl, geo_keys = "geo_value", other_keys = "age_group", time_keys = "time_value", exclude = c("time_value")), + key_colnames( + gat_tbl, + geo_keys = "geo_value", other_keys = "age_group", time_keys = "time_value", + exclude = c("time_value") + ), c("geo_value", "age_group") ) expect_equal( - key_colnames(gat_tbl, geo_keys = "geo_value", other_keys = "age_group", time_keys = "time_value", exclude = c("geo_value", "time_value")), + key_colnames( + gat_tbl, + geo_keys = "geo_value", other_keys = "age_group", time_keys = "time_value", + exclude = c("geo_value", "time_value") + ), c("age_group") ) expect_equal( @@ -103,14 +111,22 @@ test_that("`key_colnames` on `epi_df`s and similar tibbles works as expected", { # Using `extra_keys =` is soft-deprecated and routes to `other_keys =`: expect_warning( - gat_tbl_extra_keys_res <- key_colnames(gat_tbl, geo_keys = "geo_value", time_keys = "time_value", extra_keys = "age_group"), + gat_tbl_extra_keys_res <- key_colnames( + gat_tbl, + geo_keys = "geo_value", time_keys = "time_value", + extra_keys = "age_group" + ), class = "lifecycle_warning_deprecated" ) expect_equal(gat_tbl_extra_keys_res, c("geo_value", "age_group", "time_value")) expect_warning( gat_edf_extra_keys_exclude_res <- - key_colnames(gat_edf, extra_keys = "age_group", exclude = c("geo_value", "time_value")), + key_colnames( + gat_edf, + extra_keys = "age_group", + exclude = c("geo_value", "time_value") + ), class = "lifecycle_warning_deprecated" ) expect_equal(gat_edf_extra_keys_exclude_res, c("age_group")) diff --git a/vignettes/growth_rate.Rmd b/vignettes/growth_rate.Rmd index ec073876..67a75903 100644 --- a/vignettes/growth_rate.Rmd +++ b/vignettes/growth_rate.Rmd @@ -82,7 +82,7 @@ the following methods for estimating the growth rate at a given reference point of the spline at `x0`. * "trend_filter": uses the estimated derivative at `x0` from polynomial trend filtering (a discrete spline) fit to `x` and `y`, via - `genlasso::trendfilter()`, divided by the fitted value of the discrete spline + `trendfilter::trendfilter()`, divided by the fitted value of the discrete spline at `x0`. The default in `growth_rate()` is `x0 = x`, so that it returns an estimate of @@ -99,7 +99,7 @@ the computed growth rates. ```{r} x <- x %>% group_by(geo_value) %>% - mutate(cases_gr1 = growth_rate(time_value, cases)) + mutate(cases_gr1 = growth_rate(cases)) head(x, 10) ``` @@ -162,7 +162,7 @@ but thankfully avoids some of the troublesome spikes: ```{r, message = FALSE, warning = FALSE, fig.width = 9, fig.height = 7} x <- x %>% group_by(geo_value) %>% - mutate(cases_gr2 = growth_rate(time_value, cases, method = "linear_reg")) + mutate(cases_gr2 = growth_rate(cases, method = "linear_reg")) x %>% pivot_longer( @@ -189,17 +189,20 @@ We can also use a nonparametric method to estimate the derivative, through computationally expensive, but it is also able to adapt better to the local level of smoothness. (The apparent efficiency is actually compounded by the particular implementations and default settings for these methods: -"trend_filter" is based on a full solution path algorithm provided in the -`genlasso` package, and performs cross-validation by default in order to pick -the level of regularization; read the documentation for `growth_rate()` more +"trend_filter" is based on a sequence of solutions provided in the +`trendfilter` package, and performs cross-validation by default in order to pick +the level of regularization; read the documentation for `growth_rate()` for more details.) +Note: The `trendfilter` package is not automatically installed with `epiprocess`. +To install it from GitHub, you can use `pak::pkg_install("glmgen/trendfilter")`. + ```{r, message = FALSE, warning = FALSE, fig.width = 9, fig.height = 7} x <- x %>% group_by(geo_value) %>% mutate( - cases_gr3 = growth_rate(time_value, cases, method = "smooth_spline"), - cases_gr4 = growth_rate(time_value, cases, method = "trend_filter") + cases_gr3 = growth_rate(cases, method = "smooth_spline"), + cases_gr4 = growth_rate(cases, method = "trend_filter") ) x %>% @@ -228,9 +231,9 @@ stable than the estimates from local relative changes and linear regressions. The smoothing spline growth rate estimates are based on the default settings in `stats::smooth.spline()`, and appear severely under-regularized here. Any of the arguments to `stats::smooth.spline()` can be customized by passing them as -additional arguments `...` in the call to `growth_rate()`; similarly, we can +additional arguments to `growth_rate_params()`; similarly, we can also use additional arguments to customize the settings in the underlying trend -filtering functions `genlasso::trendfilter()`, `genlasso::cv.trendfilter()`, and +filtering functions `trendfilter::trendfilter()`, `trendfilter::cv_trendfilter()`, and the documentation for `growth_rate()` gives the full details. ## Log scale estimation @@ -247,22 +250,10 @@ the call to `growth_rate()`. x <- x %>% group_by(geo_value) %>% mutate( - cases_gr5 = growth_rate(time_value, cases, - method = "rel_change", - log_scale = TRUE - ), - cases_gr6 = growth_rate(time_value, cases, - method = "linear_reg", - log_scale = TRUE - ), - cases_gr7 = growth_rate(time_value, cases, - method = "smooth_spline", - log_scale = TRUE - ), - cases_gr8 = growth_rate(time_value, cases, - method = "trend_filter", - log_scale = TRUE - ) + cases_gr5 = growth_rate(cases, method = "rel_change", log_scale = TRUE), + cases_gr6 = growth_rate(cases, method = "linear_reg", log_scale = TRUE), + cases_gr7 = growth_rate(cases, method = "smooth_spline", log_scale = TRUE), + cases_gr8 = growth_rate(cases, method = "trend_filter", log_scale = TRUE) ) x %>%