From b8d06a0114ccad58c377477f56a6c5d313731fe4 Mon Sep 17 00:00:00 2001 From: ChloeYou Date: Thu, 15 Sep 2022 19:35:45 -0700 Subject: [PATCH 1/2] make adjustments based on Daniels comments --- R/step_population_scaling.R | 70 +++++++++++++++--------- man/step_population_scaling.Rd | 52 +++++++++++------- tests/testthat/test-population_scaling.R | 59 ++++++++++++++++++++ 3 files changed, 135 insertions(+), 46 deletions(-) diff --git a/R/step_population_scaling.R b/R/step_population_scaling.R index 51a66abdc..3e558ae8a 100644 --- a/R/step_population_scaling.R +++ b/R/step_population_scaling.R @@ -1,12 +1,15 @@ -#' Create a recipe step that scales variables using population data +#' Convert raw scale predictions to per-capita #' #' `step_population_scaling` creates a specification of a recipe step -#' that will add a population scaled column in the data. For example, -#' load a dataset that contains county population, and join to an `epi_df` -#' that currently only contains number of new cases by county. Once scaled, -#' predictions can be made on case rate. Although worth noting that there is -#' nothing special about "population". The function can be used to scale by any -#' variable. Population is simply the most natural and common use case. +#' that will perform per-capita scaling. Typical usage would +#' load a dataset that contains state-level population, and use it to convert +#' predictions made from a raw scale model to rate-scale by dividing by +#' the population. +#' Although, it is worth noting that there is nothing special about "population". +#' The function can be used to scale by any variable. Population is the +#' standard use case in the epidemiology forecasting scenario. Any value +#' passed will *divide* the selected variables while the `rate_rescaling` +#' argument is a common *multiplier* of the selected variables. #' #' @param recipe A recipe object. The step will be added to the sequence of #' operations for this recipe. The recipe should contain information about the @@ -19,25 +22,30 @@ #' be ard are not limited to "outcome". #' @param trained A logical to indicate if the quantities for preprocessing #' have been estimated. -#' @param df a data frame that contains the population data used for scaling. -#' @param by A character vector of variables to left join by. +#' @param df a data frame that contains the population data to be used for +#' inverting the existing scaling. +#' @param by A (possibly named) character vector of variables to join by. #' #' If `NULL`, the default, the function will perform a natural join, using all -#' variables in common across the `epi_df` and the user-provided dataset. -#' If columns in `epi_df` and `df` have the same name (and aren't -#' included in by), `.df` is added to the one from the user-provided data +#' variables in common across the `epi_df` produced by the `predict()` call +#' and the user-provided dataset. +#' If columns in that `epi_df` and `df` have the same name (and aren't +#' included in `by`), `.df` is added to the one from the user-provided data #' to disambiguate. #' #' To join by different variables on the `epi_df` and `df`, use a named vector. -#' For example, by = c("geo_value" = "states") will match `epi_df$geo_value` +#' For example, `by = c("geo_value" = "states")` will match `epi_df$geo_value` #' to `df$states`. To join by multiple variables, use a vector with length > 1. -#' For example, by = c("geo_value" = "states", "county" = "county") will match +#' For example, `by = c("geo_value" = "states", "county" = "county")` will match #' `epi_df$geo_value` to `df$states` and `epi_df$county` to `df$county`. #' #' See [dplyr::left_join()] for more details. #' @param df_pop_col the name of the column in the data frame `df` that #' contains the population data and will be used for scaling. #' This should be one column. +#' @param rate_rescaling Sometimes raw scales are "per 100K" or "per 1M". +#' Adjustments can be made here. For example, if the original +#' scale is "per 100K", then set `rate_rescaling = 1e5` to get rates. #' @param create_new TRUE to create a new column and keep the original column #' in the `epi_df` #' @param suffix a character. The suffix added to the column name if @@ -61,8 +69,7 @@ #' dplyr::filter(time_value > "2021-11-01", geo_value %in% c("ca", "ny")) %>% #' dplyr::select(geo_value, time_value, cases) #' -#' pop_data = data.frame(states = c("ca", "ny"), -#' value = c(20000, 30000)) +#' pop_data = data.frame(states = c("ca", "ny"), value = c(20000, 30000)) #' #' r <- epi_recipe(jhu) %>% #' step_population_scaling(df = pop_data, @@ -86,11 +93,12 @@ #' parsnip::fit(jhu) %>% #' add_frosting(f) #' -#' latest <- get_test_data(recipe = r, -#' x = epiprocess::jhu_csse_daily_subset %>% -#' dplyr::filter(time_value > "2021-11-01", -#' geo_value %in% c("ca", "ny")) %>% -#' dplyr::select(geo_value, time_value, cases)) +#' latest <- get_test_data( +#' recipe = r, +#' x = epiprocess::jhu_csse_daily_subset %>% +#' dplyr::filter(time_value > "2021-11-01", +#' geo_value %in% c("ca", "ny")) %>% +#' dplyr::select(geo_value, time_value, cases)) #' #' #' predict(wf, latest) @@ -102,11 +110,19 @@ step_population_scaling <- df, by = NULL, df_pop_col, + rate_rescaling = 1, create_new = TRUE, suffix = "_scaled", columns = NULL, skip = FALSE, id = rand_id("population_scaling")){ + arg_is_scalar(role, trained, df_pop_col, rate_rescaling, create_new, suffix, id) + arg_is_lgl(create_new, skip) + arg_is_chr(df_pop_col, suffix, id) + arg_is_chr(by, columns, allow_null = TRUE) + if (rate_rescaling <= 0) + cli_stop("`rate_rescaling` should be a positive number") + add_step( recipe, step_population_scaling_new( @@ -116,6 +132,7 @@ step_population_scaling <- df = df, by = by, df_pop_col = df_pop_col, + rate_rescaling = rate_rescaling, create_new = create_new, suffix = suffix, columns = columns, @@ -126,7 +143,7 @@ step_population_scaling <- } step_population_scaling_new <- - function(role, trained, df, by, df_pop_col, terms, create_new, + function(role, trained, df, by, df_pop_col, rate_rescaling, terms, create_new, suffix, columns, skip, id) { step( subclass = "population_scaling", @@ -136,6 +153,7 @@ step_population_scaling_new <- df = df, by = by, df_pop_col = df_pop_col, + rate_rescaling = rate_rescaling, create_new = create_new, suffix = suffix, columns = columns, @@ -153,6 +171,7 @@ prep.step_population_scaling <- function(x, training, info = NULL, ...) { df = x$df, by = x$by, df_pop_col = x$df_pop_col, + rate_rescaling = x$rate_rescaling, create_new = x$create_new, suffix = x$suffix, columns = recipes_eval_select(x$terms, training, info), @@ -172,8 +191,9 @@ bake.step_population_scaling <- function(object, try_join <- try(dplyr::left_join(new_data, object$df, by= object$by), silent = TRUE) - if (any(grepl("Join columns must be present in data", unlist(try_join)))){ - stop("columns in `by` selectors of `step_population_scaling` must be present in data and match")} + if (any(grepl("Join columns must be present in data", unlist(try_join)))) { + cli_stop(c("columns in `by` selectors of `layer_population_scaling` ", + "must be present in data and match"))} if(object$suffix != "_scaled" && object$create_new == FALSE){ message("`suffix` not used to generate new column in `step_population_scaling`") @@ -194,7 +214,7 @@ bake.step_population_scaling <- function(object, dplyr::mutate( dplyr::across( dplyr::all_of(object$columns), - ~.x/!!pop_col , + ~.x * object$rate_rescaling /!!pop_col , .names = "{.col}{suffix}")) %>% # removed so the models do not use the population column dplyr::select(- !!pop_col) diff --git a/man/step_population_scaling.Rd b/man/step_population_scaling.Rd index 3d1fcd193..8dd800034 100644 --- a/man/step_population_scaling.Rd +++ b/man/step_population_scaling.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/step_population_scaling.R \name{step_population_scaling} \alias{step_population_scaling} -\title{Create a recipe step that scales variables using population data} +\title{Convert raw scale predictions to per-capita} \usage{ step_population_scaling( recipe, @@ -12,6 +12,7 @@ step_population_scaling( df, by = NULL, df_pop_col, + rate_rescaling = 1, create_new = TRUE, suffix = "_scaled", columns = NULL, @@ -35,20 +36,22 @@ be ard are not limited to "outcome".} \item{trained}{A logical to indicate if the quantities for preprocessing have been estimated.} -\item{df}{a data frame that contains the population data used for scaling.} +\item{df}{a data frame that contains the population data to be used for +inverting the existing scaling.} -\item{by}{A character vector of variables to left join by. +\item{by}{A (possibly named) character vector of variables to join by. If \code{NULL}, the default, the function will perform a natural join, using all -variables in common across the \code{epi_df} and the user-provided dataset. -If columns in \code{epi_df} and \code{df} have the same name (and aren't -included in by), \code{.df} is added to the one from the user-provided data +variables in common across the \code{epi_df} produced by the \code{predict()} call +and the user-provided dataset. +If columns in that \code{epi_df} and \code{df} have the same name (and aren't +included in \code{by}), \code{.df} is added to the one from the user-provided data to disambiguate. To join by different variables on the \code{epi_df} and \code{df}, use a named vector. -For example, by = c("geo_value" = "states") will match \code{epi_df$geo_value} +For example, \code{by = c("geo_value" = "states")} will match \code{epi_df$geo_value} to \code{df$states}. To join by multiple variables, use a vector with length > 1. -For example, by = c("geo_value" = "states", "county" = "county") will match +For example, \code{by = c("geo_value" = "states", "county" = "county")} will match \code{epi_df$geo_value} to \code{df$states} and \code{epi_df$county} to \code{df$county}. See \code{\link[dplyr:mutate-joins]{dplyr::left_join()}} for more details.} @@ -57,6 +60,10 @@ See \code{\link[dplyr:mutate-joins]{dplyr::left_join()}} for more details.} contains the population data and will be used for scaling. This should be one column.} +\item{rate_rescaling}{Sometimes raw scales are "per 100K" or "per 1M". +Adjustments can be made here. For example, if the original +scale is "per 100K", then set \code{rate_rescaling = 1e5} to get rates.} + \item{create_new}{TRUE to create a new column and keep the original column in the \code{epi_df}} @@ -80,12 +87,15 @@ Scales raw data by the population } \description{ \code{step_population_scaling} creates a specification of a recipe step -that will add a population scaled column in the data. For example, -load a dataset that contains county population, and join to an \code{epi_df} -that currently only contains number of new cases by county. Once scaled, -predictions can be made on case rate. Although worth noting that there is -nothing special about "population". The function can be used to scale by any -variable. Population is simply the most natural and common use case. +that will perform per-capita scaling. Typical usage would +load a dataset that contains state-level population, and use it to convert +predictions made from a raw scale model to rate-scale by dividing by +the population. +Although, it is worth noting that there is nothing special about "population". +The function can be used to scale by any variable. Population is the +standard use case in the epidemiology forecasting scenario. Any value +passed will \emph{divide} the selected variables while the \code{rate_rescaling} +argument is a common \emph{multiplier} of the selected variables. } \examples{ library(epiprocess) @@ -94,8 +104,7 @@ jhu <- epiprocess::jhu_csse_daily_subset \%>\% dplyr::filter(time_value > "2021-11-01", geo_value \%in\% c("ca", "ny")) \%>\% dplyr::select(geo_value, time_value, cases) -pop_data = data.frame(states = c("ca", "ny"), - value = c(20000, 30000)) +pop_data = data.frame(states = c("ca", "ny"), value = c(20000, 30000)) r <- epi_recipe(jhu) \%>\% step_population_scaling(df = pop_data, @@ -119,11 +128,12 @@ wf <- epi_workflow(r, parsnip::fit(jhu) \%>\% add_frosting(f) -latest <- get_test_data(recipe = r, - x = epiprocess::jhu_csse_daily_subset \%>\% - dplyr::filter(time_value > "2021-11-01", - geo_value \%in\% c("ca", "ny")) \%>\% - dplyr::select(geo_value, time_value, cases)) +latest <- get_test_data( + recipe = r, + x = epiprocess::jhu_csse_daily_subset \%>\% + dplyr::filter(time_value > "2021-11-01", + geo_value \%in\% c("ca", "ny")) \%>\% + dplyr::select(geo_value, time_value, cases)) predict(wf, latest) diff --git a/tests/testthat/test-population_scaling.R b/tests/testthat/test-population_scaling.R index 5b805b33b..5ccbf9660 100644 --- a/tests/testthat/test-population_scaling.R +++ b/tests/testthat/test-population_scaling.R @@ -295,3 +295,62 @@ test_that("expect error if `by` selector does not match", { ) }) + +test_that("Rate rescaling behaves as expected", { + x <- tibble(geo_value = rep("place",50), + time_value = as.Date("2021-01-01") + 0:49, + case_rate = rep(0.0005, 50), + cases = rep(5000, 50)) %>% + as_epi_df() + + reverse_pop_data = data.frame(states = c("place"), + value = c(1/1000)) + + r <- epi_recipe(x) %>% + step_population_scaling(df = reverse_pop_data, + df_pop_col = "value", + rate_rescaling = 100, # cases per 100 + by = c("geo_value" = "states"), + case_rate, suffix = "_scaled") + + expect_equal(unique(bake(prep(r,x),x)$case_rate_scaled), + 0.0005*(1/1000)*100) # done testing step_* + + f <- frosting() %>% + layer_population_scaling(.pred, df = reverse_pop_data, + rate_rescaling = 100, # revert back to case rate per 100 + by = c("geo_value" = "states"), + df_pop_col = "value") + + x <- tibble(geo_value = rep("place",50), + time_value = as.Date("2021-01-01") + 0:49, + case_rate = rep(0.0005, 50)) %>% + as_epi_df() + + r <- epi_recipe(x) %>% + step_epi_lag(case_rate, lag = c(7, 14)) %>% # cases + step_epi_ahead(case_rate, ahead = 7, role = "outcome") %>% # cases + step_naomit(all_predictors()) %>% + step_naomit(all_outcomes(), skip = TRUE) + + f <- frosting() %>% + layer_predict() %>% + layer_threshold(.pred) %>% + layer_naomit(.pred) %>% + layer_population_scaling(.pred, df = reverse_pop_data, + rate_rescaling = 100, # revert back to case rate per 100 + by = c("geo_value" = "states"), + df_pop_col = "value") + + wf <- epi_workflow(r, parsnip::linear_reg()) %>% + fit(x) %>% + add_frosting(f) + + latest <- get_test_data(recipe = r, x = x) + + # suppress warning: prediction from a rank-deficient fit may be misleading + suppressWarnings(expect_equal(unique(predict(wf, latest)$.pred)*1000/100, + unique(predict(wf, latest)$.pred_scaled))) +}) + + From 8940f6d4be0a77fbabafa5954f21cfd2a9401de1 Mon Sep 17 00:00:00 2001 From: ChloeYou Date: Fri, 16 Sep 2022 18:22:49 -0700 Subject: [PATCH 2/2] fix error --- R/step_population_scaling.R | 2 +- tests/testthat/test-population_scaling.R | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/R/step_population_scaling.R b/R/step_population_scaling.R index 3e558ae8a..b2f9d2899 100644 --- a/R/step_population_scaling.R +++ b/R/step_population_scaling.R @@ -192,7 +192,7 @@ bake.step_population_scaling <- function(object, by= object$by), silent = TRUE) if (any(grepl("Join columns must be present in data", unlist(try_join)))) { - cli_stop(c("columns in `by` selectors of `layer_population_scaling` ", + cli_stop(c("columns in `by` selectors of `step_population_scaling` ", "must be present in data and match"))} if(object$suffix != "_scaled" && object$create_new == FALSE){ diff --git a/tests/testthat/test-population_scaling.R b/tests/testthat/test-population_scaling.R index 5ccbf9660..6d32fb9b2 100644 --- a/tests/testthat/test-population_scaling.R +++ b/tests/testthat/test-population_scaling.R @@ -314,7 +314,7 @@ test_that("Rate rescaling behaves as expected", { case_rate, suffix = "_scaled") expect_equal(unique(bake(prep(r,x),x)$case_rate_scaled), - 0.0005*(1/1000)*100) # done testing step_* + 0.0005*100/(1/1000)) # done testing step_* f <- frosting() %>% layer_population_scaling(.pred, df = reverse_pop_data, @@ -349,7 +349,7 @@ test_that("Rate rescaling behaves as expected", { latest <- get_test_data(recipe = r, x = x) # suppress warning: prediction from a rank-deficient fit may be misleading - suppressWarnings(expect_equal(unique(predict(wf, latest)$.pred)*1000/100, + suppressWarnings(expect_equal(unique(predict(wf, latest)$.pred)*(1/1000)/100, unique(predict(wf, latest)$.pred_scaled))) })