From b8d06a0114ccad58c377477f56a6c5d313731fe4 Mon Sep 17 00:00:00 2001
From: ChloeYou <chloeyxy@outlook.com>
Date: Thu, 15 Sep 2022 19:35:45 -0700
Subject: [PATCH 1/2] make adjustments based on Daniels comments

---
 R/step_population_scaling.R              | 70 +++++++++++++++---------
 man/step_population_scaling.Rd           | 52 +++++++++++-------
 tests/testthat/test-population_scaling.R | 59 ++++++++++++++++++++
 3 files changed, 135 insertions(+), 46 deletions(-)

diff --git a/R/step_population_scaling.R b/R/step_population_scaling.R
index 51a66abdc..3e558ae8a 100644
--- a/R/step_population_scaling.R
+++ b/R/step_population_scaling.R
@@ -1,12 +1,15 @@
-#' Create a recipe step that scales variables using population data
+#' Convert raw scale predictions to per-capita
 #'
 #' `step_population_scaling` creates a specification of a recipe step
-#' that will add a population scaled column in the data. For example,
-#' load a dataset that contains county population, and join to an `epi_df`
-#' that currently only contains number of new cases by county. Once scaled,
-#' predictions can be made on case rate. Although worth noting that there is
-#' nothing special about "population". The function can be used to scale by any
-#' variable. Population is simply the most natural and common use case.
+#' that will perform per-capita scaling. Typical usage would
+#' load a dataset that contains state-level population, and use it to convert
+#' predictions made from a raw scale model to rate-scale by dividing by
+#' the population.
+#' Although, it is worth noting that there is nothing special about "population".
+#' The function can be used to scale by any variable. Population is the
+#' standard use case in the epidemiology forecasting scenario. Any value
+#' passed will *divide* the selected variables while the `rate_rescaling`
+#' argument is a common *multiplier* of the selected variables.
 #'
 #' @param recipe A recipe object. The step will be added to the sequence of
 #' operations for this recipe. The recipe should contain information about the
@@ -19,25 +22,30 @@
 #' be ard are not limited to "outcome".
 #' @param trained A logical to indicate if the quantities for preprocessing
 #' have been estimated.
-#' @param df a data frame that contains the population data used for scaling.
-#' @param by A character vector of variables to left join by.
+#' @param df a data frame that contains the population data to be used for
+#'   inverting the existing scaling.
+#' @param by A (possibly named) character vector of variables to join by.
 #'
 #' If `NULL`, the default, the function will perform a natural join, using all
-#' variables in common across the `epi_df` and the user-provided dataset.
-#' If columns in `epi_df` and `df` have the same name (and aren't
-#' included in by), `.df` is added to the one from the user-provided data
+#' variables in common across the `epi_df` produced by the `predict()` call
+#' and the user-provided dataset.
+#' If columns in that `epi_df` and `df` have the same name (and aren't
+#' included in `by`), `.df` is added to the one from the user-provided data
 #' to disambiguate.
 #'
 #' To join by different variables on the `epi_df` and `df`, use a named vector.
-#' For example, by = c("geo_value" = "states") will match `epi_df$geo_value`
+#' For example, `by = c("geo_value" = "states")` will match `epi_df$geo_value`
 #' to `df$states`. To join by multiple variables, use a vector with length > 1.
-#' For example, by = c("geo_value" = "states", "county" = "county") will match
+#' For example, `by = c("geo_value" = "states", "county" = "county")` will match
 #' `epi_df$geo_value` to `df$states` and `epi_df$county` to `df$county`.
 #'
 #' See [dplyr::left_join()] for more details.
 #' @param df_pop_col the name of the column in the data frame `df` that
 #' contains the population data and will be used for scaling.
 #' This should be one column.
+#' @param rate_rescaling Sometimes raw scales are "per 100K" or "per 1M".
+#' Adjustments can be made here. For example, if the original
+#' scale is "per 100K", then set `rate_rescaling = 1e5` to get rates.
 #' @param create_new TRUE to create a new column and keep the original column
 #' in the `epi_df`
 #' @param suffix a character. The suffix added to the column name if
@@ -61,8 +69,7 @@
 #'   dplyr::filter(time_value > "2021-11-01", geo_value %in% c("ca", "ny")) %>%
 #'   dplyr::select(geo_value, time_value, cases)
 #'
-#' pop_data = data.frame(states = c("ca", "ny"),
-#'                       value = c(20000, 30000))
+#' pop_data = data.frame(states = c("ca", "ny"), value = c(20000, 30000))
 #'
 #' r <- epi_recipe(jhu) %>%
 #'   step_population_scaling(df = pop_data,
@@ -86,11 +93,12 @@
 #'   parsnip::fit(jhu) %>%
 #'   add_frosting(f)
 #'
-#' latest <- get_test_data(recipe = r,
-#'                         x = epiprocess::jhu_csse_daily_subset %>%
-#'                           dplyr::filter(time_value > "2021-11-01",
-#'                                         geo_value %in% c("ca", "ny")) %>%
-#'                           dplyr::select(geo_value, time_value, cases))
+#' latest <- get_test_data(
+#'    recipe = r,
+#'    x = epiprocess::jhu_csse_daily_subset %>%
+#'    dplyr::filter(time_value > "2021-11-01",
+#'    geo_value %in% c("ca", "ny")) %>%
+#'    dplyr::select(geo_value, time_value, cases))
 #'
 #'
 #' predict(wf, latest)
@@ -102,11 +110,19 @@ step_population_scaling <-
           df,
           by = NULL,
           df_pop_col,
+          rate_rescaling = 1,
           create_new = TRUE,
           suffix = "_scaled",
           columns = NULL,
           skip = FALSE,
           id = rand_id("population_scaling")){
+  arg_is_scalar(role, trained, df_pop_col, rate_rescaling, create_new, suffix, id)
+  arg_is_lgl(create_new, skip)
+  arg_is_chr(df_pop_col, suffix, id)
+  arg_is_chr(by, columns, allow_null = TRUE)
+  if (rate_rescaling <= 0)
+    cli_stop("`rate_rescaling` should be a positive number")
+
   add_step(
     recipe,
     step_population_scaling_new(
@@ -116,6 +132,7 @@ step_population_scaling <-
       df = df,
       by = by,
       df_pop_col = df_pop_col,
+      rate_rescaling = rate_rescaling,
       create_new = create_new,
       suffix = suffix,
       columns = columns,
@@ -126,7 +143,7 @@ step_population_scaling <-
 }
 
 step_population_scaling_new <-
-  function(role, trained, df, by, df_pop_col, terms, create_new,
+  function(role, trained, df, by, df_pop_col, rate_rescaling, terms, create_new,
            suffix, columns, skip, id) {
     step(
       subclass = "population_scaling",
@@ -136,6 +153,7 @@ step_population_scaling_new <-
       df = df,
       by = by,
       df_pop_col = df_pop_col,
+      rate_rescaling = rate_rescaling,
       create_new = create_new,
       suffix = suffix,
       columns = columns,
@@ -153,6 +171,7 @@ prep.step_population_scaling <- function(x, training, info = NULL, ...) {
     df = x$df,
     by = x$by,
     df_pop_col = x$df_pop_col,
+    rate_rescaling = x$rate_rescaling,
     create_new = x$create_new,
     suffix = x$suffix,
     columns = recipes_eval_select(x$terms, training, info),
@@ -172,8 +191,9 @@ bake.step_population_scaling <- function(object,
   try_join <- try(dplyr::left_join(new_data, object$df,
                             by= object$by),
                 silent = TRUE)
-  if (any(grepl("Join columns must be present in data", unlist(try_join)))){
-      stop("columns in `by` selectors of `step_population_scaling` must be present in data and match")}
+  if (any(grepl("Join columns must be present in data", unlist(try_join)))) {
+    cli_stop(c("columns in `by` selectors of `layer_population_scaling` ",
+               "must be present in data and match"))}
 
   if(object$suffix != "_scaled" && object$create_new == FALSE){
     message("`suffix` not used to generate new column in `step_population_scaling`")
@@ -194,7 +214,7 @@ bake.step_population_scaling <- function(object,
     dplyr::mutate(
       dplyr::across(
         dplyr::all_of(object$columns),
-        ~.x/!!pop_col ,
+        ~.x * object$rate_rescaling /!!pop_col ,
         .names = "{.col}{suffix}")) %>%
     # removed so the models do not use the population column
    dplyr::select(- !!pop_col)
diff --git a/man/step_population_scaling.Rd b/man/step_population_scaling.Rd
index 3d1fcd193..8dd800034 100644
--- a/man/step_population_scaling.Rd
+++ b/man/step_population_scaling.Rd
@@ -2,7 +2,7 @@
 % Please edit documentation in R/step_population_scaling.R
 \name{step_population_scaling}
 \alias{step_population_scaling}
-\title{Create a recipe step that scales variables using population data}
+\title{Convert raw scale predictions to per-capita}
 \usage{
 step_population_scaling(
   recipe,
@@ -12,6 +12,7 @@ step_population_scaling(
   df,
   by = NULL,
   df_pop_col,
+  rate_rescaling = 1,
   create_new = TRUE,
   suffix = "_scaled",
   columns = NULL,
@@ -35,20 +36,22 @@ be ard are not limited to "outcome".}
 \item{trained}{A logical to indicate if the quantities for preprocessing
 have been estimated.}
 
-\item{df}{a data frame that contains the population data used for scaling.}
+\item{df}{a data frame that contains the population data to be used for
+inverting the existing scaling.}
 
-\item{by}{A character vector of variables to left join by.
+\item{by}{A (possibly named) character vector of variables to join by.
 
 If \code{NULL}, the default, the function will perform a natural join, using all
-variables in common across the \code{epi_df} and the user-provided dataset.
-If columns in \code{epi_df} and \code{df} have the same name (and aren't
-included in by), \code{.df} is added to the one from the user-provided data
+variables in common across the \code{epi_df} produced by the \code{predict()} call
+and the user-provided dataset.
+If columns in that \code{epi_df} and \code{df} have the same name (and aren't
+included in \code{by}), \code{.df} is added to the one from the user-provided data
 to disambiguate.
 
 To join by different variables on the \code{epi_df} and \code{df}, use a named vector.
-For example, by = c("geo_value" = "states") will match \code{epi_df$geo_value}
+For example, \code{by = c("geo_value" = "states")} will match \code{epi_df$geo_value}
 to \code{df$states}. To join by multiple variables, use a vector with length > 1.
-For example, by = c("geo_value" = "states", "county" = "county") will match
+For example, \code{by = c("geo_value" = "states", "county" = "county")} will match
 \code{epi_df$geo_value} to \code{df$states} and \code{epi_df$county} to \code{df$county}.
 
 See \code{\link[dplyr:mutate-joins]{dplyr::left_join()}} for more details.}
@@ -57,6 +60,10 @@ See \code{\link[dplyr:mutate-joins]{dplyr::left_join()}} for more details.}
 contains the population data and will be used for scaling.
 This should be one column.}
 
+\item{rate_rescaling}{Sometimes raw scales are "per 100K" or "per 1M".
+Adjustments can be made here. For example, if the original
+scale is "per 100K", then set \code{rate_rescaling = 1e5} to get rates.}
+
 \item{create_new}{TRUE to create a new column and keep the original column
 in the \code{epi_df}}
 
@@ -80,12 +87,15 @@ Scales raw data by the population
 }
 \description{
 \code{step_population_scaling} creates a specification of a recipe step
-that will add a population scaled column in the data. For example,
-load a dataset that contains county population, and join to an \code{epi_df}
-that currently only contains number of new cases by county. Once scaled,
-predictions can be made on case rate. Although worth noting that there is
-nothing special about "population". The function can be used to scale by any
-variable. Population is simply the most natural and common use case.
+that will perform per-capita scaling. Typical usage would
+load a dataset that contains state-level population, and use it to convert
+predictions made from a raw scale model to rate-scale by dividing by
+the population.
+Although, it is worth noting that there is nothing special about "population".
+The function can be used to scale by any variable. Population is the
+standard use case in the epidemiology forecasting scenario. Any value
+passed will \emph{divide} the selected variables while the \code{rate_rescaling}
+argument is a common \emph{multiplier} of the selected variables.
 }
 \examples{
 library(epiprocess)
@@ -94,8 +104,7 @@ jhu <- epiprocess::jhu_csse_daily_subset \%>\%
   dplyr::filter(time_value > "2021-11-01", geo_value \%in\% c("ca", "ny")) \%>\%
   dplyr::select(geo_value, time_value, cases)
 
-pop_data = data.frame(states = c("ca", "ny"),
-                      value = c(20000, 30000))
+pop_data = data.frame(states = c("ca", "ny"), value = c(20000, 30000))
 
 r <- epi_recipe(jhu) \%>\%
   step_population_scaling(df = pop_data,
@@ -119,11 +128,12 @@ wf <- epi_workflow(r,
   parsnip::fit(jhu) \%>\%
   add_frosting(f)
 
-latest <- get_test_data(recipe = r,
-                        x = epiprocess::jhu_csse_daily_subset \%>\%
-                          dplyr::filter(time_value > "2021-11-01",
-                                        geo_value \%in\% c("ca", "ny")) \%>\%
-                          dplyr::select(geo_value, time_value, cases))
+latest <- get_test_data(
+   recipe = r,
+   x = epiprocess::jhu_csse_daily_subset \%>\%
+   dplyr::filter(time_value > "2021-11-01",
+   geo_value \%in\% c("ca", "ny")) \%>\%
+   dplyr::select(geo_value, time_value, cases))
 
 
 predict(wf, latest)
diff --git a/tests/testthat/test-population_scaling.R b/tests/testthat/test-population_scaling.R
index 5b805b33b..5ccbf9660 100644
--- a/tests/testthat/test-population_scaling.R
+++ b/tests/testthat/test-population_scaling.R
@@ -295,3 +295,62 @@ test_that("expect error if `by` selector does not match", {
                 )
 })
 
+
+test_that("Rate rescaling behaves as expected", {
+  x <- tibble(geo_value = rep("place",50),
+              time_value = as.Date("2021-01-01") + 0:49,
+              case_rate = rep(0.0005, 50),
+              cases = rep(5000, 50)) %>%
+    as_epi_df()
+
+  reverse_pop_data = data.frame(states = c("place"),
+                                value = c(1/1000))
+
+  r <- epi_recipe(x) %>%
+    step_population_scaling(df = reverse_pop_data,
+                            df_pop_col = "value",
+                            rate_rescaling = 100, # cases per 100
+                            by = c("geo_value" = "states"),
+                            case_rate, suffix = "_scaled")
+
+  expect_equal(unique(bake(prep(r,x),x)$case_rate_scaled),
+               0.0005*(1/1000)*100) # done testing step_*
+
+  f <- frosting() %>%
+    layer_population_scaling(.pred, df = reverse_pop_data,
+                             rate_rescaling = 100, # revert back to case rate per 100
+                             by =  c("geo_value" = "states"),
+                             df_pop_col = "value")
+
+  x <- tibble(geo_value = rep("place",50),
+              time_value = as.Date("2021-01-01") + 0:49,
+              case_rate = rep(0.0005, 50)) %>%
+    as_epi_df()
+
+  r <- epi_recipe(x) %>%
+    step_epi_lag(case_rate, lag = c(7, 14)) %>% # cases
+    step_epi_ahead(case_rate, ahead = 7, role = "outcome") %>% # cases
+    step_naomit(all_predictors()) %>%
+    step_naomit(all_outcomes(), skip = TRUE)
+
+  f <- frosting() %>%
+    layer_predict() %>%
+    layer_threshold(.pred) %>%
+    layer_naomit(.pred) %>%
+    layer_population_scaling(.pred, df = reverse_pop_data,
+                             rate_rescaling = 100, # revert back to case rate per 100
+                             by =  c("geo_value" = "states"),
+                             df_pop_col = "value")
+
+  wf <- epi_workflow(r, parsnip::linear_reg()) %>%
+    fit(x) %>%
+    add_frosting(f)
+
+  latest <- get_test_data(recipe = r, x = x)
+
+  # suppress warning: prediction from a rank-deficient fit may be misleading
+  suppressWarnings(expect_equal(unique(predict(wf, latest)$.pred)*1000/100,
+               unique(predict(wf, latest)$.pred_scaled)))
+})
+
+

From 8940f6d4be0a77fbabafa5954f21cfd2a9401de1 Mon Sep 17 00:00:00 2001
From: ChloeYou <chloeyxy@outlook.com>
Date: Fri, 16 Sep 2022 18:22:49 -0700
Subject: [PATCH 2/2] fix error

---
 R/step_population_scaling.R              | 2 +-
 tests/testthat/test-population_scaling.R | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/R/step_population_scaling.R b/R/step_population_scaling.R
index 3e558ae8a..b2f9d2899 100644
--- a/R/step_population_scaling.R
+++ b/R/step_population_scaling.R
@@ -192,7 +192,7 @@ bake.step_population_scaling <- function(object,
                             by= object$by),
                 silent = TRUE)
   if (any(grepl("Join columns must be present in data", unlist(try_join)))) {
-    cli_stop(c("columns in `by` selectors of `layer_population_scaling` ",
+    cli_stop(c("columns in `by` selectors of `step_population_scaling` ",
                "must be present in data and match"))}
 
   if(object$suffix != "_scaled" && object$create_new == FALSE){
diff --git a/tests/testthat/test-population_scaling.R b/tests/testthat/test-population_scaling.R
index 5ccbf9660..6d32fb9b2 100644
--- a/tests/testthat/test-population_scaling.R
+++ b/tests/testthat/test-population_scaling.R
@@ -314,7 +314,7 @@ test_that("Rate rescaling behaves as expected", {
                             case_rate, suffix = "_scaled")
 
   expect_equal(unique(bake(prep(r,x),x)$case_rate_scaled),
-               0.0005*(1/1000)*100) # done testing step_*
+               0.0005*100/(1/1000)) # done testing step_*
 
   f <- frosting() %>%
     layer_population_scaling(.pred, df = reverse_pop_data,
@@ -349,7 +349,7 @@ test_that("Rate rescaling behaves as expected", {
   latest <- get_test_data(recipe = r, x = x)
 
   # suppress warning: prediction from a rank-deficient fit may be misleading
-  suppressWarnings(expect_equal(unique(predict(wf, latest)$.pred)*1000/100,
+  suppressWarnings(expect_equal(unique(predict(wf, latest)$.pred)*(1/1000)/100,
                unique(predict(wf, latest)$.pred_scaled)))
 })