1
- # ' Create a recipe step that scales variables using population data
1
+ # ' Convert raw scale predictions to per-capita
2
2
# '
3
3
# ' `step_population_scaling` creates a specification of a recipe step
4
- # ' that will add a population scaled column in the data. For example,
5
- # ' load a dataset that contains county population, and join to an `epi_df`
6
- # ' that currently only contains number of new cases by county. Once scaled,
7
- # ' predictions can be made on case rate. Although worth noting that there is
8
- # ' nothing special about "population". The function can be used to scale by any
9
- # ' variable. Population is simply the most natural and common use case.
4
+ # ' that will perform per-capita scaling. Typical usage would
5
+ # ' load a dataset that contains state-level population, and use it to convert
6
+ # ' predictions made from a raw scale model to rate-scale by dividing by
7
+ # ' the population.
8
+ # ' Although, it is worth noting that there is nothing special about "population".
9
+ # ' The function can be used to scale by any variable. Population is the
10
+ # ' standard use case in the epidemiology forecasting scenario. Any value
11
+ # ' passed will *divide* the selected variables while the `rate_rescaling`
12
+ # ' argument is a common *multiplier* of the selected variables.
10
13
# '
11
14
# ' @param recipe A recipe object. The step will be added to the sequence of
12
15
# ' operations for this recipe. The recipe should contain information about the
19
22
# ' be ard are not limited to "outcome".
20
23
# ' @param trained A logical to indicate if the quantities for preprocessing
21
24
# ' have been estimated.
22
- # ' @param df a data frame that contains the population data used for scaling.
23
- # ' @param by A character vector of variables to left join by.
25
+ # ' @param df a data frame that contains the population data to be used for
26
+ # ' inverting the existing scaling.
27
+ # ' @param by A (possibly named) character vector of variables to join by.
24
28
# '
25
29
# ' If `NULL`, the default, the function will perform a natural join, using all
26
- # ' variables in common across the `epi_df` and the user-provided dataset.
27
- # ' If columns in `epi_df` and `df` have the same name (and aren't
28
- # ' included in by), `.df` is added to the one from the user-provided data
30
+ # ' variables in common across the `epi_df` produced by the `predict()` call
31
+ # ' and the user-provided dataset.
32
+ # ' If columns in that `epi_df` and `df` have the same name (and aren't
33
+ # ' included in `by`), `.df` is added to the one from the user-provided data
29
34
# ' to disambiguate.
30
35
# '
31
36
# ' To join by different variables on the `epi_df` and `df`, use a named vector.
32
- # ' For example, by = c("geo_value" = "states") will match `epi_df$geo_value`
37
+ # ' For example, ` by = c("geo_value" = "states")` will match `epi_df$geo_value`
33
38
# ' to `df$states`. To join by multiple variables, use a vector with length > 1.
34
- # ' For example, by = c("geo_value" = "states", "county" = "county") will match
39
+ # ' For example, ` by = c("geo_value" = "states", "county" = "county")` will match
35
40
# ' `epi_df$geo_value` to `df$states` and `epi_df$county` to `df$county`.
36
41
# '
37
42
# ' See [dplyr::left_join()] for more details.
38
43
# ' @param df_pop_col the name of the column in the data frame `df` that
39
44
# ' contains the population data and will be used for scaling.
40
45
# ' This should be one column.
46
+ # ' @param rate_rescaling Sometimes raw scales are "per 100K" or "per 1M".
47
+ # ' Adjustments can be made here. For example, if the original
48
+ # ' scale is "per 100K", then set `rate_rescaling = 1e5` to get rates.
41
49
# ' @param create_new TRUE to create a new column and keep the original column
42
50
# ' in the `epi_df`
43
51
# ' @param suffix a character. The suffix added to the column name if
61
69
# ' dplyr::filter(time_value > "2021-11-01", geo_value %in% c("ca", "ny")) %>%
62
70
# ' dplyr::select(geo_value, time_value, cases)
63
71
# '
64
- # ' pop_data = data.frame(states = c("ca", "ny"),
65
- # ' value = c(20000, 30000))
72
+ # ' pop_data = data.frame(states = c("ca", "ny"), value = c(20000, 30000))
66
73
# '
67
74
# ' r <- epi_recipe(jhu) %>%
68
75
# ' step_population_scaling(df = pop_data,
86
93
# ' parsnip::fit(jhu) %>%
87
94
# ' add_frosting(f)
88
95
# '
89
- # ' latest <- get_test_data(recipe = r,
90
- # ' x = epiprocess::jhu_csse_daily_subset %>%
91
- # ' dplyr::filter(time_value > "2021-11-01",
92
- # ' geo_value %in% c("ca", "ny")) %>%
93
- # ' dplyr::select(geo_value, time_value, cases))
96
+ # ' latest <- get_test_data(
97
+ # ' recipe = r,
98
+ # ' x = epiprocess::jhu_csse_daily_subset %>%
99
+ # ' dplyr::filter(time_value > "2021-11-01",
100
+ # ' geo_value %in% c("ca", "ny")) %>%
101
+ # ' dplyr::select(geo_value, time_value, cases))
94
102
# '
95
103
# '
96
104
# ' predict(wf, latest)
@@ -102,11 +110,19 @@ step_population_scaling <-
102
110
df ,
103
111
by = NULL ,
104
112
df_pop_col ,
113
+ rate_rescaling = 1 ,
105
114
create_new = TRUE ,
106
115
suffix = " _scaled" ,
107
116
columns = NULL ,
108
117
skip = FALSE ,
109
118
id = rand_id(" population_scaling" )){
119
+ arg_is_scalar(role , trained , df_pop_col , rate_rescaling , create_new , suffix , id )
120
+ arg_is_lgl(create_new , skip )
121
+ arg_is_chr(df_pop_col , suffix , id )
122
+ arg_is_chr(by , columns , allow_null = TRUE )
123
+ if (rate_rescaling < = 0 )
124
+ cli_stop(" `rate_rescaling` should be a positive number" )
125
+
110
126
add_step(
111
127
recipe ,
112
128
step_population_scaling_new(
@@ -116,6 +132,7 @@ step_population_scaling <-
116
132
df = df ,
117
133
by = by ,
118
134
df_pop_col = df_pop_col ,
135
+ rate_rescaling = rate_rescaling ,
119
136
create_new = create_new ,
120
137
suffix = suffix ,
121
138
columns = columns ,
@@ -126,7 +143,7 @@ step_population_scaling <-
126
143
}
127
144
128
145
step_population_scaling_new <-
129
- function (role , trained , df , by , df_pop_col , terms , create_new ,
146
+ function (role , trained , df , by , df_pop_col , rate_rescaling , terms , create_new ,
130
147
suffix , columns , skip , id ) {
131
148
step(
132
149
subclass = " population_scaling" ,
@@ -136,6 +153,7 @@ step_population_scaling_new <-
136
153
df = df ,
137
154
by = by ,
138
155
df_pop_col = df_pop_col ,
156
+ rate_rescaling = rate_rescaling ,
139
157
create_new = create_new ,
140
158
suffix = suffix ,
141
159
columns = columns ,
@@ -153,6 +171,7 @@ prep.step_population_scaling <- function(x, training, info = NULL, ...) {
153
171
df = x $ df ,
154
172
by = x $ by ,
155
173
df_pop_col = x $ df_pop_col ,
174
+ rate_rescaling = x $ rate_rescaling ,
156
175
create_new = x $ create_new ,
157
176
suffix = x $ suffix ,
158
177
columns = recipes_eval_select(x $ terms , training , info ),
@@ -172,8 +191,9 @@ bake.step_population_scaling <- function(object,
172
191
try_join <- try(dplyr :: left_join(new_data , object $ df ,
173
192
by = object $ by ),
174
193
silent = TRUE )
175
- if (any(grepl(" Join columns must be present in data" , unlist(try_join )))){
176
- stop(" columns in `by` selectors of `step_population_scaling` must be present in data and match" )}
194
+ if (any(grepl(" Join columns must be present in data" , unlist(try_join )))) {
195
+ cli_stop(c(" columns in `by` selectors of `step_population_scaling` " ,
196
+ " must be present in data and match" ))}
177
197
178
198
if (object $ suffix != " _scaled" && object $ create_new == FALSE ){
179
199
message(" `suffix` not used to generate new column in `step_population_scaling`" )
@@ -194,7 +214,7 @@ bake.step_population_scaling <- function(object,
194
214
dplyr :: mutate(
195
215
dplyr :: across(
196
216
dplyr :: all_of(object $ columns ),
197
- ~ .x / !! pop_col ,
217
+ ~ .x * object $ rate_rescaling / !! pop_col ,
198
218
.names = " {.col}{suffix}" )) %> %
199
219
# removed so the models do not use the population column
200
220
dplyr :: select(- !! pop_col )
0 commit comments