Skip to content

Commit c78b524

Browse files
authored
Merge pull request #934 from cmu-delphi/fb-package-speed-up-3
[fb-package] Usability improvements + fix to hesitant_and_* vars
2 parents e5c80f1 + 572819f commit c78b524

File tree

10 files changed

+240
-165
lines changed

10 files changed

+240
-165
lines changed

facebook/delphiFacebook/NAMESPACE

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,6 @@ export(filter_complete_responses)
1717
export(filter_data_for_aggregation)
1818
export(filter_responses)
1919
export(floor_epiweek)
20-
export(get_date_range_from_filenames)
2120
export(get_filenames_in_range)
2221
export(get_range_prev_full_month)
2322
export(get_range_prev_full_period)

facebook/delphiFacebook/R/contingency_aggregate.R

Lines changed: 46 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424
#' @return none
2525
#'
2626
#' @import data.table
27-
#' @importFrom dplyr full_join %>%
27+
#' @importFrom dplyr full_join %>% select all_of
2828
#' @importFrom purrr reduce
2929
#'
3030
#' @export
@@ -44,9 +44,18 @@ produce_aggregates <- function(df, aggregations, cw_list, params) {
4444
df <- output[[1]]
4545
aggregations <- output[[2]]
4646

47+
## Keep only columns used in indicators, plus supporting columns.
48+
group_vars <- unique( unlist(aggregations$group_by) )
49+
df <- select(df,
50+
all_of(unique(aggregations$metric)),
51+
all_of(unique(aggregations$var_weight)),
52+
all_of( group_vars[group_vars != "geo_id"] ),
53+
zip5,
54+
start_dt)
55+
4756
agg_groups <- unique(aggregations[c("group_by", "geo_level")])
4857

49-
# For each unique combination of groupby_vars and geo level, run aggregation process once
58+
# For each unique combination of group_vars and geo level, run aggregation process once
5059
# and calculate all desired aggregations on the grouping. Rename columns. Save
5160
# to individual files
5261
for (group_ind in seq_along(agg_groups$group_by)) {
@@ -158,37 +167,43 @@ post_process_aggs <- function(df, aggregations, cw_list) {
158167
# - multi-select items are converted to a series of binary columns, one for
159168
# each unique level/response code; multi-select used for grouping are left as-is.
160169
# - multiple choice items are left as-is
161-
170+
162171
#### TODO: How do we want to handle multi-select items when used for grouping?
163-
agg_groups <- unique(aggregations$group_by)
164-
group_cols_to_convert <- unique(do.call(c, agg_groups))
165-
group_cols_to_convert <- group_cols_to_convert[startsWith(group_cols_to_convert, "b_")]
166-
167-
metric_cols_to_convert <- unique(aggregations$metric)
168-
169-
for (col_var in c(group_cols_to_convert, metric_cols_to_convert)) {
170-
if ( is.null(df[[col_var]]) ) {
171-
aggregations <- aggregations[aggregations$metric != col_var &
172-
!mapply(aggregations$group_by,
173-
FUN=function(x) {col_var %in% x}), ]
174-
msg_plain(
175-
paste0(
176-
col_var, " is not defined. Removing all aggregations that use it. ",
177-
nrow(aggregations), " remaining")
178-
)
172+
group_vars <- unique( unlist(aggregations$group_by) )
173+
group_vars <- group_vars[group_vars != "geo_id"]
174+
175+
metric_cols <- unique(aggregations$metric)
176+
177+
cols_check_available <- unique(c(group_vars, metric_cols))
178+
available <- cols_check_available %in% names(df)
179+
cols_not_available <- cols_check_available[ !available ]
180+
for (col_var in cols_not_available) {
181+
# Remove from aggregations
182+
aggregations <- aggregations[aggregations$metric != col_var &
183+
!mapply(aggregations$group_by,
184+
FUN=function(x) {col_var %in% x}), ]
185+
msg_plain(paste0(
186+
col_var, " is not defined. Removing all aggregations that use it. ",
187+
nrow(aggregations), " remaining")
188+
)
189+
}
190+
191+
cols_available <- cols_check_available[ available ]
192+
for (col_var in cols_available) {
193+
if ( col_var %in% group_vars & !(col_var %in% metric_cols) & !startsWith(col_var, "b_") ) {
179194
next
180195
}
181196

182197
if (startsWith(col_var, "b_")) { # Binary
183198
output <- code_binary(df, aggregations, col_var)
184-
} else if (startsWith(col_var, "ms_")) { # Multiselect
185-
output <- code_multiselect(df, aggregations, col_var)
186199
} else if (startsWith(col_var, "n_")) { # Numeric free response
187200
output <- code_numeric_freeresponse(df, aggregations, col_var)
188-
} else if (startsWith(col_var, "mc_")) { # Multiple choice
201+
} else if (startsWith(col_var, "ms_")) { # Multi-select
202+
output <- code_multiselect(df, aggregations, col_var)
203+
} else {
204+
# Multiple choice and variables that are formatted differently
189205
output <- list(df, aggregations)
190206
}
191-
192207
df <- output[[1]]
193208
aggregations <- output[[2]]
194209
}
@@ -233,28 +248,27 @@ summarize_aggs <- function(df, crosswalk_data, aggregations, geo_level, params)
233248
## inefficient; profiling shows the cost to be negligible, so shut it up
234249
df <- suppressWarnings(inner_join(df, crosswalk_data, by = "zip5"))
235250

236-
groupby_vars <- aggregations$group_by[[1]]
251+
group_vars <- aggregations$group_by[[1]]
237252

238-
if (all(groupby_vars %in% names(df))) {
239-
unique_group_combos <- unique(df[, groupby_vars, with=FALSE])
253+
if (all(group_vars %in% names(df))) {
254+
unique_group_combos <- unique(df[, group_vars, with=FALSE])
240255
unique_group_combos <- unique_group_combos[complete.cases(unique_group_combos)]
241256
} else {
242257
msg_plain(
243258
sprintf(
244259
"not all of groupby columns %s available in data; skipping aggregation",
245-
paste(groupby_vars, collapse=", ")
260+
paste(group_vars, collapse=", ")
246261
))
247262
}
248263

249264
if ( !exists("unique_group_combos") || nrow(unique_group_combos) == 0 ) {
250265
return(list())
251266
}
252267

253-
254268
## Set an index on the groupby var columns so that the groupby step can be
255269
## faster; data.table stores the sort order of the column and
256270
## uses a binary search to find matching values, rather than a linear scan.
257-
setindexv(df, groupby_vars)
271+
setindexv(df, group_vars)
258272

259273
calculate_group <- function(ii) {
260274
target_group <- unique_group_combos[ii]
@@ -287,15 +301,15 @@ summarize_aggs <- function(df, crosswalk_data, aggregations, geo_level, params)
287301
## Do post-processing.
288302
for (row in seq_len(nrow(aggregations))) {
289303
aggregation <- aggregations$id[row]
290-
groupby_vars <- aggregations$group_by[[row]]
304+
group_vars <- aggregations$group_by[[row]]
291305
post_fn <- aggregations$post_fn[[row]]
292306

293307
dfs_out[[aggregation]] <- dfs_out[[aggregation]][
294-
rowSums(is.na(dfs_out[[aggregation]][, c("val", "sample_size", groupby_vars)])) == 0,
308+
rowSums(is.na(dfs_out[[aggregation]][, c("val", "sample_size", group_vars)])) == 0,
295309
]
296310

297311
if (geo_level == "county") {
298-
df_megacounties <- megacounty(dfs_out[[aggregation]], params$num_filter, groupby_vars)
312+
df_megacounties <- megacounty(dfs_out[[aggregation]], params$num_filter, group_vars)
299313
dfs_out[[aggregation]] <- bind_rows(dfs_out[[aggregation]], df_megacounties)
300314
}
301315

facebook/delphiFacebook/R/contingency_run.R

Lines changed: 121 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,127 @@
2323
set_aggs <- function() {
2424
weekly_aggs <- tribble(
2525
~name, ~metric, ~group_by, ~compute_fn, ~post_fn,
26+
#### Cut 1: side effects if hesitant about getting vaccine and generally
27+
# National
28+
"pct_concerned_sideeffects", "b_concerned_sideeffects", c("mc_age", "mc_gender", "mc_race", "b_hispanic", "nation"), compute_binary, jeffreys_binary,
29+
"pct_hesitant_sideeffects", "b_hesitant_sideeffects", c("mc_age", "mc_gender", "mc_race", "b_hispanic", "nation"), compute_binary, jeffreys_binary,
30+
31+
# State
32+
"pct_concerned_sideeffects", "b_concerned_sideeffects", c("mc_age", "mc_gender", "mc_race", "b_hispanic", "state"), compute_binary, jeffreys_binary,
33+
"pct_hesitant_sideeffects", "b_hesitant_sideeffects", c("mc_age", "mc_gender", "mc_race", "b_hispanic", "state"), compute_binary, jeffreys_binary,
34+
35+
# State marginal
36+
"pct_concerned_sideeffects", "b_concerned_sideeffects", c("mc_age", "state"), compute_binary, jeffreys_binary,
37+
"pct_concerned_sideeffects", "b_concerned_sideeffects", c("mc_gender", "state"), compute_binary, jeffreys_binary,
38+
"pct_concerned_sideeffects", "b_concerned_sideeffects", c("mc_race", "b_hispanic", "state"), compute_binary, jeffreys_binary,
39+
"pct_hesitant_sideeffects", "b_hesitant_sideeffects", c("mc_age", "state"), compute_binary, jeffreys_binary,
40+
"pct_hesitant_sideeffects", "b_hesitant_sideeffects", c("mc_gender", "state"), compute_binary, jeffreys_binary,
41+
"pct_hesitant_sideeffects", "b_hesitant_sideeffects", c("mc_race", "b_hispanic", "state"), compute_binary, jeffreys_binary,
42+
43+
44+
45+
#### Cut 2: trust various institutions if hesitant about getting vaccine
46+
# National
47+
"pct_hesitant_trust_fam", "b_hesitant_trust_fam", c("mc_age", "mc_gender", "mc_race", "b_hispanic", "nation"), compute_binary, jeffreys_binary,
48+
"pct_hesitant_trust_healthcare", "b_hesitant_trust_healthcare", c("mc_age", "mc_gender", "mc_race", "b_hispanic", "nation"), compute_binary, jeffreys_binary,
49+
"pct_hesitant_trust_who", "b_hesitant_trust_who", c("mc_age", "mc_gender", "mc_race", "b_hispanic", "nation"), compute_binary, jeffreys_binary,
50+
"pct_hesitant_trust_govt", "b_hesitant_trust_govt", c("mc_age", "mc_gender", "mc_race", "b_hispanic", "nation"), compute_binary, jeffreys_binary,
51+
"pct_hesitant_trust_politicians", "b_hesitant_trust_politicians", c("mc_age", "mc_gender", "mc_race", "b_hispanic", "nation"), compute_binary, jeffreys_binary,
52+
53+
# State
54+
"pct_hesitant_trust_fam", "b_hesitant_trust_fam", c("mc_age", "mc_gender", "mc_race", "b_hispanic", "state"), compute_binary, jeffreys_binary,
55+
"pct_hesitant_trust_healthcare", "b_hesitant_trust_healthcare", c("mc_age", "mc_gender", "mc_race", "b_hispanic", "state"), compute_binary, jeffreys_binary,
56+
"pct_hesitant_trust_who", "b_hesitant_trust_who", c("mc_age", "mc_gender", "mc_race", "b_hispanic", "state"), compute_binary, jeffreys_binary,
57+
"pct_hesitant_trust_govt", "b_hesitant_trust_govt", c("mc_age", "mc_gender", "mc_race", "b_hispanic", "state"), compute_binary, jeffreys_binary,
58+
"pct_hesitant_trust_politicians", "b_hesitant_trust_politicians", c("mc_age", "mc_gender", "mc_race", "b_hispanic", "state"), compute_binary, jeffreys_binary,
59+
60+
# State marginal
61+
"pct_hesitant_trust_fam", "b_hesitant_trust_fam", c("mc_age", "state"), compute_binary, jeffreys_binary,
62+
"pct_hesitant_trust_fam", "b_hesitant_trust_fam", c("mc_gender", "state"), compute_binary, jeffreys_binary,
63+
"pct_hesitant_trust_fam", "b_hesitant_trust_fam", c("mc_race", "b_hispanic", "state"), compute_binary, jeffreys_binary,
64+
"pct_hesitant_trust_healthcare", "b_hesitant_trust_healthcare", c("mc_age", "state"), compute_binary, jeffreys_binary,
65+
"pct_hesitant_trust_healthcare", "b_hesitant_trust_healthcare", c("mc_gender", "state"), compute_binary, jeffreys_binary,
66+
"pct_hesitant_trust_healthcare", "b_hesitant_trust_healthcare", c("mc_race", "b_hispanic", "state"), compute_binary, jeffreys_binary,
67+
"pct_hesitant_trust_who", "b_hesitant_trust_who", c("mc_age", "state"), compute_binary, jeffreys_binary,
68+
"pct_hesitant_trust_who", "b_hesitant_trust_who", c("mc_gender", "state"), compute_binary, jeffreys_binary,
69+
"pct_hesitant_trust_who", "b_hesitant_trust_who", c("mc_race", "b_hispanic", "state"), compute_binary, jeffreys_binary,
70+
"pct_hesitant_trust_govt", "b_hesitant_trust_govt", c("mc_age", "state"), compute_binary, jeffreys_binary,
71+
"pct_hesitant_trust_govt", "b_hesitant_trust_govt", c("mc_gender", "state"), compute_binary, jeffreys_binary,
72+
"pct_hesitant_trust_govt", "b_hesitant_trust_govt", c("mc_race", "b_hispanic", "state"), compute_binary, jeffreys_binary,
73+
"pct_hesitant_trust_politicians", "b_hesitant_trust_politicians", c("mc_age", "state"), compute_binary, jeffreys_binary,
74+
"pct_hesitant_trust_politicians", "b_hesitant_trust_politicians", c("mc_gender", "state"), compute_binary, jeffreys_binary,
75+
"pct_hesitant_trust_politicians", "b_hesitant_trust_politicians", c("mc_race", "b_hispanic", "state"), compute_binary, jeffreys_binary,
76+
77+
78+
79+
#### Cut 3: trust various institutions
80+
# National
81+
"pct_trust_fam", "b_vaccine_likely_friends", c("mc_age", "mc_gender", "mc_race", "b_hispanic", "nation"), compute_binary, jeffreys_binary,
82+
"pct_trust_healthcare", "b_vaccine_likely_local_health", c("mc_age", "mc_gender", "mc_race", "b_hispanic", "nation"), compute_binary, jeffreys_binary,
83+
"pct_trust_who", "b_vaccine_likely_who", c("mc_age", "mc_gender", "mc_race", "b_hispanic", "nation"), compute_binary, jeffreys_binary,
84+
"pct_trust_govt", "b_vaccine_likely_govt_health", c("mc_age", "mc_gender", "mc_race", "b_hispanic", "nation"), compute_binary, jeffreys_binary,
85+
"pct_trust_politicians", "b_vaccine_likely_politicians", c("mc_age", "mc_gender", "mc_race", "b_hispanic", "nation"), compute_binary, jeffreys_binary,
86+
87+
# State
88+
"pct_trust_fam", "b_vaccine_likely_friends", c("mc_age", "mc_gender", "mc_race", "b_hispanic", "state"), compute_binary, jeffreys_binary,
89+
"pct_trust_healthcare", "b_vaccine_likely_local_health", c("mc_age", "mc_gender", "mc_race", "b_hispanic", "state"), compute_binary, jeffreys_binary,
90+
"pct_trust_who", "b_vaccine_likely_who", c("mc_age", "mc_gender", "mc_race", "b_hispanic", "state"), compute_binary, jeffreys_binary,
91+
"pct_trust_govt", "b_vaccine_likely_govt_health", c("mc_age", "mc_gender", "mc_race", "b_hispanic", "state"), compute_binary, jeffreys_binary,
92+
"pct_trust_politicians", "b_vaccine_likely_politicians", c("mc_age", "mc_gender", "mc_race", "b_hispanic", "state"), compute_binary, jeffreys_binary,
93+
94+
# State marginal
95+
"pct_trust_fam", "b_vaccine_likely_friends", c("mc_age", "state"), compute_binary, jeffreys_binary,
96+
"pct_trust_fam", "b_vaccine_likely_friends", c("mc_gender", "state"), compute_binary, jeffreys_binary,
97+
"pct_trust_fam", "b_vaccine_likely_friends", c("mc_race", "b_hispanic", "state"), compute_binary, jeffreys_binary,
98+
"pct_trust_healthcare", "b_vaccine_likely_local_health", c("mc_age", "state"), compute_binary, jeffreys_binary,
99+
"pct_trust_healthcare", "b_vaccine_likely_local_health", c("mc_gender", "state"), compute_binary, jeffreys_binary,
100+
"pct_trust_healthcare", "b_vaccine_likely_local_health", c("mc_race", "b_hispanic", "state"), compute_binary, jeffreys_binary,
101+
"pct_trust_who", "b_vaccine_likely_who", c("mc_age", "state"), compute_binary, jeffreys_binary,
102+
"pct_trust_who", "b_vaccine_likely_who", c("mc_gender", "state"), compute_binary, jeffreys_binary,
103+
"pct_trust_who", "b_vaccine_likely_who", c("mc_race", "b_hispanic", "state"), compute_binary, jeffreys_binary,
104+
"pct_trust_govt", "b_vaccine_likely_govt_health", c("mc_age", "state"), compute_binary, jeffreys_binary,
105+
"pct_trust_govt", "b_vaccine_likely_govt_health", c("mc_gender", "state"), compute_binary, jeffreys_binary,
106+
"pct_trust_govt", "b_vaccine_likely_govt_health", c("mc_race", "b_hispanic", "state"), compute_binary, jeffreys_binary,
107+
"pct_trust_politicians", "b_vaccine_likely_politicians", c("mc_age", "state"), compute_binary, jeffreys_binary,
108+
"pct_trust_politicians", "b_vaccine_likely_politicians", c("mc_gender", "state"), compute_binary, jeffreys_binary,
109+
"pct_trust_politicians", "b_vaccine_likely_politicians", c("mc_race", "b_hispanic", "state"), compute_binary, jeffreys_binary,
110+
111+
112+
113+
#### Cuts 4, 5, 6: vaccinated and accepting if senior, in healthcare, or generally
114+
# National
115+
"pct_vaccinated", "b_had_cov_vaccine", c("b_work_in_healthcare", "mc_age", "mc_gender", "mc_race", "b_hispanic", "nation"), compute_binary, jeffreys_binary,
116+
"pct_vaccinated", "b_had_cov_vaccine", c("b_65_or_older", "mc_gender", "mc_race", "b_hispanic", "nation"), compute_binary, jeffreys_binary,
117+
"pct_vaccinated", "b_had_cov_vaccine", c("mc_age", "mc_gender", "mc_race", "b_hispanic", "nation"), compute_binary, jeffreys_binary,
118+
"pct_accepting", "b_accept_cov_vaccine", c("b_work_in_healthcare", "mc_age", "mc_gender", "mc_race", "b_hispanic", "nation"), compute_binary, jeffreys_binary,
119+
"pct_accepting", "b_accept_cov_vaccine", c("b_65_or_older", "mc_gender", "mc_race", "b_hispanic", "nation"), compute_binary, jeffreys_binary,
120+
"pct_accepting", "b_accept_cov_vaccine", c("mc_age", "mc_gender", "mc_race", "b_hispanic", "nation"), compute_binary, jeffreys_binary,
121+
122+
# State
123+
"pct_vaccinated", "b_had_cov_vaccine", c("b_work_in_healthcare", "mc_age", "mc_gender", "mc_race", "b_hispanic", "state"), compute_binary, jeffreys_binary,
124+
"pct_vaccinated", "b_had_cov_vaccine", c("b_65_or_older", "mc_gender", "mc_race", "b_hispanic", "state"), compute_binary, jeffreys_binary,
125+
"pct_vaccinated", "b_had_cov_vaccine", c("mc_age", "mc_gender", "mc_race", "b_hispanic", "state"), compute_binary, jeffreys_binary,
126+
"pct_accepting", "b_accept_cov_vaccine", c("b_work_in_healthcare", "mc_age", "mc_gender", "mc_race", "b_hispanic", "state"), compute_binary, jeffreys_binary,
127+
"pct_accepting", "b_accept_cov_vaccine", c("b_65_or_older", "mc_gender", "mc_race", "b_hispanic", "state"), compute_binary, jeffreys_binary,
128+
"pct_accepting", "b_accept_cov_vaccine", c("mc_age", "mc_gender", "mc_race", "b_hispanic", "state"), compute_binary, jeffreys_binary,
129+
130+
# State marginal
131+
"pct_vaccinated", "b_had_cov_vaccine", c("b_work_in_healthcare", "mc_age", "state"), compute_binary, jeffreys_binary,
132+
"pct_vaccinated", "b_had_cov_vaccine", c("b_work_in_healthcare","mc_gender", "state"), compute_binary, jeffreys_binary,
133+
"pct_vaccinated", "b_had_cov_vaccine", c("b_work_in_healthcare", "mc_race", "b_hispanic", "state"), compute_binary, jeffreys_binary,
134+
"pct_vaccinated", "b_had_cov_vaccine", c("b_65_or_older", "mc_gender", "state"), compute_binary, jeffreys_binary,
135+
"pct_vaccinated", "b_had_cov_vaccine", c("b_65_or_older", "mc_race", "b_hispanic", "state"), compute_binary, jeffreys_binary,
136+
"pct_vaccinated", "b_had_cov_vaccine", c("mc_age", "state"), compute_binary, jeffreys_binary,
137+
"pct_vaccinated", "b_had_cov_vaccine", c("mc_gender", "state"), compute_binary, jeffreys_binary,
138+
"pct_vaccinated", "b_had_cov_vaccine", c("mc_race", "b_hispanic", "state"), compute_binary, jeffreys_binary,
139+
"pct_accepting", "b_accept_cov_vaccine", c("b_work_in_healthcare", "mc_age", "state"), compute_binary, jeffreys_binary,
140+
"pct_accepting", "b_accept_cov_vaccine", c("b_work_in_healthcare","mc_gender", "state"), compute_binary, jeffreys_binary,
141+
"pct_accepting", "b_accept_cov_vaccine", c("b_work_in_healthcare", "mc_race", "b_hispanic", "state"), compute_binary, jeffreys_binary,
142+
"pct_accepting", "b_accept_cov_vaccine", c("b_65_or_older", "mc_gender", "state"), compute_binary, jeffreys_binary,
143+
"pct_accepting", "b_accept_cov_vaccine", c("b_65_or_older", "mc_race", "b_hispanic", "state"), compute_binary, jeffreys_binary,
144+
"pct_accepting", "b_accept_cov_vaccine", c("mc_age", "state"), compute_binary, jeffreys_binary,
145+
"pct_accepting", "b_accept_cov_vaccine", c("mc_gender", "state"), compute_binary, jeffreys_binary,
146+
"pct_accepting", "b_accept_cov_vaccine", c("mc_race", "b_hispanic", "state"), compute_binary, jeffreys_binary,
26147
)
27148

28149
monthly_aggs <- tribble(

0 commit comments

Comments
 (0)