#' @template indicator-template
#' @template signal-template
#'
- #' @importFrom dplyr %>% filter select group_by summarize across everything group_split ungroup
+ #' @importFrom dplyr %>% filter group_by summarize across everything group_split ungroup
#' @importFrom tidyr drop_na
- #' @importFrom rlang .data .env
- #' @importFrom stringr str_interp
#'
#' @export
run_backfill <- function(df, params,
refd_col = "time_value", lag_col = "lag", issued_col = "issue_date",
signal_suffixes = c(""), indicator = "", signal = "") {
- df <- filter(df, .data$lag < params$ref_lag + 30) # a rough filtration to save memory
+ df <- filter(df, lag < params$ref_lag + 30) # a rough filtration to save memory

geo_levels <- params$geo_levels
if ("state" %in% geo_levels) {
@@ -28,23 +26,24 @@ run_backfill <- function(df, params,
}

for (geo_level in geo_levels) {
- msg_ts(str_interp("geo level ${geo_level}"))
+ msg_ts("geo level ", geo_level)
# Get full list of interested locations
if (geo_level == "state") {
# Drop county field and make new "geo_value" field from "state_id".
# Aggregate counties up to state level
agg_cols <- c("geo_value", issued_col, refd_col, lag_col)
# Sum all non-agg columns. Summarized columns keep original names
+ df$geo_value <- df$state_id
+ df$state_id <- NULL
df <- df %>%
- select(-.data$geo_value, geo_value = .data$state_id) %>%
group_by(across(agg_cols)) %>%
summarize(across(everything(), sum)) %>%
ungroup()
}
if (geo_level == "county") {
# Keep only 200 most populous (within the US) counties
top_200_geos <- get_populous_counties()
- df <- filter(df, .data$geo_value %in% top_200_geos)
+ df <- filter(df, geo_value %in% top_200_geos)
}

test_data_list <- list()
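The base-R assignment pair replaces the dropped `select()` call, so the pipe below it now only does the aggregation; the two forms are equivalent here apart from column order, since both leave `state_id`'s values stored under `geo_value` with no separate `state_id` column. A rough sketch of the equivalence (illustrative only):

# removed dplyr form: drop the old geo_value and rename state_id to geo_value
# df <- select(df, -.data$geo_value, geo_value = .data$state_id)
# added base-R form: overwrite geo_value in place, then drop state_id
df$geo_value <- df$state_id
df$state_id <- NULL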
@@ -59,13 +58,13 @@ run_backfill <- function(df, params,
}

msg_ts("Splitting data into geo groups")
- group_dfs <- group_split(df, .data$geo_value)
+ group_dfs <- group_split(df, geo_value)

# Build model for each location
for (subdf in group_dfs) {
geo <- subdf$geo_value[1]

- msg_ts(str_interp("Processing ${geo} geo group"))
+ msg_ts("Processing ", geo, " geo group")

min_refd <- min(subdf[[refd_col]])
max_refd <- max(subdf[[refd_col]])
@@ -78,7 +77,7 @@ run_backfill <- function(df, params,
# process again. Main use case is for quidel which has overall and
# age-based signals.
if (signal_suffix != "") {
- msg_ts(str_interp("signal suffix ${signal_suffix}"))
+ msg_ts("signal suffix ", signal_suffix)
num_col <- paste(params$num_col, signal_suffix, sep = "_")
denom_col <- paste(params$denom_col, signal_suffix, sep = "_")
} else {
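The logging changes throughout this diff assume that `msg_ts()` now accepts multiple arguments and concatenates them, which removes the need for `stringr::str_interp()` templates at the call sites. The helper is defined elsewhere in the package; a hypothetical definition consistent with the new calls might look like:

msg_ts <- function(...) {
  # prepend a timestamp, then concatenate all arguments with no separator
  message(format(Sys.time(), "%Y-%m-%d %H:%M:%S"), " ", paste0(...))
}

With that shape, `msg_ts("test lag ", test_lag)` prints the same text the old `str_interp("test lag ${test_lag}")` produced, with numeric arguments coerced to character by `paste0()`.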
@@ -87,7 +86,7 @@ run_backfill <- function(df, params,
}

for (value_type in params$value_types) {
- msg_ts(str_interp("value type ${value_type}"))
+ msg_ts("value type ", value_type)
# Handle different signal types
if (value_type == "count") { # For counts data only
combined_df <- fill_missing_updates(subdf, num_col, refd_col, lag_col)
@@ -113,15 +112,17 @@ run_backfill <- function(df, params,
)
}
combined_df <- add_params_for_dates(combined_df, refd_col, lag_col)
- combined_df <- combined_df %>% filter(.data$lag < params$ref_lag)
+ combined_df <- filter(combined_df, lag < params$ref_lag)

- geo_train_data <- combined_df %>%
- filter(.data$issue_date < params$training_end_date) %>%
- filter(.data$target_date <= params$training_end_date) %>%
- filter(.data$target_date > params$training_start_date) %>%
+ geo_train_data <- filter(combined_df,
+ issue_date < params$training_end_date,
+ target_date <= params$training_end_date,
+ target_date > params$training_start_date,
+ ) %>%
drop_na()
- geo_test_data <- combined_df %>%
- filter(.data$issue_date %in% params$test_dates) %>%
+ geo_test_data <- filter(combined_df,
+ issue_date %in% params$test_dates
+ ) %>%
drop_na()

if (nrow(geo_test_data) == 0) {
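Collapsing the chained `filter()` calls into one call keeps the semantics: multiple conditions passed to a single `filter()` are combined with `&`. The trailing comma left before the closing parenthesis in the training-data call appears to rely on dplyr's tidy-dots handling dropping empty trailing arguments. A toy illustration of the equivalence, with made-up column names:

library(dplyr)
d <- data.frame(a = 1:5, b = 5:1)
out1 <- d %>% filter(a > 1) %>% filter(b <= 2)  # chained form (removed style)
out2 <- filter(d, a > 1, b <= 2)                # single call, conditions ANDed (added style)
identical(out1, out2)                           # TRUE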
@@ -135,9 +136,10 @@ run_backfill <- function(df, params,

if (value_type == "fraction") {
# Use beta prior approach to adjust fractions
- geo_prior_test_data = combined_df %>%
- filter(.data$issue_date > min(params$test_dates) - 7) %>%
- filter(.data$issue_date <= max(params$test_dates))
+ geo_prior_test_data = filter(combined_df,
+ issue_date > min(params$test_dates) - 7,
+ issue_date <= max(params$test_dates)
+ )
updated_data <- frac_adj(geo_train_data, geo_test_data, geo_prior_test_data,
indicator = indicator, signal = signal,
geo_level = geo_level, signal_suffix = signal_suffix,
@@ -154,16 +156,15 @@ run_backfill <- function(df, params,
}
max_raw = sqrt(max(geo_train_data$value_raw))
for (test_lag in params$test_lags) {
- msg_ts(str_interp("test lag ${test_lag}"))
+ msg_ts("test lag ", test_lag)
filtered_data <- data_filteration(test_lag, geo_train_data,
geo_test_data, params$lag_pad)
train_data <- filtered_data[[1]]
test_data <- filtered_data[[2]]

if (nrow(train_data) == 0 || nrow(test_data) == 0) {
- msg_ts(str_interp(
- "Not enough data to either train or test for test_lag ${test_lag}, skipping"
- ))
+ msg_ts("Not enough data to either train or test for test_lag ",
+ test_lag, ", skipping")
next
}

@@ -238,9 +239,8 @@ run_backfill <- function(df, params,
#' @template lag_col-template
#' @template issued_col-template
#'
- #' @importFrom dplyr bind_rows mutate %>%
+ #' @importFrom dplyr bind_rows %>%
#' @importFrom parallel detectCores
- #' @importFrom rlang .data :=
#' @importFrom stringr str_interp
#'
#' @export
@@ -253,7 +253,7 @@ main <- function(params,

indicators_subset <- INDICATORS_AND_SIGNALS
if (params$indicators != "all") {
- indicators_subset <- filter(indicators_subset, .data$indicator == params$indicators)
+ indicators_subset <- filter(indicators_subset, indicator == params$indicators)
}
if (nrow(indicators_subset) == 0) {
stop("no indicators to process")
@@ -288,62 +288,51 @@ main <- function(params,
params$training_start_date <- result$training_start_date
params$training_end_date <- result$training_end_date

- msg_ts(paste0(
- str_interp("training_start_date is ${params$training_start_date}, "),
- str_interp("training_end_date is ${params$training_end_date}")
- ))
+ msg_ts("training_start_date is ", params$training_start_date,
+ ", training_end_date is ", params$training_end_date)

# Loop over every indicator + signal combination.
for (group_i in seq_len(nrow(indicators_subset))) {
input_group <- indicators_subset[group_i,]
- msg_ts(str_interp(
- "Processing indicator ${input_group$indicator} signal ${input_group$signal}"
- ))
+ msg_ts("Processing indicator ", input_group$indicator, " signal ", input_group$signal)

files_list <- get_files_list(
input_group$indicator, input_group$signal, params, input_group$sub_dir
)
if (length(files_list) == 0) {
- warning(str_interp(
- "No files found for indicator ${input_group$indicator} signal ${input_group$signal}, skipping"
- ))
+ warning("No files found for indicator ", input_group$indicator,
+ " signal ", input_group$signal, ", skipping")
next
}

msg_ts("Reading in and combining associated files")
input_data <- lapply(
files_list,
function(file) {
+ # refd_col and issued_col read in as strings
read_data(file) %>%
- fips_to_geovalue() %>%
- mutate(
- # Use `glue` syntax to construct a new field by variable,
- # from https://stackoverflow.com/a/26003971/14401472
- "{refd_col}" := as.Date(.data[[refd_col]], "%Y-%m-%d"),
- "{issued_col}" := as.Date(.data[[issued_col]], "%Y-%m-%d")
- )
+ fips_to_geovalue()
}
) %>%
bind_rows()

if (nrow(input_data) == 0) {
- warning(str_interp(
- "No data available for indicator ${input_group$indicator} signal ${input_group$signal}, skipping"
- ))
+ warning("No data available for indicator ", input_group$indicator,
+ " signal ", input_group$signal, ", skipping")
next
}

# Check data type and required columns
msg_ts("Validating input data")
- for (value_type in params$value_types) {
- msg_ts(str_interp("for ${value_type}"))
- result <- validity_checks(
- input_data, value_type,
- params$num_col, params$denom_col, input_group$name_suffix,
- refd_col = refd_col, lag_col = lag_col, issued_col = issued_col
- )
- input_data <- result[["df"]]
- }
+ # Validate while date fields still stored as strings for speed.
+ input_data <- validity_checks(
+ input_data, params$value_types,
+ params$num_col, params$denom_col, input_group$name_suffix,
+ refd_col = refd_col, lag_col = lag_col, issued_col = issued_col
+ )
+
+ input_data[[refd_col]] <- as.Date(input_data[[refd_col]], "%Y-%m-%d")
+ input_data[[issued_col]] <- as.Date(input_data[[issued_col]], "%Y-%m-%d")

# Check available training days
training_days_check(input_data[[issued_col]], params$training_days)
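For context on the dropped per-file `mutate()`: the `"{refd_col}" :=` spelling is glue-style tidy evaluation for assigning to a column whose name is held in a variable; the replacement performs the same string-to-Date conversion once, in base R, on the combined data frame, and only after validation (hence the new "read in as strings" comment). A minimal sketch of the two spellings, using a hypothetical frame `d` and column name `col`:

library(dplyr)
d <- data.frame(time_value = c("2022-01-01", "2022-01-02"))
col <- "time_value"
# tidy-eval spelling, as in the removed per-file step
d1 <- mutate(d, "{col}" := as.Date(.data[[col]], "%Y-%m-%d"))
# base-R spelling, as in the added one-shot step
d2 <- d
d2[[col]] <- as.Date(d2[[col]], "%Y-%m-%d")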