8
8
# ' @template signal_suffixes-template
9
9
# ' @template indicator-template
10
10
# ' @template signal-template
11
- # ' @param training_end_date the most recent training date
12
11
# '
13
12
# ' @importFrom dplyr %>% filter select group_by summarize across everything group_split ungroup
14
13
# ' @importFrom tidyr drop_na
15
14
# ' @importFrom rlang .data .env
16
15
# '
17
16
# ' @export
18
- run_backfill <- function (df , params , training_end_date ,
17
+ run_backfill <- function (df , params ,
19
18
refd_col = " time_value" , lag_col = " lag" , issued_col = " issue_date" ,
20
19
signal_suffixes = c(" " ), indicator = " " , signal = " " ) {
21
20
df <- filter(df , .data $ lag < params $ ref_lag + 30 ) # a rough filtration to save memory
@@ -57,13 +56,14 @@ run_backfill <- function(df, params, training_end_date,
57
56
coef_list [[key ]] <- list ()
58
57
}
59
58
}
60
-
59
+
61
60
msg_ts(" Splitting data into geo groups" )
62
61
group_dfs <- group_split(df , .data $ geo_value )
63
62
64
63
# Build model for each location
65
64
for (subdf in group_dfs ) {
66
65
geo <- subdf $ geo_value [1 ]
66
+
67
67
msg_ts(str_interp(" Processing ${geo} geo group" ))
68
68
69
69
min_refd <- min(subdf [[refd_col ]])
@@ -115,9 +115,9 @@ run_backfill <- function(df, params, training_end_date,
115
115
combined_df <- combined_df %> % filter(.data $ lag < params $ ref_lag )
116
116
117
117
geo_train_data <- combined_df %> %
118
- filter(.data $ issue_date < training_end_date ) %> %
119
- filter(.data $ target_date < = training_end_date ) %> %
120
- filter(.data $ target_date > training_end_date - params $ training_days ) %> %
118
+ filter(.data $ issue_date < params $ training_end_date ) %> %
119
+ filter(.data $ target_date < = params $ training_end_date ) %> %
120
+ filter(.data $ target_date > params $ training_start_date ) %> %
121
121
drop_na()
122
122
geo_test_data <- combined_df %> %
123
123
filter(.data $ issue_date %in% params $ test_dates ) %> %
@@ -135,7 +135,8 @@ run_backfill <- function(df, params, training_end_date,
135
135
indicator = indicator , signal = signal ,
136
136
geo_level = geo_level , signal_suffix = signal_suffix ,
137
137
lambda = params $ lambda , value_type = value_type , geo = geo ,
138
- training_end_date = training_end_date ,
138
+ training_end_date = params $ training_end_date ,
139
+ training_start_date = params $ training_start_date ,
139
140
model_save_dir = params $ cache_dir ,
140
141
taus = params $ taus ,
141
142
lp_solver = params $ lp_solver ,
@@ -178,7 +179,9 @@ run_backfill <- function(df, params, training_end_date,
178
179
lambda = params $ lambda , test_lag = test_lag , geo = geo ,
179
180
value_type = value_type , model_save_dir = params $ cache_dir ,
180
181
indicator = indicator , signal = signal , geo_level = geo_level ,
181
- signal_suffix = signal_suffix , training_end_date = training_end_date ,
182
+ signal_suffix = signal_suffix ,
183
+ training_end_date = params $ training_end_date ,
184
+ training_start_date = params $ training_start_date ,
182
185
train_models = params $ train_models ,
183
186
make_predictions = params $ make_predictions
184
187
)
@@ -199,23 +202,24 @@ run_backfill <- function(df, params, training_end_date,
199
202
}# End for test lags
200
203
}# End for value types
201
204
}# End for signal suffixes
202
-
203
- if (params $ make_predictions ) {
204
- for (value_type in params $ value_types ) {
205
- for (signal_suffix in signal_suffixes ) {
206
- key <- make_key(value_type , signal_suffix )
207
- test_combined <- bind_rows(test_data_list [[key ]])
208
- coef_combined <- bind_rows(coef_list [[key ]])
209
- export_test_result(test_combined , coef_combined ,
210
- indicator , signal ,
211
- geo_level , geo , signal_suffix , params $ lambda ,
212
- training_end_date ,
213
- value_type , export_dir = params $ export_dir )
214
- }
205
+ }# End for geo list
206
+
207
+ if (params $ make_predictions ) {
208
+ for (value_type in params $ value_types ) {
209
+ for (signal_suffix in signal_suffixes ) {
210
+ key <- make_key(value_type , signal_suffix )
211
+ test_combined <- bind_rows(test_data_list [[key ]])
212
+ coef_combined <- bind_rows(coef_list [[key ]])
213
+ export_test_result(test_combined , coef_combined ,
214
+ indicator = indicator , signal = signal ,
215
+ signal_suffix = signal_suffix ,
216
+ geo_level = geo_level , lambda = params $ lambda ,
217
+ training_end_date = params $ training_end_date ,
218
+ training_start_date = params $ training_start_date ,
219
+ value_type = value_type , export_dir = params $ export_dir )
215
220
}
216
221
}
217
-
218
- }# End for geo list
222
+ }
219
223
}# End for geo type
220
224
}
221
225
@@ -236,14 +240,10 @@ main <- function(params) {
236
240
237
241
if (params $ train_models ) {
238
242
msg_ts(" Removing stored models" )
239
- files_list <- list.files(params $ cache_dir , pattern = " *. model" , full.names = TRUE )
243
+ files_list <- list.files(params $ cache_dir , pattern = " [.] model$ " , full.names = TRUE )
240
244
file.remove(files_list )
241
245
}
242
246
243
- training_end_date <- as.Date(readLines(
244
- file.path(params $ cache_dir , " training_end_date.txt" )))
245
- msg_ts(str_interp(" training_end_date is ${training_end_date}" ))
246
-
247
247
# # Set default number of cores for mclapply to half of those available.
248
248
if (params $ parallel ) {
249
249
cores <- detectCores()
@@ -255,7 +255,18 @@ main <- function(params) {
255
255
options(mc.cores = min(params $ parallel_max_cores , max(floor(cores / 2 ), 1L )))
256
256
}
257
257
}
258
-
258
+
259
+ # Training start and end dates are the same for all indicators, so we can fetch
260
+ # at the beginning.
261
+ result <- get_training_date_range(params )
262
+ params $ training_start_date <- result $ training_start_date
263
+ params $ training_end_date <- result $ training_end_date
264
+
265
+ msg_ts(paste0(
266
+ str_interp(" training_start_date is ${params$training_start_date}, " ),
267
+ str_interp(" training_end_date is ${params$training_end_date}" )
268
+ ))
269
+
259
270
# Loop over every indicator + signal combination.
260
271
for (group_i in seq_len(nrow(INDICATORS_AND_SIGNALS ))) {
261
272
input_group <- INDICATORS_AND_SIGNALS [group_i ,]
@@ -302,14 +313,8 @@ main <- function(params) {
302
313
training_days_check(input_data $ issue_date , params $ training_days )
303
314
304
315
# Perform backfill corrections and save result
305
- run_backfill(input_data , params , training_end_date ,
316
+ run_backfill(input_data , params ,
306
317
indicator = input_group $ indicator , signal = input_group $ signal ,
307
318
signal_suffixes = input_group $ name_suffix )
308
-
309
- if (params $ train_models ) {
310
- # Save the training end date to a text file.
311
- writeLines(as.character(TODAY ),
312
- file.path(params $ cache_dir , " training_end_date.txt" ))
313
- }
314
319
}
315
320
}
0 commit comments