Skip to content

Commit 876a77a

Browse files
authored
Merge pull request #1712 from cmu-delphi/ndefries/bc-input-file-dates
Make input file filter rely on user inputs
2 parents a606486 + 97e6314 commit 876a77a

File tree

7 files changed

+131
-19
lines changed

7 files changed

+131
-19
lines changed

backfill_corrections/delphiBackfillCorrection/NAMESPACE

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ export(add_dayofweek)
55
export(add_shift)
66
export(add_sqrtscale)
77
export(add_weekofmonth)
8+
export(assert)
89
export(create_dir_not_exist)
910
export(data_filteration)
1011
export(evaluate)

backfill_corrections/delphiBackfillCorrection/R/constants.R

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ TAUS <- c(0.01, 0.025, 0.1, 0.25, 0.5, 0.75, 0.9, 0.975, 0.99)
33
REF_LAG <- 60
44
TEST_LAGS <- c(1:14, 21, 35, 51)
55
TRAINING_DAYS <- 270
6-
TESTING_WINDOW <- 14
6+
TESTING_WINDOW <- 1
77
LAG_WINDOW <- 5
88
LAMBDA <- 0.1
99
LAG_PAD <- 2

backfill_corrections/delphiBackfillCorrection/R/io.R

Lines changed: 58 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -108,42 +108,86 @@ get_files_list <- function(indicator, signal, params, sub_dir) {
108108
subset_valid_files <- function(files_list, file_type = c("daily", "rollup"), params) {
109109
file_type <- match.arg(file_type)
110110
date_format = "%Y%m%d"
111+
112+
# Put min and max issue date for each file into vectors the same length as
113+
# `files_list`.
111114
switch(file_type,
112115
daily = {
113-
start_dates <- as.Date(
116+
start_issue_dates <- as.Date(
114117
sub("^.*/.*_as_of_([0-9]{8})[.]parquet$", "\\1", files_list),
115118
format = date_format
116119
)
117-
end_dates <- start_dates
120+
end_issue_dates <- start_issue_dates
118121
},
119122
rollup = {
120123
rollup_pattern <- "^.*/.*_from_([0-9]{8})_to_([0-9]{8})[.]parquet$"
121-
start_dates <- as.Date(
124+
start_issue_dates <- as.Date(
122125
sub(rollup_pattern, "\\1", files_list),
123126
format = date_format
124127
)
125-
end_dates <- as.Date(
128+
end_issue_dates <- as.Date(
126129
sub(rollup_pattern, "\\2", files_list),
127130
format = date_format
128131
)
129132
}
130133
)
131134

132-
## TODO: right now, this gets both training and testing data regardless of
133-
# which mode is selected
134-
n_addl_days <- params$ref_lag + params$training_days
135-
start_date <- TODAY - n_addl_days
136-
end_date <- TODAY - 1
137-
135+
# Find the earliest and latest issue dates needed for either training or testing.
136+
result <- get_issue_date_range(params)
137+
start_issue <- result$start_issue
138+
end_issue <- result$end_issue
139+
138140
# Only keep files with data that falls at least somewhat between the desired
139-
# start and end range dates.
141+
# start and end issue dates.
140142
files_list <- files_list[
141-
!(( start_dates < start_date & end_dates < start_date ) |
142-
( start_dates > end_date & end_dates > end_date ))]
143-
143+
!(( start_issue_dates < start_issue & end_issue_dates < start_issue ) |
144+
( start_issue_dates > end_issue & end_issue_dates > end_issue ))]
145+
144146
return(files_list)
145147
}
146148

149+
#' Find the earliest and latest issue dates needed for either training or testing.
150+
#'
151+
#' With current logic, we always need to include data for model training (in
152+
#' case cached models are not available for a "make_predictions"-only run and
153+
#' we need to train new models).
154+
#'
155+
#' We generate test and train data by applying the following filters:
156+
#' - Test data is data where issue_date is in params$test_dates
157+
#' (as a continuous filter, min(params$test_dates) <= issue_date <= max(params$test_dates))
158+
#' - Train data is data where issue_date < training_end_date; and
159+
#' training_start_date < target_date <= training_end_date
160+
#'
161+
#' Train data doesn't have an explicit lower bound on issue_date, but we can
162+
#' derive one.
163+
#'
164+
#' Since target_date = reference_date + params$ref_lag and issue_date >=
165+
#' reference_date, the requirement that training_start_date < target_date
166+
#' also implies that issue_date must be > training_start_date - params$ref_lag.
167+
#'
168+
#' @template params-template
169+
get_issue_date_range <- function(params) {
170+
result <- get_training_date_range(params)
171+
172+
# Check that all training data is earlier than the earliest test date.
173+
#
174+
# It's inappropriate to make predictions of historical data based on a model
175+
# trained using future data. If we want to make predictions for an old test
176+
# date t0 (t0 < TODAY), we will always need to train a new model based on
177+
# data t < t0.
178+
assert(
179+
result$training_end_date <= min(params$test_dates),
180+
"training end date must be earlier than the earliest test date to produce valid predictions"
181+
)
182+
183+
## TODO: right now, this gets both training and testing data regardless of
184+
# which mode is selected
185+
start_issue <- result$training_start_date - params$ref_lag
186+
end_issue <- max(params$test_dates)
187+
188+
return(list("start_issue" = start_issue, "end_issue" = end_issue))
189+
}
190+
147191
#' Create pattern to match input files of a given type and signal
148192
#'
149193
#' @template indicator-template

backfill_corrections/delphiBackfillCorrection/R/model.R

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -290,13 +290,15 @@ generate_filename <- function(indicator, signal,
290290
#'
291291
#' @template params-template
292292
get_training_date_range <- function(params) {
293+
default_end_date <- TODAY - params$testing_window + 1
294+
293295
if (params$train_models) {
294296
if (params_element_exists_and_valid(params, "training_end_date")) {
295297
# Use user-provided end date.
296298
training_end_date <- as.Date(params$training_end_date)
297299
} else {
298300
# Default end date is `default_end_date` (TODAY - testing_window + 1).
299-
training_end_date <- TODAY
301+
training_end_date <- default_end_date
300302
}
301303
} else {
302304
# Get end date from cached model files. Assumes filename format like
@@ -306,7 +308,7 @@ get_training_date_range <- function(params) {
306308
model_files <- list.files(params$cache_dir, "^202[0-9]{5}_202[0-9]{5}.*[.]model$")
307309
if (length(model_files) == 0) {
308310
# We know we'll be retraining models today.
309-
training_end_date <- TODAY
311+
training_end_date <- default_end_date
310312
} else {
311313
# If only some models are in the cache, they will be used and those
312314
# missing will be regenerated as-of the training end date.

backfill_corrections/delphiBackfillCorrection/R/utils.R

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -76,8 +76,8 @@ read_params <- function(path = "params.json", template_path = "params.json.templ
7676
if (!("ref_lag" %in% names(params))) {params$ref_lag <- REF_LAG}
7777
if (!("testing_window" %in% names(params))) {params$testing_window <- TESTING_WINDOW}
7878
if (!("test_dates" %in% names(params)) || length(params$test_dates) == 0) {
79-
start_date <- TODAY - params$testing_window
80-
end_date <- TODAY - 1
79+
start_date <- TODAY - params$testing_window + 1
80+
end_date <- TODAY
8181
params$test_dates <- seq(start_date, end_date, by="days")
8282
} else {
8383
if (length(params$test_dates) != 2) {
@@ -218,3 +218,19 @@ make_key <- function(value_type, signal_suffix) {
218218
params_element_exists_and_valid <- function(params, key) {
219219
return(key %in% names(params) && !is.null(params[[key]]) && !is.na(params[[key]]))
220220
}
221+
222+
#' Assert a logical value
223+
#'
224+
#' Will issue a \code{stop} command if the given statement is false.
225+
#'
226+
#' @param statement a logical value
227+
#' @param msg a character string displayed as an additional message
228+
#'
229+
#' @export
230+
assert <- function(statement, msg="")
231+
{
232+
if (!statement)
233+
{
234+
stop(msg, call.=(msg==""))
235+
}
236+
}

backfill_corrections/delphiBackfillCorrection/man/assert.Rd

Lines changed: 16 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

backfill_corrections/delphiBackfillCorrection/man/get_issue_date_range.Rd

Lines changed: 33 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)