Commit e9b90cd

Merge pull request #1132 from cmu-delphi/contingency-autoset-input-files
Allow contingency runs to fetch sparse data if none provided
2 parents (f8b46ea + c18b666), commit e9b90cd

File tree

10 files changed: +176, -36 lines

facebook/contingency-combine.R

Lines changed: 1 addition & 1 deletion
@@ -189,7 +189,7 @@ write_rollup <- function(newly_seen_files, seen_file, output_df, output_file) {
 
 args <- commandArgs(TRUE)
 
-if (length(args) < 2) {
+if (length(args) != 2) {
   stop("Usage: Rscript contingency-combine.R path/to/individual/files/ path/to/rollup/files/")
 }
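As a side note, here is a minimal sketch (not part of the commit; the argument vectors are made up) of what the stricter check changes: the old `length(args) < 2` test silently accepted extra trailing arguments, while `length(args) != 2` rejects anything other than exactly two paths.

    # Hypothetical illustration of the argument check used above.
    check_args <- function(args) {
      if (length(args) != 2) {
        stop("Usage: Rscript contingency-combine.R path/to/individual/files/ path/to/rollup/files/")
      }
      invisible(args)
    }

    check_args(c("individual/", "rollup/"))            # exactly two arguments: passes
    try(check_args(c("individual/", "rollup/", "x")))  # a stray third argument now errors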

facebook/delphiFacebook/NAMESPACE

Lines changed: 1 addition & 0 deletions
@@ -18,6 +18,7 @@ export(get_filenames_in_range)
 export(get_range_prev_full_month)
 export(get_range_prev_full_period)
 export(get_range_prev_full_week)
+export(get_sparse_filenames)
 export(jeffreys_se)
 export(join_weights)
 export(load_archive)

facebook/delphiFacebook/R/contingency_utils.R

Lines changed: 80 additions & 18 deletions
@@ -1,7 +1,7 @@
 #' Return params file as an R list
 #'
 #' Reads a parameters file. Copies global params to contingency params if not
-#' already defined.
+#' already defined. Uses current date as end_date if not provided.
 #'
 #' @param path path to the parameters file; if not present, will try to copy the file
 #' "params.json.template"
@@ -17,15 +17,21 @@ read_contingency_params <- function(path = "params.json", template_path = "param
   contingency_params$start_time <- ymd_hms(
     sprintf("%s 00:00:00", contingency_params$start_date), tz = tz_to
   )
+
+  # Fill in end_date, if missing, with current date.
+  contingency_params$end_date <- if_else(
+    is.null(contingency_params$end_date), as.character(Sys.Date()), contingency_params$end_date
+  )
+
   contingency_params$end_time <- ymd_hms(
     sprintf("%s 23:59:59", contingency_params$end_date), tz = tz_to
   )
 
   global_params <- c("archive_days", "backfill_days", "static_dir", "cache_dir",
                      "archive_dir", "weights_in_dir", "input_dir", "debug",
-                     "parallel")
+                     "parallel", "qualtrics")
   for (param in global_params) {
-    if ( is.null(contingency_params[[param]]) ) {
+    if ( is.null(contingency_params[[param]]) & !is.null(params[[param]]) ) {
       contingency_params[[param]] <- params[[param]]
     }
   }
@@ -55,36 +61,45 @@ read_contingency_params <- function(path = "params.json", template_path = "param
 #'
 #' @export
 update_params <- function(params) {
-  # Fill in end_time, if missing, with current time.
-  if (is.null(params$end_time)) {
-    params$end_time <- Sys.time()
-  }
-
   # Construct aggregate date range.
   if ( !is.null(params$start_date) ) {
+    # If start_date is provided, use start/end dates exactly as given.
     date_range <- list(params$start_time, params$end_time)
   } else {
     # If start_date is not provided, assume want to use preceding full time period.
     date_range <- get_range_prev_full_period(
-      as_date(params$end_date)
-      , params$aggregate_range
+      as_date(params$end_date), params$aggregate_range
    )
  }
 
-  params$input <- get_filenames_in_range(date_range[[1]], date_range[[2]], params)
-  if ( length(params[["input"]]) == 0 || all(is.na(params[["input"]])) ) {
+  if ( is.null(params[["input"]]) || length(params$input) == 0 ) {
+    # If params$input empty or not provided, fetch filenames from input_dir.
+    params$input <- get_sparse_filenames(date_range[[1]], date_range[[2]], params)
+  } else {
+    # If input files provided, subset to those in desired date range.
+    params$input <- get_filenames_in_range(date_range[[1]], date_range[[2]], params)
+  }
+
+  # Overwrites contents of file of the same name.
+  writeLines(params$input, "contingency_input.txt")
+
+  if ( length(params[["input"]]) == 0 || all(is.na(params$input)) ) {
    stop("no input files to read in")
  }
 
  params$start_time <- date_range[[1]]
  params$end_time <- date_range[[2]]
  params$start_date <- as_date(date_range[[1]])
  params$end_date <- as_date(date_range[[2]])
-
+
  return(params)
 }
 
 #' Get relevant input data file names from `input_dir`.
+#'
+#' Only include files containing data that falls at least somewhat between start
+#' and end dates, and is from an allowed ("active") survey and not a "dormant"
+#' survey.
 #'
 #' @param start_date Start of desired date range
 #' @param end_date End of desired date range
@@ -101,17 +116,33 @@ get_filenames_in_range <- function(start_date, end_date, params) {
   start_date <- as_date(start_date) - days(params$backfill_days)
   end_date <- as_date(end_date)
 
-  if ( is.null(params$input) | length(params$input) == 0 ) {
-    date_pattern <- "^[0-9]{4}-[0-9]{2}-[0-9]{2}.*[.]csv$"
-    youtube_pattern <- ".*YouTube[.]csv$"
+  if ( is.null(params[["input"]]) || length(params$input) == 0 ) {
+    ## Keep all files from active surveys that appear in the input dir.
+
+    if ( !is.null(params[["qualtrics"]]) ) {
+      include_patterns <- names(params$qualtrics$surveys$active)
+      include_patterns <- gsub(" ", "_", include_patterns, fixed=TRUE)
+
+      exclude_patterns <- names(params$qualtrics$surveys$dormant)
+      exclude_patterns <- gsub(" ", "_", exclude_patterns, fixed=TRUE)
+    } else {
+      # If no active/dormant survey info provided, use basic patterns to
+      # include/exclude survey files.
+      include_patterns <- c("^[0-9]{4}-[0-9]{2}-[0-9]{2}.*[.]csv$")
+      exclude_patterns <- c(".*YouTube[.]csv$")
+    }
 
     filenames <- list.files(path=params$input_dir)
-    filenames <- filenames[grepl(date_pattern, filenames) & !grepl(youtube_pattern, filenames)]
+
+    include_map <- grepl(paste(include_patterns, collapse="|"), filenames)
+    exclude_map <- grepl(paste(exclude_patterns, collapse="|"), filenames)
+    filenames <- filenames[include_map & !exclude_map]
  } else {
    filenames <- params$input
  }
 
-  file_end_dates <- as_date(substr(filenames, 1, 10))
+  # Filenames are formatted as "{generation date}.{start date}.{end date}.{survey name}_-_{survey version}.csv".
+  file_end_dates <- as_date(substr(filenames, 23, 32))
   file_start_dates <- as_date(substr(filenames, 12, 21))
 
   # Only keep files with data that falls at least somewhat between the desired
@@ -123,6 +154,37 @@ get_filenames_in_range <- function(start_date, end_date, params) {
   return(filenames)
 }
 
+#' Get sparse list of input data files from `input_dir`.
+#'
+#' Finds every fourth + last file by date.
+#'
+#' @param start_date Start of desired date range
+#' @param end_date End of desired date range
+#' @param params Params object produced by read_params
+#'
+#' @return Character vector of filenames
+#'
+#' @importFrom lubridate as_date
+#'
+#' @export
+get_sparse_filenames <- function(start_date, end_date, params) {
+  if (params$use_input_asis) { return(params$input) }
+
+  filenames <- get_filenames_in_range(start_date, end_date, params)
+
+  file_end_dates <- as_date(substr(filenames, 23, 32))
+  unique_file_end_dates <- sort(unique(file_end_dates))
+
+  # Use every fourth date. Always keep last date.
+  keep_inds <- unique(c(
+    seq(1, length(unique_file_end_dates), 4L),
+    length(unique_file_end_dates)))
+  keep_dates <- unique_file_end_dates[keep_inds]
+  filenames <- filenames[file_end_dates %in% keep_dates]
+
+  return(filenames)
+}
+
 #' Check user-set aggregations for basic validity and add a few necessary cols.
 #'
 #' @param aggregations Data frame with columns `name`, `var_weight`, `metric`,
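To make the new selection rule concrete, here is a small standalone sketch (the dates are illustrative only) of the every-fourth-date-plus-last logic that `get_sparse_filenames` applies to the unique file end dates:

    library(lubridate)

    # Ten consecutive illustrative file end dates.
    unique_file_end_dates <- sort(as_date("2020-01-01") + 0:9)

    # Keep every fourth date, and always keep the last one.
    keep_inds <- unique(c(
      seq(1, length(unique_file_end_dates), 4L),
      length(unique_file_end_dates)
    ))
    unique_file_end_dates[keep_inds]
    #> [1] "2020-01-01" "2020-01-05" "2020-01-09" "2020-01-10"

Every file whose end date falls on one of the kept dates is retained, so multiple survey waves sharing an end date stay together (as the new unit test expects).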

facebook/delphiFacebook/R/contingency_write.R

Lines changed: 1 addition & 1 deletion
@@ -148,7 +148,7 @@ add_metadata_vars <- function(data, params, geo_type, groupby_vars) {
 #' @noRd
 get_file_name <- function(params, geo_type, groupby_vars) {
 
-  aggregation_type <- setdiff(groupby_vars, "geo_id")
+  aggregation_type <- sort(setdiff(groupby_vars, "geo_id"))
   if (length(aggregation_type) == 0) aggregation_type <- "overall"
 
   file_name <- paste(
facebook/delphiFacebook/integration-tests/testthat/teardown-run.R

Lines changed: 1 addition & 0 deletions
@@ -18,6 +18,7 @@ file.remove(test_path("archive"))
 file.remove(test_path("receiving_full"))
 file.remove(test_path("individual_full"))
 file.remove(test_path("receiving_contingency_full"))
+file.remove(test_path("contingency_input.txt"))
 
 if ( dir.exists(test_path("receiving_contingency_test")) ) {
   file.remove(test_path("receiving_contingency_test"))

facebook/delphiFacebook/man/get_filenames_in_range.Rd

Lines changed: 3 additions & 1 deletion
Generated documentation file; diff not shown.

facebook/delphiFacebook/man/get_sparse_filenames.Rd

Lines changed: 21 additions & 0 deletions
Generated documentation file; diff not shown.

facebook/delphiFacebook/man/read_contingency_params.Rd

Lines changed: 1 addition & 1 deletion
Generated documentation file; diff not shown.

facebook/delphiFacebook/unit-tests/testthat/test-contingency-utils.R

Lines changed: 63 additions & 14 deletions
@@ -10,7 +10,7 @@ test_that("testing update_params command", {
     use_input_asis = TRUE,
     aggregate_range = "month",
     end_date = "2020-02-01",
-    input_dir = "./input"
+    input_dir = "./static" # Using a directory that doesn't contain any valid data files.
   )
 
   expect_error(update_params(params), "no input files to read in")
@@ -30,8 +30,8 @@ test_that("testing update_params command", {
     use_input_asis = TRUE,
     aggregate_range = "month",
     end_date = ymd("2020-01-31"),
-    end_time = ymd_hms("2020-01-31 23:59:59", tz=timezone),
     start_time = ymd_hms("2020-01-01 00:00:00", tz=timezone),
+    end_time = ymd_hms("2020-01-31 23:59:59", tz=timezone),
     start_date = ymd("2020-01-01")
   )
 
@@ -42,13 +42,13 @@ test_that("testing update_params command", {
 test_that("testing get_filenames_in_range command", {
   tdir <- tempfile()
   files <- c(
-    "2019-11-06.2019-10-30.2020-11-06.Survey_of_COVID-Like_Illness_-_TODEPLOY_......_-_US_Expansion.csv",
-    "2019-12-31.2019-12-24_With_Translations.csv",
-    "2020-01-06.2019-12-31_Wave_4.csv",
-    "2020-01-16.2020-01-09_YouTube.csv",
-    "2020-01-16.2020-01-09_Wave_4.csv",
-    "2020-02-06.2020-01-31_Wave_4.csv",
-    "2020-02-16.2020-02-09_Wave_3.csv"
+    "2029-01-01.2019-10-30.2019-11-06.Survey_of_COVID-Like_Illness_-_TODEPLOY_......_-_US_Expansion.csv",
+    "2029-01-01.2019-12-24.2019-12-31_With_Translations.csv",
+    "2029-01-01.2019-12-31.2020-01-06_Wave_4.csv",
+    "2029-01-01.2020-01-09.2020-01-16_YouTube.csv",
+    "2029-01-01.2020-01-09.2020-01-16_Wave_4.csv",
+    "2029-01-01.2020-01-31.2020-02-06_Wave_4.csv",
+    "2029-01-01.2020-02-09.2020-02-16_Wave_3.csv"
   )
 
   create_dir_not_exist(tdir)
@@ -65,17 +65,66 @@ test_that("testing get_filenames_in_range command", {
   date_range <- list(ymd("2020-01-01"), ymd("2020-01-31"))
 
   expected_output <- c(
-    "2019-12-31.2019-12-24_With_Translations.csv",
-    "2020-01-06.2019-12-31_Wave_4.csv",
-    "2020-01-16.2020-01-09_Wave_4.csv",
-    "2020-02-06.2020-01-31_Wave_4.csv"
+    "2029-01-01.2019-12-24.2019-12-31_With_Translations.csv",
+    "2029-01-01.2019-12-31.2020-01-06_Wave_4.csv",
+    "2029-01-01.2020-01-09.2020-01-16_Wave_4.csv",
+    "2029-01-01.2020-01-31.2020-02-06_Wave_4.csv"
   )
 
   out <- get_filenames_in_range(date_range[[1]], date_range[[2]], params)
-
   expect_equal(out, expected_output)
 })
 
+
+test_that("testing get_sparse_filenames command", {
+  tdir <- tempfile()
+  files <- c(
+    "2021-12-11.2019-12-26.2020-01-01_Wave_4.csv",
+    "2021-12-11.2019-12-27.2020-01-02_Wave_4.csv",
+    "2021-12-11.2019-12-28.2020-01-03_Wave_4.csv",
+    "2021-12-11.2019-12-29.2020-01-04_Wave_4.csv",
+    "2021-12-11.2019-12-30.2020-01-05_Wave_4.csv",
+    "2021-12-11.2019-12-30.2020-01-05_Wave_5.csv",
+    "2021-12-11.2019-12-31.2020-01-06_Wave_4.csv",
+    "2021-12-11.2019-12-31.2020-01-06_Wave_5.csv",
+    "2021-12-11.2019-01-01.2020-01-07_Wave_4.csv",
+    "2021-12-11.2019-01-02.2020-01-08_Wave_4.csv",
+    "2021-12-11.2019-01-03.2020-01-09_Wave_4.csv",
+    "2021-12-11.2019-01-04.2020-01-10_Wave_4.csv",
+
+    "2011-12-11.2019-10-30.2019-11-06.2020-11-06.Survey_of_COVID-Like_Illness_-_TODEPLOY_......_-_US_Expansion.csv",
+    "2021-12-11.2020-01-09.2020-01-16_YouTube.csv",
+    "2021-12-11.2020-01-09.2020-01-16_Wave_4.csv",
+    "2021-12-11.2020-01-31.2020-02-06_Wave_4.csv",
+    "2021-12-11.2020-02-09.2020-02-16_Wave_3.csv"
+  )
+
+  create_dir_not_exist(tdir)
+  for (filename in files) {
+    write_csv(data.frame(), path = file.path(tdir, filename))
+  }
+
+  params <- list(
+    input = c(),
+    use_input_asis = FALSE,
+    backfill_days = 4,
+    input_dir = tdir
+  )
+  date_range <- list(ymd("2020-01-01"), ymd("2020-01-6"))
+
+  expected_output <- c(
+    "2021-12-11.2019-12-26.2020-01-01_Wave_4.csv",
+    "2021-12-11.2019-12-30.2020-01-05_Wave_4.csv",
+    "2021-12-11.2019-12-30.2020-01-05_Wave_5.csv",
+    "2021-12-11.2019-01-03.2020-01-09_Wave_4.csv",
+    "2021-12-11.2019-01-04.2020-01-10_Wave_4.csv"
+  )
+
+  out <- get_sparse_filenames(date_range[[1]], date_range[[2]], params)
+  expect_setequal(out, expected_output)
+})
+
+
 test_that("testing verify_aggs command", {
   # Duplicate rows
   input_aggs <- tribble(

facebook/delphiFacebook/unit-tests/testthat/test-contingency-write.R

Lines changed: 4 additions & 0 deletions
@@ -61,7 +61,11 @@ test_that("testing command to create output filenames", {
   out <- get_file_name(params, "nation", c("gender"))
   expected <- "DebugOn-DoNotShare_20210101_20210102_monthly_nation_gender.csv"
 
+  expect_equal(out, expected)
+
   params$debug <- FALSE
   out <- get_file_name(params, "nation", c("gender", "race", "ethnicity"))
   expected <- "20210101_20210102_monthly_nation_ethnicity_gender_race.csv"
+
+  expect_equal(out, expected)
 })
