Commit e9b90cd

Merge pull request #1132 from cmu-delphi/contingency-autoset-input-files
Allow contingency runs to fetch sparse data if none provided
2 parents (f8b46ea + c18b666), commit e9b90cd

File tree

10 files changed: +176, -36 lines

facebook/contingency-combine.R

Lines changed: 1 addition & 1 deletion
@@ -189,7 +189,7 @@ write_rollup <- function(newly_seen_files, seen_file, output_df, output_file) {
 
 args <- commandArgs(TRUE)
 
-if (length(args) < 2) {
+if (length(args) != 2) {
   stop("Usage: Rscript contingency-combine.R path/to/individual/files/ path/to/rollup/files/")
 }
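As a side note, here is a minimal sketch (not part of the commit; the argument vectors are made up) of what the stricter check changes: the old `length(args) < 2` test silently accepted extra trailing arguments, while `length(args) != 2` rejects anything other than exactly two paths.

    # Hypothetical illustration of the argument check used above.
    check_args <- function(args) {
      if (length(args) != 2) {
        stop("Usage: Rscript contingency-combine.R path/to/individual/files/ path/to/rollup/files/")
      }
      invisible(args)
    }

    check_args(c("individual/", "rollup/"))            # exactly two arguments: passes
    try(check_args(c("individual/", "rollup/", "x")))  # a stray third argument now errors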

facebook/delphiFacebook/NAMESPACE

Lines changed: 1 addition & 0 deletions
@@ -18,6 +18,7 @@ export(get_filenames_in_range)
 export(get_range_prev_full_month)
 export(get_range_prev_full_period)
 export(get_range_prev_full_week)
+export(get_sparse_filenames)
 export(jeffreys_se)
 export(join_weights)
 export(load_archive)

facebook/delphiFacebook/R/contingency_utils.R

Lines changed: 80 additions & 18 deletions
@@ -1,7 +1,7 @@
 #' Return params file as an R list
 #'
 #' Reads a parameters file. Copies global params to contingency params if not
-#' already defined.
+#' already defined. Uses current date as end_date if not provided.
 #'
 #' @param path path to the parameters file; if not present, will try to copy the file
 #' "params.json.template"
@@ -17,15 +17,21 @@ read_contingency_params <- function(path = "params.json", template_path = "param
   contingency_params$start_time <- ymd_hms(
     sprintf("%s 00:00:00", contingency_params$start_date), tz = tz_to
   )
+
+  # Fill in end_date, if missing, with current date.
+  contingency_params$end_date <- if_else(
+    is.null(contingency_params$end_date), as.character(Sys.Date()), contingency_params$end_date
+  )
+
   contingency_params$end_time <- ymd_hms(
     sprintf("%s 23:59:59", contingency_params$end_date), tz = tz_to
   )
 
   global_params <- c("archive_days", "backfill_days", "static_dir", "cache_dir",
                      "archive_dir", "weights_in_dir", "input_dir", "debug",
-                     "parallel")
+                     "parallel", "qualtrics")
   for (param in global_params) {
-    if ( is.null(contingency_params[[param]]) ) {
+    if ( is.null(contingency_params[[param]]) & !is.null(params[[param]]) ) {
       contingency_params[[param]] <- params[[param]]
     }
   }
@@ -55,36 +61,45 @@ read_contingency_params <- function(path = "params.json", template_path = "param
 #'
 #' @export
 update_params <- function(params) {
-  # Fill in end_time, if missing, with current time.
-  if (is.null(params$end_time)) {
-    params$end_time <- Sys.time()
-  }
-
   # Construct aggregate date range.
   if ( !is.null(params$start_date) ) {
+    # If start_date is provided, use start/end dates exactly as given.
     date_range <- list(params$start_time, params$end_time)
   } else {
     # If start_date is not provided, assume want to use preceding full time period.
     date_range <- get_range_prev_full_period(
-      as_date(params$end_date)
-      , params$aggregate_range
+      as_date(params$end_date), params$aggregate_range
    )
  }
 
-  params$input <- get_filenames_in_range(date_range[[1]], date_range[[2]], params)
-  if ( length(params[["input"]]) == 0 || all(is.na(params[["input"]])) ) {
+  if ( is.null(params[["input"]]) || length(params$input) == 0 ) {
+    # If params$input empty or not provided, fetch filenames from input_dir.
+    params$input <- get_sparse_filenames(date_range[[1]], date_range[[2]], params)
+  } else {
+    # If input files provided, subset to those in desired date range.
+    params$input <- get_filenames_in_range(date_range[[1]], date_range[[2]], params)
+  }
+
+  # Overwrites contents of file of the same name.
+  writeLines(params$input, "contingency_input.txt")
+
+  if ( length(params[["input"]]) == 0 || all(is.na(params$input)) ) {
    stop("no input files to read in")
  }
 
  params$start_time <- date_range[[1]]
  params$end_time <- date_range[[2]]
  params$start_date <- as_date(date_range[[1]])
  params$end_date <- as_date(date_range[[2]])
-
+
  return(params)
 }
 
 #' Get relevant input data file names from `input_dir`.
+#'
+#' Only include files containing data that falls at least somewhat between start
+#' and end dates, and is from an allowed ("active") survey and not a "dormant"
+#' survey.
 #'
 #' @param start_date Start of desired date range
 #' @param end_date End of desired date range
@@ -101,17 +116,33 @@ get_filenames_in_range <- function(start_date, end_date, params) {
   start_date <- as_date(start_date) - days(params$backfill_days)
   end_date <- as_date(end_date)
 
-  if ( is.null(params$input) | length(params$input) == 0 ) {
-    date_pattern <- "^[0-9]{4}-[0-9]{2}-[0-9]{2}.*[.]csv$"
-    youtube_pattern <- ".*YouTube[.]csv$"
+  if ( is.null(params[["input"]]) || length(params$input) == 0 ) {
+    ## Keep all files from active surveys that appear in the input dir.
+
+    if ( !is.null(params[["qualtrics"]]) ) {
+      include_patterns <- names(params$qualtrics$surveys$active)
+      include_patterns <- gsub(" ", "_", include_patterns, fixed=TRUE)
+
+      exclude_patterns <- names(params$qualtrics$surveys$dormant)
+      exclude_patterns <- gsub(" ", "_", exclude_patterns, fixed=TRUE)
+    } else {
+      # If no active/dormant survey info provided, use basic patterns to
+      # include/exclude survey files.
+      include_patterns <- c("^[0-9]{4}-[0-9]{2}-[0-9]{2}.*[.]csv$")
+      exclude_patterns <- c(".*YouTube[.]csv$")
+    }
 
     filenames <- list.files(path=params$input_dir)
-    filenames <- filenames[grepl(date_pattern, filenames) & !grepl(youtube_pattern, filenames)]
+
+    include_map <- grepl(paste(include_patterns, collapse="|"), filenames)
+    exclude_map <- grepl(paste(exclude_patterns, collapse="|"), filenames)
+    filenames <- filenames[include_map & !exclude_map]
  } else {
    filenames <- params$input
  }
 
-  file_end_dates <- as_date(substr(filenames, 1, 10))
+  # Filenames are formatted as "{generation date}.{start date}.{end date}.{survey name}_-_{survey version}.csv".
+  file_end_dates <- as_date(substr(filenames, 23, 32))
   file_start_dates <- as_date(substr(filenames, 12, 21))
 
   # Only keep files with data that falls at least somewhat between the desired
@@ -123,6 +154,37 @@ get_filenames_in_range <- function(start_date, end_date, params) {
   return(filenames)
 }
 
+#' Get sparse list of input data files from `input_dir`.
+#'
+#' Finds every fourth + last file by date.
+#'
+#' @param start_date Start of desired date range
+#' @param end_date End of desired date range
+#' @param params Params object produced by read_params
+#'
+#' @return Character vector of filenames
+#'
+#' @importFrom lubridate as_date
+#'
+#' @export
+get_sparse_filenames <- function(start_date, end_date, params) {
+  if (params$use_input_asis) { return(params$input) }
+
+  filenames <- get_filenames_in_range(start_date, end_date, params)
+
+  file_end_dates <- as_date(substr(filenames, 23, 32))
+  unique_file_end_dates <- sort(unique(file_end_dates))
+
+  # Use every fourth date. Always keep last date.
+  keep_inds <- unique(c(
+    seq(1, length(unique_file_end_dates), 4L),
+    length(unique_file_end_dates)))
+  keep_dates <- unique_file_end_dates[keep_inds]
+  filenames <- filenames[file_end_dates %in% keep_dates]
+
+  return(filenames)
+}
+
 #' Check user-set aggregations for basic validity and add a few necessary cols.
 #'
 #' @param aggregations Data frame with columns `name`, `var_weight`, `metric`,
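To make the new selection rule concrete, here is a small standalone sketch (the dates are illustrative only) of the every-fourth-date-plus-last logic that `get_sparse_filenames` applies to the unique file end dates:

    library(lubridate)

    # Ten consecutive illustrative file end dates.
    unique_file_end_dates <- sort(as_date("2020-01-01") + 0:9)

    # Keep every fourth date, and always keep the last one.
    keep_inds <- unique(c(
      seq(1, length(unique_file_end_dates), 4L),
      length(unique_file_end_dates)
    ))
    unique_file_end_dates[keep_inds]
    #> [1] "2020-01-01" "2020-01-05" "2020-01-09" "2020-01-10"

Every file whose end date falls on one of the kept dates is retained, so multiple survey waves sharing an end date stay together (as the new unit test expects).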

facebook/delphiFacebook/R/contingency_write.R

Lines changed: 1 addition & 1 deletion
@@ -148,7 +148,7 @@ add_metadata_vars <- function(data, params, geo_type, groupby_vars) {
 #' @noRd
 get_file_name <- function(params, geo_type, groupby_vars) {
 
-  aggregation_type <- setdiff(groupby_vars, "geo_id")
+  aggregation_type <- sort(setdiff(groupby_vars, "geo_id"))
   if (length(aggregation_type) == 0) aggregation_type <- "overall"
 
   file_name <- paste(
facebook/delphiFacebook/integration-tests/testthat/teardown-run.R

Lines changed: 1 addition & 0 deletions
@@ -18,6 +18,7 @@ file.remove(test_path("archive"))
 file.remove(test_path("receiving_full"))
 file.remove(test_path("individual_full"))
 file.remove(test_path("receiving_contingency_full"))
+file.remove(test_path("contingency_input.txt"))
 
 if ( dir.exists(test_path("receiving_contingency_test")) ) {
   file.remove(test_path("receiving_contingency_test"))

facebook/delphiFacebook/man/get_filenames_in_range.Rd

Lines changed: 3 additions & 1 deletion
Generated documentation file; diff not shown.

facebook/delphiFacebook/man/get_sparse_filenames.Rd

Lines changed: 21 additions & 0 deletions
Generated documentation file; diff not shown.

facebook/delphiFacebook/man/read_contingency_params.Rd

Lines changed: 1 addition & 1 deletion
Generated documentation file; diff not shown.

facebook/delphiFacebook/unit-tests/testthat/test-contingency-utils.R

Lines changed: 63 additions & 14 deletions
@@ -10,7 +10,7 @@ test_that("testing update_params command", {
     use_input_asis = TRUE,
     aggregate_range = "month",
     end_date = "2020-02-01",
-    input_dir = "./input"
+    input_dir = "./static" # Using a directory that doesn't contain any valid data files.
   )
 
   expect_error(update_params(params), "no input files to read in")
@@ -30,8 +30,8 @@ test_that("testing update_params command", {
     use_input_asis = TRUE,
     aggregate_range = "month",
     end_date = ymd("2020-01-31"),
-    end_time = ymd_hms("2020-01-31 23:59:59", tz=timezone),
     start_time = ymd_hms("2020-01-01 00:00:00", tz=timezone),
+    end_time = ymd_hms("2020-01-31 23:59:59", tz=timezone),
     start_date = ymd("2020-01-01")
   )
 
@@ -42,13 +42,13 @@ test_that("testing update_params command", {
 test_that("testing get_filenames_in_range command", {
   tdir <- tempfile()
   files <- c(
-    "2019-11-06.2019-10-30.2020-11-06.Survey_of_COVID-Like_Illness_-_TODEPLOY_......_-_US_Expansion.csv",
-    "2019-12-31.2019-12-24_With_Translations.csv",
-    "2020-01-06.2019-12-31_Wave_4.csv",
-    "2020-01-16.2020-01-09_YouTube.csv",
-    "2020-01-16.2020-01-09_Wave_4.csv",
-    "2020-02-06.2020-01-31_Wave_4.csv",
-    "2020-02-16.2020-02-09_Wave_3.csv"
+    "2029-01-01.2019-10-30.2019-11-06.Survey_of_COVID-Like_Illness_-_TODEPLOY_......_-_US_Expansion.csv",
+    "2029-01-01.2019-12-24.2019-12-31_With_Translations.csv",
+    "2029-01-01.2019-12-31.2020-01-06_Wave_4.csv",
+    "2029-01-01.2020-01-09.2020-01-16_YouTube.csv",
+    "2029-01-01.2020-01-09.2020-01-16_Wave_4.csv",
+    "2029-01-01.2020-01-31.2020-02-06_Wave_4.csv",
+    "2029-01-01.2020-02-09.2020-02-16_Wave_3.csv"
   )
 
   create_dir_not_exist(tdir)
@@ -65,17 +65,66 @@ test_that("testing get_filenames_in_range command", {
   date_range <- list(ymd("2020-01-01"), ymd("2020-01-31"))
 
   expected_output <- c(
-    "2019-12-31.2019-12-24_With_Translations.csv",
-    "2020-01-06.2019-12-31_Wave_4.csv",
-    "2020-01-16.2020-01-09_Wave_4.csv",
-    "2020-02-06.2020-01-31_Wave_4.csv"
+    "2029-01-01.2019-12-24.2019-12-31_With_Translations.csv",
+    "2029-01-01.2019-12-31.2020-01-06_Wave_4.csv",
+    "2029-01-01.2020-01-09.2020-01-16_Wave_4.csv",
+    "2029-01-01.2020-01-31.2020-02-06_Wave_4.csv"
   )
 
   out <- get_filenames_in_range(date_range[[1]], date_range[[2]], params)
-
   expect_equal(out, expected_output)
 })
 
+
+test_that("testing get_sparse_filenames command", {
+  tdir <- tempfile()
+  files <- c(
+    "2021-12-11.2019-12-26.2020-01-01_Wave_4.csv",
+    "2021-12-11.2019-12-27.2020-01-02_Wave_4.csv",
+    "2021-12-11.2019-12-28.2020-01-03_Wave_4.csv",
+    "2021-12-11.2019-12-29.2020-01-04_Wave_4.csv",
+    "2021-12-11.2019-12-30.2020-01-05_Wave_4.csv",
+    "2021-12-11.2019-12-30.2020-01-05_Wave_5.csv",
+    "2021-12-11.2019-12-31.2020-01-06_Wave_4.csv",
+    "2021-12-11.2019-12-31.2020-01-06_Wave_5.csv",
+    "2021-12-11.2019-01-01.2020-01-07_Wave_4.csv",
+    "2021-12-11.2019-01-02.2020-01-08_Wave_4.csv",
+    "2021-12-11.2019-01-03.2020-01-09_Wave_4.csv",
+    "2021-12-11.2019-01-04.2020-01-10_Wave_4.csv",
+
+    "2011-12-11.2019-10-30.2019-11-06.2020-11-06.Survey_of_COVID-Like_Illness_-_TODEPLOY_......_-_US_Expansion.csv",
+    "2021-12-11.2020-01-09.2020-01-16_YouTube.csv",
+    "2021-12-11.2020-01-09.2020-01-16_Wave_4.csv",
+    "2021-12-11.2020-01-31.2020-02-06_Wave_4.csv",
+    "2021-12-11.2020-02-09.2020-02-16_Wave_3.csv"
+  )
+
+  create_dir_not_exist(tdir)
+  for (filename in files) {
+    write_csv(data.frame(), path = file.path(tdir, filename))
+  }
+
+  params <- list(
+    input = c(),
+    use_input_asis = FALSE,
+    backfill_days = 4,
+    input_dir = tdir
+  )
+  date_range <- list(ymd("2020-01-01"), ymd("2020-01-6"))
+
+  expected_output <- c(
+    "2021-12-11.2019-12-26.2020-01-01_Wave_4.csv",
+    "2021-12-11.2019-12-30.2020-01-05_Wave_4.csv",
+    "2021-12-11.2019-12-30.2020-01-05_Wave_5.csv",
+    "2021-12-11.2019-01-03.2020-01-09_Wave_4.csv",
+    "2021-12-11.2019-01-04.2020-01-10_Wave_4.csv"
+  )
+
+  out <- get_sparse_filenames(date_range[[1]], date_range[[2]], params)
+  expect_setequal(out, expected_output)
+})
+
+
 test_that("testing verify_aggs command", {
   # Duplicate rows
   input_aggs <- tribble(

facebook/delphiFacebook/unit-tests/testthat/test-contingency-write.R

Lines changed: 4 additions & 0 deletions
@@ -61,7 +61,11 @@ test_that("testing command to create output filenames", {
   out <- get_file_name(params, "nation", c("gender"))
   expected <- "DebugOn-DoNotShare_20210101_20210102_monthly_nation_gender.csv"
 
+  expect_equal(out, expected)
+
   params$debug <- FALSE
   out <- get_file_name(params, "nation", c("gender", "race", "ethnicity"))
   expected <- "20210101_20210102_monthly_nation_ethnicity_gender_race.csv"
+
+  expect_equal(out, expected)
 })
