Allow direct append when no input file has been seen before

nmdefries · nmdefries · commit e43fc130a0aa · 2021-05-04T15:35:17.000-04:00
diff --git a/facebook/contingency-combine.R b/facebook/contingency-combine.R
@@ -14,6 +14,7 @@ suppressPackageStartupMessages({
   library(dplyr)
   library(readr)
   library(purrr)
+  library(delphiFacebook)
 })
 
 
@@ -28,7 +29,6 @@ suppressPackageStartupMessages({
 #'   open. By default, selects all `.csv` files with standard table date prefix.
 run_rollup <- function(input_dir, output_dir, pattern = "^[0-9]{8}_[0-9]{8}.*[.]csv$") {
   files <- list.files(input_dir, pattern = pattern)
-
   if (length(files) == 0) {
     stop("No matching data files.")
   }
@@ -38,12 +38,21 @@ run_rollup <- function(input_dir, output_dir, pattern = "^[0-9]{8}_[0-9]{8}.*[.]
   # Reformat files as a list such that input files with same grouping variables
   # (and thus same output file) are in a character vector named with the output
   # file.
-  files <- lapply(split(files, files$rollupname), function(x) {x$filename})
+  files <- lapply(split(files, files$rollup_name), function(x) {x$filename})
+  
+  if (!dir.exists(output_dir)) { dir.create(output_dir) }
+  seen_file <- file.path(output_dir, "seen.txt")
+  seen_files <- load_seen_file(seen_file)
   
   for (output_name in names(files)) {
-    combine_and_save_tables(
-      file.path(input_dir, files[[output_name]]), 
+    # Merge this group's input files into its rollup output file.
+    newly_seen_files <- combine_and_save_tables(
+      seen_files,
+      input_dir,
+      files[[output_name]], 
       file.path(output_dir, output_name))
+    # Append the newly seen filenames to the seen-file log.
+    write(newly_seen_files, seen_file, append=TRUE)
   }
   
   return(NULL)
@@ -55,24 +64,40 @@ get_file_properties <- function(filename) {
   parts <- strsplit(short, "_", fixed = TRUE)[[1]]
   
   group <- parts[3:length(parts)]
-  # Specify compression format in name, to be parsed by `write_csv` later.
-  partialname <- paste0(paste0(group, collapse="_"), ".csv.gz")
+  # Specify compression format via name, to be parsed by `write_csv` later.
+  partial_name <- paste0(paste0(group, collapse="_"), ".csv.gz")
 
   return(data.frame(
     filename=filename,
-    rollupname=partialname))
+    rollup_name=partial_name))
+}
+
+#' Load the "seen" file: names of input files already rolled up.
+#'
+#' @param seen_file Path to the "seen" file; created empty if missing.
+#' @return Character vector with one element per line of `seen_file`.
+load_seen_file <- function(seen_file) {
+  if (!file.exists(seen_file)) { file.create(seen_file) }
+  
+  readLines(seen_file)
 }
 
 #' Combine set of input files with existing output file, and save to disk.
 #'
-#' If a date range has been seen before, the input and output data are
+#' If an input filename has been seen before, the input and output data are
 #' deduplicated to use the newer set of data. Output is saved in gzip-compressed
 #' format.
 #'
+#' @param seen_files Vector of filenames that have been previously loaded into
+#'   an output file.
+#' @param input_dir Directory in which to look for survey CSV files, relative to
+#'   the current working directory.
 #' @param input_files Vector of paths to input files that share a set of
 #'   grouping variables.
 #' @param output_file Path to corresponding output file.
-combine_and_save_tables <- function(input_files, output_file) {
+#' 
+#' @return Character vector of newly-seen filenames.
+combine_and_save_tables <- function(seen_files, input_dir, input_files, output_file) {
   cols <- cols(
     .default = col_guess(),
     survey_geo = col_character(),
@@ -90,48 +115,60 @@ combine_and_save_tables <- function(input_files, output_file) {
     county_fips = col_character()
   )
   
+  # Get input data.
   input_df <- map_dfr(
-    input_files,
+    file.path(input_dir, input_files),
     function(f) {
       read_csv(f, col_types = cols)
     }
   )
   
-  if (!file.exists(output_file)) {
-    warning(paste0("Output file ", output_file, " does not exist. Creating a new copy."))
-    # Create an empty starting df with the expected column names, order, and type.
-    output_df <- input_df[FALSE,]
-  } else {
-    output_df <- read_csv(output_file, col_types = cols)
+  if (file.exists(output_file)) {
+    output_names <- names(read_csv(output_file, n_max = 0L))
+    assert(identical(output_names, names(input_df)),
+           paste0("Column names and/or order differ between new and old input for ", output_file))
   }
   
-  # For finding unique group/geo-level/date combinations, use all columns up to
-  # the first "val" column. This generalizes the process of finding unique rows,
-  # when we might be using different grouping variables or different geo levels
-  # (county/state/nation appear in different columns).
-  group_names <- names(output_df)
-  group_names <- group_names[ 1:min(which(startsWith(group_names, "val_")))-1 ]
+  # If no input files have been seen before, we can append directly to the
+  # output file without needing to deduplicate. File is created if it doesn't
+  # already exist.
+  any_prev_seen <- any(input_files %in% seen_files)
   
-  ## Deduplicate, keeping newest version by issue date of each unique row.
-  # Merge the new data with the existing data, taking the last issue date for
-  # any given grouping/geo level/date combo. This prevents duplication in case
-  # of reissues. Note that the order matters: since arrange() uses order(),
-  # which is a stable sort, ties will result in the input data being used in
-  # preference over the existing rollup data.
-  output_df <- bind_rows(output_df, input_df) %>%
-    arrange(issue_date) %>% 
-    group_by(across(all_of(group_names))) %>% 
-    slice_tail() %>% 
-    ungroup()
-  
-  # Automatically uses gzip compression based on output name.
-  write_csv(output_df, output_file)
+  if (!any_prev_seen) {
+    write_csv(input_df, output_file, append=file.exists(output_file))
+  } else {
+    assert(file.exists(output_file),
+           paste0("The output file ", output_file, " does not exist, but ",
+                  "non-zero files using the same grouping have been seen before."))
+    
+    output_df <- read_csv(output_file, col_types = cols)
+    
+    # Use all columns up to the first "val" column to find unique rows.
+    group_names <- names(output_df)
+    ind_first_val_col <- min(which(startsWith(group_names, "val_")))
+    group_names <- group_names[seq_len(ind_first_val_col - 1)]
+    
+    ## Deduplicate, keeping newest version by issue date of each unique row.
+    # Merge the new data with the existing data, taking the last issue date for
+    # any given grouping/geo level/date combo. This prevents duplication in case
+    # of reissues. Note that the order matters: since arrange() uses order(),
+    # which is a stable sort, ties will result in the input data being used in
+    # preference over the existing rollup data.
+    output_df <- bind_rows(output_df, input_df) %>%
+      arrange(issue_date) %>% 
+      group_by(across(all_of(group_names))) %>% 
+      slice_tail() %>% 
+      ungroup()
+    
+    # Automatically uses gzip compression based on output file name.
+    write_csv(output_df, output_file)
+  }
   
-  return(NULL)
+  newly_seen <- setdiff(input_files, seen_files)
+  return(newly_seen)
 }
 
 
-
 args <- commandArgs(TRUE)
 
 if (length(args) < 2) {