Use col spec from input df to read output file

nmdefries · nmdefries · commit 165a15f64233 · 2021-05-06T15:10:23.000-04:00
By default, readr's column guessing procedure uses only the first 1000
lines of a file to guess the variable type for each column. If a column
is completely missing for the first 1000 lines, it is read in as a
logical, which causes parsing failures if the column contains
non-boolean values later, outside the type-guessing range.

This happens when reading in output files if an indicator was newly
added. To specify the column types correctly, use the column
specification from the input file(s). All columns included in input
files are at least partially non-missing and sorted alphabetically
(independent of missingness), so we should always see non-missing values
in the first 1000 lines.
diff --git a/facebook/contingency-combine.R b/facebook/contingency-combine.R
@@ -114,12 +114,12 @@ combine_tables <- function(seen_file, input_dir, input_files, output_file) {
     county_fips = col_character()
   )
   
-  # Get input data. Make sure `issue_date` is the last column after combining.
+  # Get input data.
   input_df <- map_dfr(
     file.path(input_dir, input_files),
     function(f) {
       read_csv(f, col_types = cols)
-    }) %>% relocate(issue_date, .after=last_col())
+    })
   
   seen_files <- load_seen_file(seen_file)
   if (any(input_files %in% seen_files)) {
@@ -128,6 +128,7 @@ combine_tables <- function(seen_file, input_dir, input_files, output_file) {
                   " files using the same grouping variables have been seen before."))
   }
   
+  cols <- cols_condense(spec(input_df))
   if ( file.exists(output_file) ) {
     output_df <- read_csv(output_file, col_types = cols)
   } else {