Skip to content

Commit 28737b3

Browse files
committed
Use merge process if column names don't match
1 parent ea692bc commit 28737b3

File tree

1 file changed

+15
-8
lines changed

1 file changed

+15
-8
lines changed

facebook/contingency-combine.R

Lines changed: 15 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -128,15 +128,18 @@ combine_tables <- function(seen_file, input_dir, input_files, output_file) {
128128

129129
if (file.exists(output_file)) {
130130
output_names <- names(read_csv(output_file, n_max = 0L))
131-
assert(identical(output_names, names(input_df)),
132-
paste0("Column names and/or order differ between new and old input for ", output_file))
131+
identical_names <- identical(output_names, names(input_df))
132+
} else {
133+
identical_names <- TRUE
133134
}
134135

135136
seen_files <- load_seen_file(seen_file)
136137
any_prev_seen <- any(input_files %in% seen_files)
137138

138-
# If no input files have been seen before, we don't need to deduplicate.
139-
if (any_prev_seen) {
139+
# If input files have been seen before, we need to deduplicate. If there is a
140+
# mismatch between input and output column names/order, we need to explicitly
141+
# merge input and output data to make sure columns match up correctly.
142+
if (any_prev_seen || !identical_names) {
140143
assert(file.exists(output_file),
141144
paste0("The output file ", output_file, " does not exist, but non-zero",
142145
" files using the same grouping variables have been seen before."))
@@ -178,7 +181,7 @@ combine_tables <- function(seen_file, input_dir, input_files, output_file) {
178181
return(list(
179182
output_df=output_df,
180183
newly_seen_files=newly_seen,
181-
any_prev_seen=any_prev_seen))
184+
can_overwrite=any_prev_seen || !identical_names))
182185
}
183186

184187
#' Save a combined dataframe and list of seen files to disk.
@@ -188,19 +191,20 @@ combine_tables <- function(seen_file, input_dir, input_files, output_file) {
188191
#'
189192
#' @param combined_output Named list output from `combine_tables`. Contains an
190193
#' `output_df` dataframe, a list of newly seen files, and a flag indicating
191-
#' whether any input filenames have been seen before.
194+
#' whether we need to overwrite the existing output file or we can append to
195+
#' it.
192196
#' @param seen_file Path to file listing filenames that have been previously
193197
#' loaded into an output file.
194198
#' @param output_file Path to corresponding output file.
195199
write_rollup <- function(combined_output, seen_file, output_file) {
196200
output_df <- combined_output[["output_df"]]
197201
newly_seen_files <- combined_output[["newly_seen_files"]]
198-
any_prev_seen_files <- combined_output[["any_prev_seen"]]
202+
can_overwrite <- combined_output[["can_overwrite"]]
199203

200204
# If any input files have been seen before, or input and output column names
201205
# differ, overwrite the existing output file. Otherwise, append directly to the
202206
# output file. File is created if it doesn't already exist.
203-
if (any_prev_seen_files) {
207+
if (can_overwrite) {
204208
# Automatically uses gzip compression based on output file name. Overwrites
205209
# existing file of the same name.
206210
write_csv(output_df, output_file)
@@ -225,4 +229,7 @@ if (length(args) < 2) {
225229
input_path <- args[1]
226230
output_path <- args[2]
227231

232+
input_path <- "~/Downloads/0418_tables/"
233+
output_path <- "~/Downloads/rollup_test_FB_press_conf"
234+
228235
invisible(run_rollup(input_path, output_path))

0 commit comments

Comments
 (0)