Skip to content

Commit db6c4bf

Browse files
committed
Create script to combine contingency tables with shared grouping vars
1 parent 237edd6 commit db6c4bf

File tree

1 file changed

+144
-0
lines changed

1 file changed

+144
-0
lines changed

facebook/contingency-combine.R

Lines changed: 144 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,144 @@
1+
#!/usr/bin/env Rscript
2+
3+
## Combine and compress contingency tables by aggregation.
4+
##
5+
## Usage:
6+
##
7+
## Rscript contingency-combine.R path/to/individual/files/ path/to/rollup/files/
8+
##
9+
## Appends a set of newly-generated contingency tables to a rollup CSV that
10+
## contains all dates for a given set of grouping variables. Can also be used to
11+
## combine a directory of tables spanning multiple time periods.
12+
13+
suppressPackageStartupMessages({
14+
library(dplyr)
15+
library(readr)
16+
library(purrr)
17+
})
18+
19+
20+
#' Fetch all tables in a chosen directory and combine according to grouping
21+
#' used.
22+
#'
23+
#' @param input_dir Directory in which to look for survey CSV files, relative to
24+
#' the current working directory.
25+
#' @param output_dir Directory in which to look for existing rollup files or
26+
#' create new ones, relative to the current working directory.
27+
#' @param pattern Regular expression indicating which files in that directory to
28+
#' open. By default, selects all `.csv` files with standard table date prefix.
29+
run_rollup <- function(input_dir, output_dir, pattern = "^[0-9]{8}_[0-9]{8}.*[.]csv$") {
30+
files <- list.files(input_dir, pattern = pattern)
31+
32+
if (length(files) == 0) {
33+
stop("No matching data files.")
34+
}
35+
36+
files <- map_dfr(files, get_file_properties)
37+
38+
# Reformat files as a list such that input files with same grouping variables
39+
# (and thus same output file) are in a character vector named with the output
40+
# file.
41+
files <- lapply(split(files, files$rollupname), function(x) {x$filename})
42+
43+
for (output_name in names(files)) {
44+
combine_and_save_tables(
45+
file.path(input_dir, files[[output_name]]),
46+
file.path(output_dir, output_name))
47+
}
48+
49+
return(NULL)
50+
}
51+
52+
## Helper function to extract info from each file's filename.
53+
get_file_properties <- function(filename) {
54+
short <- strsplit(filename, ".", fixed = TRUE)[[1]][1]
55+
parts <- strsplit(short, "_", fixed = TRUE)[[1]]
56+
57+
group <- parts[3:length(parts)]
58+
# Specify compression format in name, to be parsed by `write_csv` later.
59+
partialname <- paste0(paste0(group, collapse="_"), ".csv.gz")
60+
61+
return(data.frame(
62+
filename=filename,
63+
rollupname=partialname))
64+
}
65+
66+
#' Combine set of input files with existing output file, and save to disk.
67+
#'
68+
#' If a date range has been seen before, the input and output data are
69+
#' deduplicated to use the newer set of data. Output is saved in gzip-compressed
70+
#' format.
71+
#'
72+
#' @param input_files Vector of paths to input files that share a set of
73+
#' grouping variables.
74+
#' @param output_file Path to corresponding output file.
75+
combine_and_save_tables <- function(input_files, output_file) {
76+
cols <- cols(
77+
.default = col_guess(),
78+
survey_geo = col_character(),
79+
period_type = col_character(),
80+
geo_type = col_character(),
81+
aggregation_type = col_character(),
82+
country = col_character(),
83+
ISO_3 = col_character(),
84+
GID_0 = col_character(),
85+
region = col_character(),
86+
GID_1 = col_character(),
87+
state = col_character(),
88+
state_fips = col_character(),
89+
county = col_character(),
90+
county_fips = col_character()
91+
)
92+
93+
input_df <- map_dfr(
94+
input_files,
95+
function(f) {
96+
read_csv(f, col_types = cols)
97+
}
98+
)
99+
100+
if (!file.exists(output_file)) {
101+
warning(paste0("Output file ", output_file, " does not exist. Creating a new copy."))
102+
# Create an empty starting df with the expected column names, order, and type.
103+
output_df <- input_df[FALSE,]
104+
} else {
105+
output_df <- read_csv(output_file, col_types = cols)
106+
}
107+
108+
# For finding unique group/geo-level/date combinations, use all columns up to
109+
# the first "val" column. This generalizes the process of finding unique rows,
110+
# when we might be using different grouping variables or different geo levels
111+
# (county/state/nation appear in different columns).
112+
group_names <- names(output_df)
113+
group_names <- group_names[ 1:min(which(startsWith(group_names, "val_")))-1 ]
114+
115+
## Deduplicate, keeping newest version by issue date of each unique row.
116+
# Merge the new data with the existing data, taking the last issue date for
117+
# any given grouping/geo level/date combo. This prevents duplication in case
118+
# of reissues. Note that the order matters: since arrange() uses order(),
119+
# which is a stable sort, ties will result in the input data being used in
120+
# preference over the existing rollup data.
121+
output_df <- bind_rows(output_df, input_df) %>%
122+
arrange(issue_date) %>%
123+
group_by(across(all_of(group_names))) %>%
124+
slice_tail() %>%
125+
ungroup()
126+
127+
# Automatically uses gzip compression based on output name.
128+
write_csv(output_df, output_file)
129+
130+
return(NULL)
131+
}
132+
133+
134+
135+
args <- commandArgs(TRUE)
136+
137+
if (length(args) < 2) {
138+
stop("Usage: Rscript contingency-combine.R path/to/individual/files/ path/to/rollup/files/")
139+
}
140+
141+
input_path <- args[1]
142+
output_path <- args[2]
143+
144+
invisible(run_rollup(input_path, output_path))

0 commit comments

Comments
 (0)