
Commit 7467891 (2 parents: 45ea5f1 + 41dcbbe)

Merge pull request #1011 from davidkretch/output

Add v0.2 metadata to contingency table output files, output NA rows

File tree: 5 files changed, +227 −40 lines

facebook/delphiFacebook/NAMESPACE

Lines changed: 1 addition & 0 deletions
@@ -63,6 +63,7 @@ importFrom(dplyr,across)
 importFrom(dplyr,all_of)
 importFrom(dplyr,anti_join)
 importFrom(dplyr,arrange)
+importFrom(dplyr,bind_cols)
 importFrom(dplyr,bind_rows)
 importFrom(dplyr,case_when)
 importFrom(dplyr,coalesce)

facebook/delphiFacebook/R/contingency_aggregate.R

Lines changed: 3 additions & 7 deletions
@@ -264,15 +264,10 @@ summarize_aggs <- function(df, crosswalk_data, aggregations, geo_level, params)
   }
 
   ## Find all unique groups and associated frequencies, saved in column `Freq`.
-  # Keep rows with missing values initially so that we get the correct column
-  # names. Explicitly drop groups with missing values in second step.
   unique_groups_counts <- as.data.frame(
     table(df[, group_vars, with=FALSE], exclude=NULL, dnn=group_vars),
     stringsAsFactors=FALSE
   )
-  unique_groups_counts <- unique_groups_counts[
-    complete.cases(unique_groups_counts[, group_vars]),
-  ]
 
   # Drop groups with less than threshold sample size.
   unique_groups_counts <- filter(unique_groups_counts, Freq >= params$num_filter)
@@ -327,9 +322,10 @@ summarize_aggs <- function(df, crosswalk_data, aggregations, geo_level, params)
     aggregation <- aggregations$id[row]
     group_vars <- aggregations$group_by[[row]]
     post_fn <- aggregations$post_fn[[row]]
-
+
+    # Keep only aggregations where the main value, `val`, is present.
     dfs_out[[aggregation]] <- dfs_out[[aggregation]][
-      rowSums(is.na(dfs_out[[aggregation]][, c("val", "sample_size", group_vars)])) == 0,
+      rowSums(is.na(dfs_out[[aggregation]][, c("val", "sample_size")])) == 0,
     ]
 
     dfs_out[[aggregation]] <- apply_privacy_censoring(dfs_out[[aggregation]], params)
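The net effect of these two hunks: groups containing a missing level are no longer dropped up front, and a row now survives as long as `val` and `sample_size` are present. A minimal sketch of the mechanism, not part of the commit (toy data; the real pipeline indexes with data.table's `with=FALSE`, but base data frames behave the same here):

# table(..., exclude = NULL) keeps NA as its own level, so groups with a
# missing value get their own count row instead of being silently dropped.
df <- data.frame(gender = c("f", "m", NA, "f"), stringsAsFactors = FALSE)

unique_groups_counts <- as.data.frame(
  table(df[, "gender", drop = FALSE], exclude = NULL, dnn = "gender"),
  stringsAsFactors = FALSE
)
unique_groups_counts
#   gender Freq
# 1      f    2
# 2      m    1
# 3   <NA>    1   <- previously removed by the complete.cases() step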

facebook/delphiFacebook/R/contingency_write.R

Lines changed: 161 additions & 26 deletions
@@ -3,12 +3,15 @@
 #' CSV name includes date specifying start of time period aggregated, geo level,
 #' and grouping variables.
 #'
-#' @param data a data frame to save; must contain the columns "geo_id", "val",
-#'   "se", "sample_size", and grouping variables. The first four are saved in the
-#'   output; day is used for spliting the data into files.
-#' @param params a named list, containing the value "export_dir" indicating the
-#'   directory where the csv should be saved
-#' @param geo_level name of the geographic level; used for naming the output file
+#' @param data a data frame to save; must contain the columns in
+#'   `groupby_vars`.
+#' @param params a named list, containing the values:
+#'   "export_dir" - directory where the csv should be saved
+#'   "static_dir" - directory where the state lookup file is
+#'   "aggregate_range" - "month", "week", etc.
+#'   "start_date" - start date of the aggregate range
+#'   "end_date" - end date of the aggregate range
+#' @param geo_type name of the geographic level; used for naming the output file
 #' @param groupby_vars character vector of column names used for grouping to
 #'   calculate aggregations; used for naming the output file
 #'
@@ -17,35 +20,167 @@
 #' @importFrom stringi stri_trim
 #'
 #' @export
-write_contingency_tables <- function(data, params, geo_level, groupby_vars)
+write_contingency_tables <- function(data, params, geo_type, groupby_vars)
 {
   if (!is.null(data) && nrow(data) != 0) {
-    data <- arrange(data, across(all_of(groupby_vars)))
+
+    # Reorder the group-by columns and sort the dataset by them.
+    groupby_vars <- c("geo_id", sort(setdiff(groupby_vars, "geo_id")))
+    data <- data %>%
+      select(all_of(groupby_vars), everything()) %>%
+      arrange(across(all_of(groupby_vars)))
 
     # Format reported columns.
-    data <- mutate_at(data, vars(-c(groupby_vars)),
-      function(x) {
-        stri_trim(
-          formatC(as.numeric(x), digits=7, format="f", drop0trailing=TRUE)
-        )
-      })
-
-    # Reduce verbosity of grouping vars for output purposes
-    groupby_vars <- gsub("_", "", sub(
-      ".+?_", "", groupby_vars[groupby_vars != "geo_id"]))
-    filename <- sprintf("%s_%s.csv", format(params$start_date, "%Y%m%d"),
-                        paste(c(geo_level, groupby_vars), collapse="_"))
-    file_out <- file.path(params$export_dir, filename)
-
+    format_number <- function(x) {
+      stri_trim(formatC(as.numeric(x), digits=7, format="f", drop0trailing=TRUE))
+    }
+    data <- mutate_at(data, vars(-groupby_vars), format_number)
+
+    # Add standard geographic and metadata variables to the data.
+    data <- add_geo_vars(data, params, geo_type)
+    data <- add_metadata_vars(data, params, geo_type, groupby_vars)
+
     create_dir_not_exist(params$export_dir)
-
-    msg_df(sprintf("saving contingency table data to %-35s", filename), data)
-    write_csv(data, file_out)
+
+    file_name <- get_file_name(params, geo_type, groupby_vars)
+    msg_df(sprintf("saving contingency table data to %-35s", file_name), data)
+    write_csv(data, file.path(params$export_dir, file_name))
 
   } else {
     msg_plain(sprintf(
      "no aggregations produced for grouping variables %s (%s); CSV will not be saved",
-      paste(groupby_vars, collapse=", "), geo_level
+      paste(groupby_vars, collapse=", "), geo_type
    ))
  }
 }
+
+#' Add geographic variables to a dataset, e.g. state and state FIPS codes.
+#'
+#' @param data A data frame, containing the variables in groupby_vars.
+#' @param params A parameters object with the `static_dir` resources folder.
+#' @param geo_type "nation", "state".
+#'
+#' @importFrom dplyr bind_cols left_join select
+#' @importFrom readr read_csv cols
+#' @noRd
+add_geo_vars <- function(data, params, geo_type) {
+
+  overall <- "Overall"
+
+  first <- data.frame(
+    country = "United States",
+    ISO_3 = "USA",
+    GID_0 = "USA"
+  )
+
+  if (geo_type == "nation") {
+
+    rest <- data.frame(
+      region = overall,
+      GID_1 = NA_character_,
+      state = overall,
+      state_fips = NA_character_,
+      county = overall,
+      county_fips = NA_character_
+    )
+
+  } else if (geo_type == "state") {
+
+    states <- read_csv(
+      file.path(params$static_dir, "state_list.csv"),
+      col_types = cols(.default = "c")
+    )
+
+    rest <- data.frame(
+      region = toupper(data$geo_id),
+      state = toupper(data$geo_id),
+      county = overall,
+      county_fips = NA_character_
+    )
+
+    rest <- left_join(rest, states, by = "state") %>%
+      select(region, GID_1, state, state_fips, county, county_fips)
+  }
+
+  geo_vars <- bind_cols(first, rest)
+
+  # Insert the geographic variables in place of the "geo_id" variable.
+  index <- which(names(data) == "geo_id")
+  before <- if (index > 1) data[1:(index-1)] else NULL
+  after <- data[(index+1):ncol(data)]
+  result <- bind_cols(before, geo_vars, after)
+
+  return(result)
+}
+
+#' Add metadata variables to a dataset, e.g. start and end dates.
+#'
+#' @param data A data frame, containing the variables in `groupby_vars`.
+#' @param params A parameters object containing start & end date, period, etc.
+#' @param geo_type "nation", "state", "county".
+#' @param groupby_vars A list of variables `data` is aggregated by.
+#'
+#' @importFrom dplyr bind_cols
+#' @noRd
+add_metadata_vars <- function(data, params, geo_type, groupby_vars) {
+
+  aggregation_type <- setdiff(groupby_vars, "geo_id")
+  if (length(aggregation_type) == 0) aggregation_type <- "overall"
+
+  # Add metadata about this period and level of aggregation.
+  metadata <- data.frame(
+    survey_geo = "us",
+    period_start = format(params$start_date, "%Y%m%d"),
+    period_end = format(params$end_date, "%Y%m%d"),
+    period_val = get_period_val(params$aggregate_range, params$start_date),
+    period_type = get_period_type(params$aggregate_range),
+    geo_type = geo_type,
+    aggregation_type = paste(aggregation_type, collapse = "_")
+  )
+  data <- bind_cols(metadata, data)
+  data$issue_date <- format(Sys.Date(), "%Y%m%d")
+
+  return(data)
+}
+
+#' Get the file name for the given parameters, geography, and set of group-by variables.
+#' @noRd
+get_file_name <- function(params, geo_type, groupby_vars) {
+
+  aggregation_type <- setdiff(groupby_vars, "geo_id")
+  if (length(aggregation_type) == 0) aggregation_type <- "overall"
+
+  file_name <- paste(
+    format(params$start_date, "%Y%m%d"),
+    format(params$end_date, "%Y%m%d"),
+    get_period_type(params$aggregate_range),
+    geo_type,
+    paste(aggregation_type, collapse = "_"),
+    sep = "_"
+  )
+  file_name <- paste0(file_name, ".csv")
+  return(file_name)
+}
+
+#' Get the period type for the given range, i.e. "weekly" or "monthly".
+#' @noRd
+get_period_type <- function(range) {
+  switch(
+    range,
+    "month" = "monthly",
+    "week" = "weekly",
+    ""
+  )
+}
+
+#' Get the period value (e.g. epiweek number) for the range and start date.
+#' @importFrom lubridate epiweek
+#' @noRd
+get_period_val <- function(range, period_start) {
+  switch(
+    range,
+    "week" = epiweek(period_start),
+    "month" = as.integer(format(period_start, "%m")),
+    NA_integer_
+  )
+}
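To make the new naming scheme concrete, a hedged usage sketch of the helpers above (parameter values invented for illustration; assumes the package is loaded):

params <- list(
  start_date = as.Date("2021-01-01"),
  end_date   = as.Date("2021-01-31"),
  aggregate_range = "month"
)

# "geo_id" is stripped from the aggregation type, leaving the demographic vars.
get_file_name(params, geo_type = "state", groupby_vars = c("geo_id", "gender"))
# [1] "20210101_20210131_monthly_state_gender.csv"

get_period_type("week")                        # "weekly"
get_period_val("week", as.Date("2021-01-04"))  # 1, the epiweek of the start date

The old scheme encoded only the start date (roughly "20210101_state_gender.csv"); the new name also carries the end date and period type, matching the v0.2 metadata columns.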

facebook/delphiFacebook/man/write_contingency_tables.Rd

Lines changed: 10 additions & 7 deletions
(Generated file; diff not rendered by default.)

facebook/static/state_list.csv

Lines changed: 52 additions & 0 deletions
@@ -0,0 +1,52 @@
+"state","GID_1","state_fips"
+"AK","USA.2_1","02"
+"AL","USA.1_1","01"
+"AR","USA.4_1","05"
+"AZ","USA.3_1","04"
+"CA","USA.5_1","06"
+"CO","USA.6_1","08"
+"CT","USA.7_1","09"
+"DC","USA.9_1","11"
+"DE","USA.8_1","10"
+"FL","USA.10_1","12"
+"GA","USA.11_1","13"
+"HI","USA.12_1","15"
+"IA","USA.16_1","19"
+"ID","USA.13_1","16"
+"IL","USA.14_1","17"
+"IN","USA.15_1","18"
+"KS","USA.17_1","20"
+"KY","USA.18_1","21"
+"LA","USA.19_1","22"
+"MA","USA.22_1","25"
+"MD","USA.21_1","24"
+"ME","USA.20_1","23"
+"MI","USA.23_1","26"
+"MN","USA.24_1","27"
+"MO","USA.26_1","29"
+"MS","USA.25_1","28"
+"MT","USA.27_1","30"
+"NC","USA.34_1","37"
+"ND","USA.35_1","38"
+"NE","USA.28_1","31"
+"NH","USA.30_1","33"
+"NJ","USA.31_1","34"
+"NM","USA.32_1","35"
+"NV","USA.29_1","32"
+"NY","USA.33_1","36"
+"OH","USA.36_1","39"
+"OK","USA.37_1","40"
+"OR","USA.38_1","41"
+"PA","USA.39_1","42"
+"RI","USA.40_1","44"
+"SC","USA.41_1","45"
+"SD","USA.42_1","46"
+"TN","USA.43_1","47"
+"TX","USA.44_1","48"
+"UT","USA.45_1","49"
+"VA","USA.47_1","51"
+"VT","USA.46_1","50"
+"WA","USA.48_1","53"
+"WI","USA.50_1","55"
+"WV","USA.49_1","54"
+"WY","USA.51_1","56"
