Skip to content

Commit fd7457c

Browse files
committed
Add metadata to output files, output NA rows
* Update `contingency_aggregate.R` to delete lines which previously removed output for groups with missing values as part of their group-by variables, e.g. missing occupation. Now it will output rows for groups with missing values in the group-by variables. * Update `contingency_write.R` to add start and end dates, period type, aggregation type, and geographic identifiers. * Add `state_list.csv` in the `static` folder to store state FIPS codes and GADM codes. This file is used to add this info to the output datasets. * Update `NAMESPACE` to include an extra import: `dplyr::bind_cols`.
1 parent ac66aaa commit fd7457c

File tree

5 files changed

+225
-40
lines changed

5 files changed

+225
-40
lines changed

facebook/delphiFacebook/NAMESPACE

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,7 @@ importFrom(dplyr,across)
6363
importFrom(dplyr,all_of)
6464
importFrom(dplyr,anti_join)
6565
importFrom(dplyr,arrange)
66+
importFrom(dplyr,bind_cols)
6667
importFrom(dplyr,bind_rows)
6768
importFrom(dplyr,case_when)
6869
importFrom(dplyr,coalesce)

facebook/delphiFacebook/R/contingency_aggregate.R

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -270,9 +270,6 @@ summarize_aggs <- function(df, crosswalk_data, aggregations, geo_level, params)
270270
table(df[, group_vars, with=FALSE], exclude=NULL, dnn=group_vars),
271271
stringsAsFactors=FALSE
272272
)
273-
unique_groups_counts <- unique_groups_counts[
274-
complete.cases(unique_groups_counts[, group_vars]),
275-
]
276273

277274
# Drop groups with less than threshold sample size.
278275
unique_groups_counts <- filter(unique_groups_counts, Freq >= params$num_filter)
@@ -328,10 +325,6 @@ summarize_aggs <- function(df, crosswalk_data, aggregations, geo_level, params)
328325
group_vars <- aggregations$group_by[[row]]
329326
post_fn <- aggregations$post_fn[[row]]
330327

331-
dfs_out[[aggregation]] <- dfs_out[[aggregation]][
332-
rowSums(is.na(dfs_out[[aggregation]][, c("val", "sample_size", group_vars)])) == 0,
333-
]
334-
335328
dfs_out[[aggregation]] <- apply_privacy_censoring(dfs_out[[aggregation]], params)
336329

337330
## Apply the post-function

facebook/delphiFacebook/R/contingency_write.R

Lines changed: 162 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -3,12 +3,15 @@
33
#' CSV name includes date specifying start of time period aggregated, geo level,
44
#' and grouping variables.
55
#'
6-
#' @param data a data frame to save; must contain the columns "geo_id", "val",
7-
#' "se", "sample_size", and grouping variables. The first four are saved in the
8-
#' output; day is used for spliting the data into files.
9-
#' @param params a named list, containing the value "export_dir" indicating the
10-
#' directory where the csv should be saved
11-
#' @param geo_level name of the geographic level; used for naming the output file
6+
#' @param data a data frame to save; must contain the columns in
7+
#' `groupby_vars`.
8+
#' @param params a named list, containing the values:
9+
#' "export_dir" - directory where the csv should be saved
10+
#' "static_dir" - directory where the state lookup file is
11+
#' "aggregate_range" - "month", "week", etc.
12+
#' "start_date" - start date of the aggregate range
13+
#' "end_date" - end date of the aggregate range
14+
#' @param geo_type name of the geographic level; used for naming the output file
1215
#' @param groupby_vars character vector of column names used for grouping to
1316
#' calculate aggregations; used for naming the output file
1417
#'
@@ -17,35 +20,168 @@
1720
#' @importFrom stringi stri_trim
1821
#'
1922
#' @export
20-
write_contingency_tables <- function(data, params, geo_level, groupby_vars)
23+
write_contingency_tables <- function(data, params, geo_type, groupby_vars)
2124
{
2225
if (!is.null(data) && nrow(data) != 0) {
23-
data <- arrange(data, across(all_of(groupby_vars)))
26+
27+
# Reorder the group-by columns and sort the dataset by them.
28+
groupby_vars <- c("geo_id", sort(setdiff(groupby_vars, "geo_id")))
29+
data <- data %>%
30+
select(all_of(groupby_vars), everything()) %>%
31+
arrange(across(all_of(groupby_vars)))
2432

2533
# Format reported columns.
26-
data <- mutate_at(data, vars(-c(groupby_vars)),
27-
function(x) {
28-
stri_trim(
29-
formatC(as.numeric(x), digits=7, format="f", drop0trailing=TRUE)
30-
)
31-
})
32-
33-
# Reduce verbosity of grouping vars for output purposes
34-
groupby_vars <- gsub("_", "", sub(
35-
".+?_", "", groupby_vars[groupby_vars != "geo_id"]))
36-
filename <- sprintf("%s_%s.csv", format(params$start_date, "%Y%m%d"),
37-
paste(c(geo_level, groupby_vars), collapse="_"))
38-
file_out <- file.path(params$export_dir, filename)
39-
34+
format_number <- function(x) {
35+
stri_trim(formatC(as.numeric(x), digits=7, format="f", drop0trailing=TRUE))
36+
}
37+
data <- mutate_at(data, vars(-groupby_vars), format_number)
38+
39+
# Add standard geographic and metadata variables to the data.
40+
data <- add_geo_vars(data, params, geo_type)
41+
data <- add_metadata_vars(data, params, geo_type, groupby_vars)
42+
4043
create_dir_not_exist(params$export_dir)
41-
42-
msg_df(sprintf("saving contingency table data to %-35s", filename), data)
43-
write_csv(data, file_out)
44+
45+
file_name <- get_file_name(params, geo_type, groupby_vars)
46+
msg_df(sprintf("saving contingency table data to %-35s", file_name), data)
47+
write_csv(data, file.path(params$export_dir, file_name))
4448

4549
} else {
4650
msg_plain(sprintf(
4751
"no aggregations produced for grouping variables %s (%s); CSV will not be saved",
48-
paste(groupby_vars, collapse=", "), geo_level
52+
paste(groupby_vars, collapse=", "), geo_type
4953
))
5054
}
5155
}
56+
57+
#' Add geographic variables to a dataset, e.g. state and state FIPS codes.
58+
#'
59+
#' @param data A data frame, containing the variables in groupby_vars.
60+
#' @param params A parameters object with the `static_dir` resources folder.
61+
#' @param geo_type "nation", "state".
62+
#'
63+
#' @importFrom dplyr bind_cols left_join select
64+
#' @importFrom readr read_csv cols
65+
#' @noRd
66+
add_geo_vars <- function(data, params, geo_type) {
67+
68+
start <- data.frame(
69+
country = "United States",
70+
ISO_3 = "USA",
71+
GID_0 = "USA"
72+
)
73+
74+
if (geo_type == "nation") {
75+
76+
rest <- data.frame(
77+
region = "overall",
78+
GID_1 = NA_character_,
79+
state = "overall",
80+
state_fips = NA_character_,
81+
county = "overall",
82+
county_fips = NA_character_
83+
)
84+
85+
} else if (geo_type == "state") {
86+
87+
states <- read_csv(
88+
file.path(params$static_dir, "state_list.csv"),
89+
col_types = cols(.default = "c")
90+
)
91+
92+
rest <- data.frame(
93+
region = toupper(data$geo_id),
94+
state = toupper(data$geo_id),
95+
county = "overall",
96+
county_fips = NA_character_
97+
)
98+
99+
rest$state <- toupper(rest$state)
100+
states$state <- toupper(states$state)
101+
102+
rest <- left_join(rest, states, by = "state") %>%
103+
select(region, GID_1, state, state_fips, county, county_fips)
104+
}
105+
106+
geo_vars <- bind_cols(start, rest)
107+
108+
# Insert the geographic variables in place of the "geo_id" variable.
109+
index <- which(names(data) == "geo_id")
110+
before <- if (index > 1) data[1:(index-1)] else NULL
111+
after <- data[(index+1):ncol(data)]
112+
result <- bind_cols(before, geo_vars, after)
113+
114+
return(result)
115+
}
116+
117+
#' Add metadata variables to a dataset, e.g. start and end dates.
118+
#'
119+
#' @param data A data frame, containing the variables in `groupby_vars.`
120+
#' @param params A parameters object containing start & end date, period, etc.
121+
#' @param geo_type "nation", "state", "county".
122+
#' @param groupby_vars A list of variables `data` is aggregated by.
123+
#'
124+
#' @importFrom dplyr bind_cols
125+
#' @noRd
126+
add_metadata_vars <- function(data, params, geo_type, groupby_vars) {
127+
128+
aggregation_type <- setdiff(groupby_vars, "geo_id")
129+
if (length(aggregation_type) == 0) aggregation_type <- "overall"
130+
131+
# Add metadata about this period and level of aggregation.
132+
metadata <- data.frame(
133+
survey_geo = "us",
134+
period_start = format(params$start_date, "%Y%m%d"),
135+
period_end = format(params$end_date, "%Y%m%d"),
136+
period_val = get_period_val(params$aggregate_range, params$start_date),
137+
period_type = get_period_type(params$aggregate_range),
138+
geo_type = paste(geo_type, collapse = "_"),
139+
aggregation_type = paste(aggregation_type, collapse = "_")
140+
)
141+
data <- bind_cols(metadata, data)
142+
data$issue_date <- format(Sys.Date(), "%Y%m%d")
143+
144+
return(data)
145+
}
146+
147+
#' Get the file name for the given parameters, geography, and set of group-by variables.
148+
#' @noRd
149+
get_file_name <- function(params, geo_type, groupby_vars) {
150+
151+
aggregation_type <- setdiff(groupby_vars, "geo_id")
152+
if (length(aggregation_type) == 0) aggregation_type <- "overall"
153+
154+
file_name <- paste(
155+
format(params$start_date, "%Y%m%d"),
156+
format(params$end_date, "%Y%m%d"),
157+
get_period_type(params$aggregate_range),
158+
geo_type,
159+
paste(aggregation_type, collapse = "_"),
160+
sep = "_"
161+
)
162+
file_name <- paste0(file_name, ".csv")
163+
return(file_name)
164+
}
165+
166+
#' Get the period type for the given range, i.e. "weekly" or "monthly".
167+
#' @noRd
168+
get_period_type <- function(range) {
169+
switch(
170+
range,
171+
"month" = "monthly",
172+
"week" = "weekly",
173+
""
174+
)
175+
}
176+
177+
#' Get the period value (e.g. epiweek number) for the range and start date.
178+
#' @importFrom lubridate epiweek
179+
#' @noRd
180+
get_period_val <- function(range, period_start) {
181+
switch(
182+
range,
183+
"week" = epiweek(period_start),
184+
"month" = as.integer(format(period_start, "%m")),
185+
NA_integer_
186+
)
187+
}

facebook/delphiFacebook/man/write_contingency_tables.Rd

Lines changed: 10 additions & 7 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

facebook/static/state_list.csv

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
state,GID_1,state_fips
2+
"ak","USA.2_1","02"
3+
"al","USA.1_1","01"
4+
"ar","USA.4_1","05"
5+
"az","USA.3_1","04"
6+
"ca","USA.5_1","06"
7+
"co","USA.6_1","08"
8+
"ct","USA.7_1","09"
9+
"dc","USA.9_1","11"
10+
"de","USA.8_1","10"
11+
"fl","USA.10_1","12"
12+
"ga","USA.11_1","13"
13+
"hi","USA.12_1","15"
14+
"ia","USA.16_1","19"
15+
"id","USA.13_1","16"
16+
"il","USA.14_1","17"
17+
"in","USA.15_1","18"
18+
"ks","USA.17_1","20"
19+
"ky","USA.18_1","21"
20+
"la","USA.19_1","22"
21+
"ma","USA.22_1","25"
22+
"md","USA.21_1","24"
23+
"me","USA.20_1","23"
24+
"mi","USA.23_1","26"
25+
"mn","USA.24_1","27"
26+
"mo","USA.26_1","29"
27+
"ms","USA.25_1","28"
28+
"mt","USA.27_1","30"
29+
"nc","USA.34_1","37"
30+
"nd","USA.35_1","38"
31+
"ne","USA.28_1","31"
32+
"nh","USA.30_1","33"
33+
"nj","USA.31_1","34"
34+
"nm","USA.32_1","35"
35+
"nv","USA.29_1","32"
36+
"ny","USA.33_1","36"
37+
"oh","USA.36_1","39"
38+
"ok","USA.37_1","40"
39+
"or","USA.38_1","41"
40+
"pa","USA.39_1","42"
41+
"ri","USA.40_1","44"
42+
"sc","USA.41_1","45"
43+
"sd","USA.42_1","46"
44+
"tn","USA.43_1","47"
45+
"tx","USA.44_1","48"
46+
"ut","USA.45_1","49"
47+
"va","USA.47_1","51"
48+
"vt","USA.46_1","50"
49+
"wa","USA.48_1","53"
50+
"wi","USA.50_1","55"
51+
"wv","USA.49_1","54"
52+
"wy","USA.51_1","56"

0 commit comments

Comments
 (0)