Add metadata to output files, output NA rows

davidkretch · davidkretch · commit fd7457cb6246 · 2021-04-14T09:09:30.000-04:00
* Update `contingency_aggregate.R` to stop dropping groups whose group-by variables contain missing values (e.g. missing occupation); such groups are now output as rows. The removed filter also dropped rows with a missing `val` or `sample_size`, so those rows are no longer removed at that step either.

* Update `contingency_write.R` to add start and end dates, period type, aggregation type, and geographic identifiers.

* Add `state_list.csv` in the `static` folder to store state FIPS codes and GADM codes. This file is used to add this info to the output datasets.

* Update `NAMESPACE` to include an extra import: `dplyr::bind_cols`.
diff --git a/facebook/delphiFacebook/NAMESPACE b/facebook/delphiFacebook/NAMESPACE
@@ -63,6 +63,7 @@ importFrom(dplyr,across)
 importFrom(dplyr,all_of)
 importFrom(dplyr,anti_join)
 importFrom(dplyr,arrange)
+importFrom(dplyr,bind_cols)
 importFrom(dplyr,bind_rows)
 importFrom(dplyr,case_when)
 importFrom(dplyr,coalesce)
diff --git a/facebook/delphiFacebook/R/contingency_aggregate.R b/facebook/delphiFacebook/R/contingency_aggregate.R
@@ -270,9 +270,6 @@ summarize_aggs <- function(df, crosswalk_data, aggregations, geo_level, params)
     table(df[, group_vars, with=FALSE], exclude=NULL, dnn=group_vars), 
     stringsAsFactors=FALSE
   )
-  unique_groups_counts <- unique_groups_counts[
-    complete.cases(unique_groups_counts[, group_vars]),
-  ]
   
   # Drop groups with less than threshold sample size.
   unique_groups_counts <- filter(unique_groups_counts, Freq >= params$num_filter)
@@ -328,10 +325,6 @@ summarize_aggs <- function(df, crosswalk_data, aggregations, geo_level, params)
     group_vars <- aggregations$group_by[[row]]
     post_fn <- aggregations$post_fn[[row]]
 
-    dfs_out[[aggregation]] <- dfs_out[[aggregation]][
-      rowSums(is.na(dfs_out[[aggregation]][, c("val", "sample_size", group_vars)])) == 0,
-    ]
-
     dfs_out[[aggregation]] <- apply_privacy_censoring(dfs_out[[aggregation]], params)
 
     ## Apply the post-function
diff --git a/facebook/delphiFacebook/R/contingency_write.R b/facebook/delphiFacebook/R/contingency_write.R
@@ -3,12 +3,15 @@
 #' CSV name includes date specifying start of time period aggregated, geo level,
 #' and grouping variables.
 #'
-#' @param data           a data frame to save; must contain the columns "geo_id", "val",
-#'                       "se", "sample_size", and grouping variables. The first four are saved in the
-#'                       output; day is used for spliting the data into files.
-#' @param params         a named list, containing the value "export_dir" indicating the
-#'                       directory where the csv should be saved
-#' @param geo_level      name of the geographic level; used for naming the output file
+#' @param data           a data frame to save; must contain the columns in
+#'                       `groupby_vars`.
+#' @param params         a named list, containing the values:
+#'                       "export_dir" - directory where the csv should be saved
+#'                       "static_dir" - directory where the state lookup file is
+#'                       "aggregate_range" - "month", "week", etc.
+#'                       "start_date" - start date of the aggregate range
+#'                       "end_date" - end date of the aggregate range
+#' @param geo_type       name of the geographic level; used for naming the output file
 #' @param groupby_vars   character vector of column names used for grouping to
 #'                       calculate aggregations; used for naming the output file
 #'
@@ -17,35 +20,168 @@
 #' @importFrom stringi stri_trim
 #'
 #' @export
-write_contingency_tables <- function(data, params, geo_level, groupby_vars)
+write_contingency_tables <- function(data, params, geo_type, groupby_vars)
 {
   if (!is.null(data) && nrow(data) != 0) {
-    data <- arrange(data, across(all_of(groupby_vars)))
+    # Reorder the group-by columns and sort the dataset by them.
+    groupby_vars <- c("geo_id", sort(setdiff(groupby_vars, "geo_id")))
+    data <- data %>%
+      select(all_of(groupby_vars), everything()) %>%
+      arrange(across(all_of(groupby_vars)))
 
     # Format reported columns.
-    data <- mutate_at(data, vars(-c(groupby_vars)),
-                      function(x) {
-                        stri_trim(
-                          formatC(as.numeric(x), digits=7, format="f", drop0trailing=TRUE)
-                        )
-                      })
-
-    # Reduce verbosity of grouping vars for output purposes
-    groupby_vars <- gsub("_", "", sub(
-      ".+?_", "", groupby_vars[groupby_vars != "geo_id"]))
-    filename <- sprintf("%s_%s.csv", format(params$start_date, "%Y%m%d"),
-                        paste(c(geo_level, groupby_vars), collapse="_"))
-    file_out <- file.path(params$export_dir, filename)
-
+    format_number <- function(x) {
+      stri_trim(formatC(as.numeric(x), digits=7, format="f", drop0trailing=TRUE))
+    }
+    # all_of() forces `groupby_vars` to be read as a vector of column names.
+    data <- mutate_at(data, vars(-all_of(groupby_vars)), format_number)
+    
+    # Add standard geographic and metadata variables to the data.
+    data <- add_geo_vars(data, params, geo_type)
+    data <- add_metadata_vars(data, params, geo_type, groupby_vars)
+    
     create_dir_not_exist(params$export_dir)
-
-    msg_df(sprintf("saving contingency table data to %-35s", filename), data)
-    write_csv(data, file_out)
+    
+    file_name <- get_file_name(params, geo_type, groupby_vars)
+    msg_df(sprintf("saving contingency table data to %-35s", file_name), data)
+    write_csv(data, file.path(params$export_dir, file_name))
 
   } else {
     msg_plain(sprintf(
       "no aggregations produced for grouping variables %s (%s); CSV will not be saved",
-      paste(groupby_vars, collapse=", "), geo_level
+      paste(groupby_vars, collapse=", "), geo_type
     ))
   }
 }
+
+#' Replace the "geo_id" column with standard geographic identifier columns.
+#'
+#' Every row gets country "United States", ISO_3 "USA" and GID_0 "USA". For
+#' "nation", region/state/county are "overall" with NA codes; for "state", the
+#' upper-cased `geo_id` is matched against `state_list.csv` in `static_dir`.
+#'
+#' @param data A data frame with exactly one "geo_id" column.
+#' @param params A parameters object with the `static_dir` resources folder.
+#' @param geo_type "nation" or "state"; any other value raises an error.
+#' @return `data` with "geo_id" replaced by the geographic columns.
+#'
+#' @importFrom dplyr bind_cols left_join select
+#' @importFrom readr read_csv cols
+#' @noRd
+add_geo_vars <- function(data, params, geo_type) {
+  index <- which(names(data) == "geo_id")
+  if (length(index) != 1) {
+    stop("`data` must contain exactly one \"geo_id\" column", call. = FALSE)
+  }
+  start <- data.frame(country = "United States", ISO_3 = "USA", GID_0 = "USA")
+
+  if (geo_type == "nation") {
+    rest <- data.frame(
+      region = "overall",
+      GID_1 = NA_character_,
+      state = "overall",
+      state_fips = NA_character_,
+      county = "overall",
+      county_fips = NA_character_
+    )
+  } else if (geo_type == "state") {
+    states <- read_csv(
+      file.path(params$static_dir, "state_list.csv"),
+      col_types = cols(.default = "c")
+    )
+    rest <- data.frame(
+      region = toupper(data$geo_id),
+      state = toupper(data$geo_id),
+      county = "overall",
+      county_fips = NA_character_
+    )
+    # Compare in upper case: the lookup file stores lower-case state codes.
+    rest$state <- toupper(rest$state)
+    states$state <- toupper(states$state)
+    rest <- left_join(rest, states, by = "state") %>%
+      select(region, GID_1, state, state_fips, county, county_fips)
+  } else {
+    # Without this branch `rest` stays undefined and the failure surfaces
+    # later as an unhelpful "object 'rest' not found".
+    stop("unsupported geo_type: ", geo_type, call. = FALSE)
+  }
+  geo_vars <- bind_cols(start, rest)
+
+  # Splice the geographic variables in where "geo_id" was. Guard both ends:
+  # `(index + 1):ncol(data)` counts downwards when "geo_id" is the last column.
+  before <- if (index > 1) data[seq_len(index - 1)] else NULL
+  after <- if (index < ncol(data)) data[(index + 1):ncol(data)] else NULL
+  bind_cols(before, geo_vars, after)
+}
+
+#' Prepend period and aggregation metadata columns; append today's issue date.
+#'
+#' @param data A data frame of aggregated results.
+#' @param params A parameters object; `start_date`, `end_date` and
+#'   `aggregate_range` are read from it.
+#' @param geo_type Geographic level label(s); joined with "_" if several.
+#' @param groupby_vars Grouping variable names; those other than "geo_id" form
+#'   the aggregation type, which is "overall" when none remain.
+#' @importFrom dplyr bind_cols
+#' @noRd
+add_metadata_vars <- function(data, params, geo_type, groupby_vars) {
+  non_geo_vars <- setdiff(groupby_vars, "geo_id")
+  agg_label <- if (length(non_geo_vars) == 0) "overall" else non_geo_vars
+  # Metadata columns go first; `issue_date` is set after binding.
+  with_metadata <- bind_cols(
+    data.frame(
+      survey_geo = "us",
+      period_start = format(params$start_date, "%Y%m%d"),
+      period_end = format(params$end_date, "%Y%m%d"),
+      period_val = get_period_val(params$aggregate_range, params$start_date),
+      period_type = get_period_type(params$aggregate_range),
+      geo_type = paste(geo_type, collapse = "_"),
+      aggregation_type = paste(agg_label, collapse = "_")
+    ),
+    data
+  )
+  with_metadata$issue_date <- format(Sys.Date(), "%Y%m%d")
+  with_metadata
+}
+
+#' Build the output CSV file name.
+#'
+#' Pattern: <start>_<end>_<period type>_<geo_type>_<aggregation type>.csv, where
+#' the aggregation type is the non-"geo_id" group-by variables, or "overall".
+#' @noRd
+get_file_name <- function(params, geo_type, groupby_vars) {
+  non_geo_vars <- setdiff(groupby_vars, "geo_id")
+  agg_label <- if (length(non_geo_vars) == 0) "overall" else non_geo_vars
+  stem <- paste(
+    format(params$start_date, "%Y%m%d"),
+    format(params$end_date, "%Y%m%d"),
+    get_period_type(params$aggregate_range),
+    geo_type,
+    paste(agg_label, collapse = "_"),
+    sep = "_"
+  )
+  paste0(stem, ".csv")
+}
+
+#' Map an aggregate range to its period-type label.
+#'
+#' @param range Aggregate range name, e.g. "month" or "week".
+#' @return "monthly" for "month", "weekly" for "week", "" for any other string.
+#' @noRd
+get_period_type <- function(range) {
+  # Unrecognized range names fall through to the empty-string default.
+  period_type <- switch(range, "month" = "monthly", "week" = "weekly", "")
+  period_type
+}
+
+#' Period number for `period_start`: epiweek ("week"), month ("month"), else NA.
+#' @importFrom lubridate epiweek
+#' @noRd
+get_period_val <- function(range, period_start) {
+  period_val <- switch(range,
+    "week" = epiweek(period_start),
+    "month" = as.integer(format(period_start, "%m")),
+    NA_integer_
+  )
+  period_val
+}
diff --git a/facebook/delphiFacebook/man/write_contingency_tables.Rd b/facebook/delphiFacebook/man/write_contingency_tables.Rd
diff --git a/facebook/static/state_list.csv b/facebook/static/state_list.csv
@@ -0,0 +1,52 @@
+state,GID_1,state_fips
+"ak","USA.2_1","02"
+"al","USA.1_1","01"
+"ar","USA.4_1","05"
+"az","USA.3_1","04"
+"ca","USA.5_1","06"
+"co","USA.6_1","08"
+"ct","USA.7_1","09"
+"dc","USA.9_1","11"
+"de","USA.8_1","10"
+"fl","USA.10_1","12"
+"ga","USA.11_1","13"
+"hi","USA.12_1","15"
+"ia","USA.16_1","19"
+"id","USA.13_1","16"
+"il","USA.14_1","17"
+"in","USA.15_1","18"
+"ks","USA.17_1","20"
+"ky","USA.18_1","21"
+"la","USA.19_1","22"
+"ma","USA.22_1","25"
+"md","USA.21_1","24"
+"me","USA.20_1","23"
+"mi","USA.23_1","26"
+"mn","USA.24_1","27"
+"mo","USA.26_1","29"
+"ms","USA.25_1","28"
+"mt","USA.27_1","30"
+"nc","USA.34_1","37"
+"nd","USA.35_1","38"
+"ne","USA.28_1","31"
+"nh","USA.30_1","33"
+"nj","USA.31_1","34"
+"nm","USA.32_1","35"
+"nv","USA.29_1","32"
+"ny","USA.33_1","36"
+"oh","USA.36_1","39"
+"ok","USA.37_1","40"
+"or","USA.38_1","41"
+"pa","USA.39_1","42"
+"ri","USA.40_1","44"
+"sc","USA.41_1","45"
+"sd","USA.42_1","46"
+"tn","USA.43_1","47"
+"tx","USA.44_1","48"
+"ut","USA.45_1","49"
+"va","USA.47_1","51"
+"vt","USA.46_1","50"
+"wa","USA.48_1","53"
+"wi","USA.50_1","55"
+"wv","USA.49_1","54"
+"wy","USA.51_1","56"