cmu-delphi
diff --git a/facebook/contingency_tables.R b/facebook/contingency_tables.R
Lines changed: 1 addition & 0 deletions
diff --git a/facebook/delphiFacebook/DESCRIPTION b/facebook/delphiFacebook/DESCRIPTION
Lines changed: 3 additions & 1 deletion
diff --git a/facebook/delphiFacebook/NAMESPACE b/facebook/delphiFacebook/NAMESPACE
Lines changed: 3 additions & 1 deletion
diff --git a/facebook/delphiFacebook/R/RcppExports.R b/facebook/delphiFacebook/R/RcppExports.R
Lines changed: 7 additions & 0 deletions
diff --git a/facebook/delphiFacebook/R/aggregate.R b/facebook/delphiFacebook/R/aggregate.R
Lines changed: 31 additions & 20 deletions
diff --git a/facebook/delphiFacebook/R/contingency_aggregate.R b/facebook/delphiFacebook/R/contingency_aggregate.R
Lines changed: 17 additions & 27 deletions
diff --git a/facebook/delphiFacebook/R/contingency_run.R b/facebook/delphiFacebook/R/contingency_run.R
Lines changed: 12 additions & 32 deletions
diff --git a/facebook/delphiFacebook/R/contingency_variables.R b/facebook/delphiFacebook/R/contingency_variables.R
Lines changed: 9 additions & 9 deletions
diff --git a/facebook/delphiFacebook/R/contingency_write.R b/facebook/delphiFacebook/R/contingency_write.R
Lines changed: 2 additions & 2 deletions
diff --git a/facebook/delphiFacebook/R/dates.R b/facebook/delphiFacebook/R/dates.R
Lines changed: 1 addition & 1 deletion
diff --git a/facebook/delphiFacebook/R/responses.R b/facebook/delphiFacebook/R/responses.R
Lines changed: 4 additions & 6 deletions
@@ -2,3 +2,4 @@ library(delphiFacebook)
 
 params <- read_contingency_params("params.json")
 run_contingency_tables(params)
+message("run_contingency_tables completed successfully")
@@ -20,11 +20,13 @@ Imports:
     lubridate,
     data.table,
     tibble,
-    purrr
+    purrr,
+    Rcpp
 Suggests:
     knitr (>= 1.15),
     rmarkdown (>= 1.4),
     testthat (>= 1.0.1),
     covr (>= 2.2.2)
+LinkingTo: Rcpp
 RoxygenNote: 7.1.1
 Encoding: UTF-8
@@ -55,6 +55,7 @@ export(write_contingency_tables)
 export(write_data_api)
 export(write_individual)
 import(data.table)
+importFrom(data.table,fread)
 importFrom(dplyr,"%>%")
 importFrom(dplyr,across)
 importFrom(dplyr,all_of)
@@ -107,6 +108,7 @@ importFrom(readr,write_rds)
 importFrom(rlang,.data)
 importFrom(stats,complete.cases)
 importFrom(stats,na.omit)
+importFrom(stats,setNames)
 importFrom(stats,weighted.mean)
 importFrom(stringi,stri_extract)
 importFrom(stringi,stri_replace)
@@ -116,5 +118,5 @@ importFrom(stringi,stri_sub)
 importFrom(stringi,stri_trans_tolower)
 importFrom(stringi,stri_trim)
 importFrom(tibble,add_column)
-importFrom(tibble,as_tibble)
 importFrom(tibble,tribble)
+useDynLib(delphiFacebook, .registration = TRUE)
@@ -0,0 +1,7 @@
+# Generated by using Rcpp::compileAttributes() -> do not edit by hand
+# Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393
+
+is_selected_cpp <- function(responses, target) {
+    .Call(`_delphiFacebook_is_selected_cpp`, responses, target)
+}
+
@@ -146,23 +146,27 @@ summarize_indicators <- function(df, crosswalk_data, indicators, geo_level,
 #' @param geo_level Name of the geo level (county, state, etc.) for which we are
 #'   aggregating.
 #' @param params Named list of configuration options.
-#' @importFrom dplyr mutate filter
+#' 
+#' @importFrom dplyr mutate filter bind_rows
+#' @importFrom stats setNames
 #' @importFrom rlang .data
 summarize_indicators_day <- function(day_df, indicators, target_day, geo_level, params) {
-  ## Prepare outputs.
-  dfs_out <- list()
+  ## Prepare outputs as list of lists. Saves some time and memory since lists
+  ## are not copied on modify.
   geo_ids <- unique(day_df$geo_id)
-  for (indicator in indicators$name) {
-    dfs_out[[indicator]] <- tibble(
-      geo_id = geo_ids,
-      day = target_day,
-      val = NA_real_,
-      se = NA_real_,
-      sample_size = NA_real_,
-      effective_sample_size = NA_real_
-    )
-  }
-
+  n_geo_ids <- length(geo_ids)
+  fill_list <- list(geo_id = geo_ids,
+                    day = rep(target_day, n_geo_ids),
+                    val = rep(NA_real_, n_geo_ids),
+                    se = rep(NA_real_, n_geo_ids),
+                    sample_size = rep(NA_real_, n_geo_ids),
+                    effective_sample_size = rep(NA_real_, n_geo_ids)
+  )
+  
+  dfs_out <- setNames(
+    rep(list(fill_list), times=length(indicators$name)),
+    indicators$name)
+  
   for (ii in seq_along(geo_ids))
   {
     target_geo <- geo_ids[ii]
@@ -175,8 +179,10 @@ summarize_indicators_day <- function(day_df, indicators, target_day, geo_level,
       var_weight <- indicators$var_weight[row]
       compute_fn <- indicators$compute_fn[[row]]
 
-      ind_df <- sub_df[!is.na(sub_df[[var_weight]]) & !is.na(sub_df[[metric]]), ]
-
+      # Copy only columns we're using.
+      select_cols <- c(metric, var_weight, "weight_in_location")
+      ind_df <- sub_df[, select_cols, with=FALSE][!is.na(sub_df[[var_weight]]) & !is.na(sub_df[[metric]]), ]
+      
       if (nrow(ind_df) > 0)
       {
         s_mix_coef <- params$s_mix_coef
@@ -191,13 +197,18 @@ summarize_indicators_day <- function(day_df, indicators, target_day, geo_level,
           weight = if (indicators$skip_mixing[row]) { mixing$normalized_preweights } else { mixing$weights },
           sample_size = sample_size)
 
-        dfs_out[[indicator]]$val[ii] <- new_row$val
-        dfs_out[[indicator]]$se[ii] <- new_row$se
-        dfs_out[[indicator]]$sample_size[ii] <- sample_size
-        dfs_out[[indicator]]$effective_sample_size[ii] <- new_row$effective_sample_size
+        dfs_out[[indicator]][["val"]][ii] <- new_row$val
+        dfs_out[[indicator]][["se"]][ii] <- new_row$se
+        dfs_out[[indicator]][["sample_size"]][ii] <- sample_size
+        dfs_out[[indicator]][["effective_sample_size"]][ii] <- new_row$effective_sample_size
       }
     }
   }
+  
+  # Convert list of lists to list of tibbles.
+  for (indicator in indicators$name) {
+   dfs_out[[indicator]] <- bind_rows(dfs_out[[indicator]])
+  }
 
   for (row in seq_len(nrow(indicators))) {
     indicator <- indicators$name[row]
 
@@ -34,7 +34,7 @@ produce_aggregates <- function(df, aggregations, cw_list, params) {
   ## table in sorted order so data.table can use a binary search to find
   ## matching dates, rather than a linear scan, and is important for very large
   ## input files.
-  df <- as.data.table(df)
+  df <- as.data.table(df)[!is.na(weight), ]
   setkeyv(df, "start_dt")
 
   # Keep only obs in desired date range.
@@ -151,15 +151,6 @@ post_process_aggs <- function(df, aggregations, cw_list) {
     }
 
     aggregations$geo_level[agg_ind] <- geo_level
-
-    # Multiple choice metrics should also be included in the group_by vars
-    if (startsWith(aggregations$metric[agg_ind], "mc_")) {
-      if ( !(aggregations$metric[agg_ind] %in%
-             aggregations$group_by[agg_ind][[1]]) ) {
-        aggregations$group_by[agg_ind][[1]] <-
-          c(aggregations$group_by[agg_ind][[1]], aggregations$metric[agg_ind])
-      }
-    }
   }
 
   # Remove aggregations using unavailable variables.
@@ -320,33 +311,32 @@ summarize_aggs <- function(df, crosswalk_data, aggregations, geo_level, params)
 #'   being used
 #' @param params Named list of configuration options.
 #'
-#' @importFrom tibble add_column as_tibble
+#' @importFrom tibble add_column
 #' @importFrom dplyr %>%
+#' @importFrom stats setNames
 #'
 #' @export
 summarize_aggregations_group <- function(group_df, aggregations, target_group, geo_level, params) {
-  ## Prepare outputs.
-  dfs_out <- list()
-  for (row in seq_along(aggregations$id)) {
-    aggregation <- aggregations$id[row]
-
-    dfs_out[[aggregation]] <- target_group %>%
-      as.list %>%
-      as_tibble %>%
-      add_column(val=NA_real_,
-                 se=NA_real_,
-                 sample_size=NA_real_,
-                 effective_sample_size=NA_real_,
-                 represented=NA_real_)
-  }
-
+  # Prepare outputs.
+  fill_df <- target_group %>%
+    add_column(val=NA_real_,
+               se=NA_real_,
+               sample_size=NA_real_,
+               effective_sample_size=NA_real_,
+               represented=NA_real_)
+  dfs_out <- setNames(
+    rep(list(fill_df), times=length(aggregations$id)),
+    aggregations$id)
+  
   for (row in seq_along(aggregations$id)) {
     aggregation <- aggregations$id[row]
     metric <- aggregations$metric[row]
     var_weight <- aggregations$var_weight[row]
     compute_fn <- aggregations$compute_fn[[row]]
 
-    agg_df <- group_df[!is.na(group_df[[var_weight]]) & !is.na(group_df[[metric]]), ]
+    # Copy only columns we're using.
+    select_cols <- c(metric, var_weight, "weight_in_location")
+    agg_df <- group_df[, select_cols, with=FALSE][!is.na(eval(as.name(metric))), ]
 
     if (nrow(agg_df) > 0) {
       s_mix_coef <- params$s_mix_coef
 
@@ -14,21 +14,6 @@ run_contingency_tables <- function(params) {
     warning(debug_msg)
   }
 
-  if ( !is.null(params$aggs_in) ) {
-    if ( !file.exists(params$aggs_in) ) {
-      stop("requested aggregate-setting file does not exist")
-    }
-    
-    # Run non-default aggregates. File should create an object called `aggs`.
-    source(params$aggs_in)
-    
-    if ( !exists("aggs") || !inherits(aggs, "data.frame") ) {
-      stop("external aggregate-setting file must create a dataframe `aggs`")
-    }
-  } else {
-    aggs <- get_aggs()
-  }
-  
   ## Set default number of cores for mclapply to the total available number,
   ## because we are greedy and this will typically run on a server.
   if (params$parallel) {
@@ -43,20 +28,14 @@ run_contingency_tables <- function(params) {
     }
   }
 
-  if (params$aggregate_range == "week") {
-    run_contingency_tables_many_periods(params, aggs$week)
-  } else if (params$aggregate_range == "month") {
-    run_contingency_tables_many_periods(params, aggs$month)
-  } else if (params$aggregate_range == "both") {
-    params$aggregate_range <- "week"
-    run_contingency_tables_many_periods(params, aggs$week)
-
-    params$aggregate_range <- "month"
-    run_contingency_tables_many_periods(params, aggs$month)
-  } else {
+  aggs <- get_aggs()
+  
+  if ( length(params[["aggregate_range"]]) != 1 || !(params$aggregate_range %in% c("week", "month")) ) {
     stop(paste0("aggregate_range setting must be provided in params and be one",
-    " of 'week', 'month', or 'both'"))
+    " of 'week' or 'month'"))
   }
+  
+  run_contingency_tables_many_periods(params, aggs[[params$aggregate_range]])
 }
 
 
@@ -82,17 +61,18 @@ run_contingency_tables_many_periods <- function(params, aggregations)
 {
   if (!is.null(params$n_periods)) {
     msg_plain(paste0("Producing CSVs for ", params$n_periods, " time periods"))
-
+    
+    params$end_date <- ifelse(
+      is.null(params$end_date), as.character(Sys.Date()), params$end_date
+    )
+    
+    # Make list of dates to aggregate over.
     if (params$aggregate_range == "month") {
       period_step <- months(1)
     } else {
       period_step <- days(7)
     }
 
-    params$end_date <- ifelse(
-      is.null(params$end_date), as.character(Sys.Date()), params$end_date
-    )
-    # Make list of dates to aggregate over.
     end_dates <- as.character(sort(
       ymd(params$end_date) - period_step * seq(0, params$n_periods - 1)
     ))
 
@@ -180,16 +180,16 @@ code_health <- function(input_data, wave) {
   if ("C1" %in% names(input_data)) {
     comorbidities <- split_options(input_data$C1)
 
-    input_data$comorbidheartdisease <- is_selected(comorbidities, 3)
-    input_data$comorbidcancer <- is_selected(comorbidities, 2)
-    input_data$comorbidkidneydisease <- is_selected(comorbidities, 7)
-    input_data$comorbidlungdisease <- is_selected(comorbidities, 6)
+    input_data$comorbidheartdisease <- is_selected(comorbidities, "3")
+    input_data$comorbidcancer <- is_selected(comorbidities, "2")
+    input_data$comorbidkidneydisease <- is_selected(comorbidities, "7")
+    input_data$comorbidlungdisease <- is_selected(comorbidities, "6")
     input_data$comorbiddiabetes <-
-      is_selected(comorbidities, 1) |
-      is_selected(comorbidities, 12) |
-      is_selected(comorbidities, 10)
-    input_data$comorbidimmuno <- is_selected(comorbidities, 11)
-    input_data$comorbidobese <- is_selected(comorbidities, 13)
+      is_selected(comorbidities, "1") |
+      is_selected(comorbidities, "12") |
+      is_selected(comorbidities, "10")
+    input_data$comorbidimmuno <- is_selected(comorbidities, "11")
+    input_data$comorbidobese <- is_selected(comorbidities, "13")
 
     # Combo vaccine-eligibility
     input_data$eligible <- 
 
@@ -114,8 +114,8 @@ add_geo_vars <- function(data, params, geo_type) {
 
   # Insert the geographic variables in place of the "geo_id" variable.
   index <- which(names(data) == "geo_id")
-  before <- if (index > 1) data[1:(index-1)] else NULL
-  after <- data[(index+1):ncol(data)]
+  before <- if (index > 1) data[, 1:(index-1)] else NULL
+  after <- data[, (index+1):ncol(data)]
   result <- bind_cols(before, geo_vars, after)
 
   return(result)
 
@@ -1,6 +1,6 @@
 # Time zone to use throughout package.
 tz_to <- "America/Los_Angeles"
-wave6_mod_date <- ymd("2021-01-06", tz=tz_to)
+wave6_mod_date <- lubridate::ymd("2021-01-06", tz=tz_to)
 
 #' Get the date of the first day of the previous month.
 #'
 
@@ -17,12 +17,10 @@
 #' @importFrom parallel mclapply
 #' @export
 load_responses_all <- function(params, contingency_run = FALSE) {
-  input_data <- vector("list", length(params$input))
-  
   msg_plain(paste0("Loading ", length(params$input), " CSVs"))
 
   map_fn <- if (params$parallel) { mclapply } else { lapply }
-  input_data <- map_fn(seq_along(input_data), function(i) {
+  input_data <- map_fn(seq_along(params$input), function(i) {
     load_response_one(params$input[i], params, contingency_run)
   })
 
@@ -58,7 +56,7 @@ load_response_one <- function(input_filename, params, contingency_run) {
 
   col_names <- stri_split(read_lines(full_path, n_max = 1L), fixed = ",")[[1]]
   col_names <- stri_replace_all(col_names, "", fixed = "\"")
-
+  
   ## The CSVs have some columns with comma-separated fields showing which of
   ## multiple options a user selected; readr would interpret these as thousands
   ## separators by default, so we tell it that no thousands separators are used.
@@ -363,7 +361,7 @@ filter_data_for_aggregation <- function(df, params, lead_days = 12L)
                dplyr::between(.data$hh_number_sick, 0L, 30L),
                dplyr::between(.data$hh_number_total, 1L, 30L),
                .data$hh_number_sick <= .data$hh_number_total,
-               .data$day >= (as.Date(params$start_date) - lead_days),
+               .data$day >= (as.Date(params$start_date) - lead_days)
   )
 
   msg_plain(paste0("Finished filtering data for aggregations"))
@@ -612,7 +610,7 @@ surveyID_to_wave <- Vectorize(function(surveyID) {
                 "SV_6PADB8DyF9SIyXk" = 10,
                 "SV_4VEaeffqQtDo33M" = 11)
 
-  if (surveyID %in% names(waves)) {
+  if ( any(names(waves) == surveyID) ) {
       return(waves[[surveyID]])
   }
Original file line number	Diff line number	Diff line change
`@@ -2,3 +2,4 @@ library(delphiFacebook)`
`2`	`2`
`3`	`3`	`params <- read_contingency_params("params.json")`
`4`	`4`	`run_contingency_tables(params)`
	`5`	`+message("run_contingency_tables completed successfully")`