
Commit cdf6995

Merge pull request #950 from cmu-delphi/main
Deploy new retired signals to sircal (nmf, surveys local-health) to production
2 parents: 324a2f8 + 0cca967

23 files changed: +356 -196 lines

ansible/templates/safegraph-params-prod.json.j2

Lines changed: 26 additions & 2 deletions
@@ -14,7 +14,31 @@
     "sync": true,
     "wip_signal" : []
   },
-  "archive": {
-    "cache_dir": "./cache"
+  "validation": {
+    "common": {
+      "data_source": "safegraph",
+      "span_length": 14,
+      "end_date": "today",
+      "suppressed_errors": [
+        {"signal": "bars_visit_num"},
+        {"signal": "bars_visit_prop"},
+        {"signal": "restaurants_visit_num"},
+        {"signal": "restaurants_visit_prop"}
+      ]
+    },
+    "static": {
+      "minimum_sample_size": 100,
+      "missing_se_allowed": false,
+      "missing_sample_size_allowed": false
+    },
+    "dynamic": {
+      "ref_window_size": 7,
+      "smoothed_signals": [
+        "completely_home_prop_7dav",
+        "full_time_work_prop_7dav",
+        "part_time_work_prop_7dav",
+        "median_home_dwell_time_7dav"
+      ]
+    }
   }
 }
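
The new validation block follows the common/static/dynamic layout shown above, replacing the archive step. As a rough sketch of how the suppressed_errors entries get used -- illustrative stdlib code, not the actual validator:

    import json

    # Illustrative subset of the rendered params file above.
    params = json.loads("""
    {
      "validation": {
        "common": {
          "suppressed_errors": [
            {"signal": "bars_visit_num"},
            {"signal": "bars_visit_prop"}
          ]
        }
      }
    }
    """)

    # Errors raised for these signals are recorded but not treated as failures.
    suppressed = {e["signal"] for e in params["validation"]["common"]["suppressed_errors"]}
    print("bars_visit_num" in suppressed)             # True
    print("completely_home_prop_7dav" in suppressed)  # False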

ansible/templates/sir_complainsalot-params-prod.json.j2

Lines changed: 2 additions & 2 deletions
@@ -34,12 +34,12 @@
   "fb-survey": {
     "max_age": 3,
     "maintainers": ["U01069KCRS7"],
-    "retired-signals": ["smoothed_anxious_5d", "smoothed_wanxious_5d", "smoothed_depressed_5d", "smoothed_wdepressed_5d", "smoothed_felt_isolated_5d", "smoothed_wfelt_isolated_5d", "smoothed_large_event_1d", "smoothed_wlarge_event_1d", "smoothed_restaurant_1d", "smoothed_wrestaurant_1d", "smoothed_shop_1d", "smoothed_wshop_1d", "smoothed_spent_time_1d", "smoothed_wspent_time_1d", "smoothed_travel_outside_state_5d", "smoothed_wtravel_outside_state_5d", "smoothed_work_outside_home_1d", "smoothed_wwork_outside_home_1d", "smoothed_wearing_mask", "smoothed_wwearing_mask"]
+    "retired-signals": ["smoothed_anxious_5d", "smoothed_wanxious_5d", "smoothed_depressed_5d", "smoothed_wdepressed_5d", "smoothed_felt_isolated_5d", "smoothed_wfelt_isolated_5d", "smoothed_large_event_1d", "smoothed_wlarge_event_1d", "smoothed_restaurant_1d", "smoothed_wrestaurant_1d", "smoothed_shop_1d", "smoothed_wshop_1d", "smoothed_spent_time_1d", "smoothed_wspent_time_1d", "smoothed_travel_outside_state_5d", "smoothed_wtravel_outside_state_5d", "smoothed_work_outside_home_1d", "smoothed_wwork_outside_home_1d", "smoothed_wearing_mask", "smoothed_wwearing_mask", "smoothed_vaccine_likely_local_health", "smoothed_wvaccine_likely_local_health"]
   },
   "indicator-combination": {
     "max_age": 4,
     "maintainers": ["U01AP8GSWG3","U01069KCRS7"],
-    "retired-signals": ["nmf_day_doc_fbs_ght"]
+    "retired-signals": ["nmf_day_doc_fbs_ght", "nmf_day_doc_fbc_fbs_ght"]
   },
   "quidel": {
     "max_age":6,

claims_hosp/delphi_claims_hosp/config.py

Lines changed: 1 addition & 1 deletion
@@ -63,7 +63,7 @@ class GeoConstants:
     NUM_COUNTIES = 3141 + 52
     NUM_HRRS = 308
     NUM_MSAS = 392 + 52 # MSA + States
-    NUM_STATES = 52 # including DC and PR
+    NUM_STATES = 54 # including DC, PR, VI, GU
     NUM_HHSS = 10
     NUM_NATIONS = 1
 
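
Bumping NUM_STATES keeps the constant in step with state-level coverage that now includes VI and GU. These GeoConstants read like bounds for output sanity checks; a hypothetical example of such a check, not taken from claims_hosp:

    # Illustrative sanity check: a state-level output should never contain
    # more distinct geo codes than the known universe of state-like units.
    NUM_STATES = 54  # 50 states plus DC, PR, VI, GU

    def check_state_output(geo_ids):
        unique = set(geo_ids)
        if len(unique) > NUM_STATES:
            raise ValueError(f"got {len(unique)} states, expected at most {NUM_STATES}")

    check_state_output(["pa", "ny", "dc", "pr"])  # passes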

claims_hosp/params.json.template

Lines changed: 1 addition & 1 deletion
@@ -8,7 +8,7 @@
     "start_date": "2020-02-01",
     "end_date": null,
     "drop_date": null,
-    "n_backfill_days": 60,
+    "n_backfill_days": 70,
     "n_waiting_days": 3,
     "write_se": false,
     "obfuscated_prefix": "foo_obfuscated",

doctor_visits/delphi_doctor_visits/geo_maps.py

Lines changed: 43 additions & 0 deletions
@@ -7,6 +7,7 @@
 Created: 2020-04-18
 Last modified: 2020-04-30 by Aaron Rumack (add megacounty code)
 """
+from functools import partial
 
 import pandas as pd
 from delphi_utils.geomap import GeoMapper
@@ -20,6 +21,14 @@ class GeoMaps:
     def __init__(self):
         """Create the underlying GeoMapper."""
         self.gmpr = GeoMapper()
+        self.geo_func = {"county": partial(self.county_to_megacounty,
+                                           threshold_visits=Config.MIN_RECENT_VISITS,
+                                           threshold_len=Config.RECENT_LENGTH),
+                         "state": self.county_to_state,
+                         "msa": self.county_to_msa,
+                         "hrr": self.county_to_hrr,
+                         "hhs": self.county_to_hhs,
+                         "nation": self.county_to_nation}
 
     @staticmethod
     def convert_fips(x):
@@ -61,6 +70,40 @@ def county_to_state(self, data):
 
         return data.groupby("state_id"), "state_id"
 
+    def county_to_hhs(self, data):
+        """Aggregate county data to the HHS region resolution.
+
+        Args:
+            data: dataframe aggregated to the daily-county resolution (all 7 cols expected)
+
+        Returns: tuple of dataframe at the daily-HHS resolution, and geo_id column name
+        """
+        data = self.gmpr.add_geocode(data,
+                                     "fips",
+                                     "hhs",
+                                     from_col="PatCountyFIPS")
+        data.drop(columns="PatCountyFIPS", inplace=True)
+        data = data.groupby(["ServiceDate", "hhs"]).sum().reset_index()
+
+        return data.groupby("hhs"), "hhs"
+
+    def county_to_nation(self, data):
+        """Aggregate county data to the nation resolution.
+
+        Args:
+            data: dataframe aggregated to the daily-county resolution (all 7 cols expected)
+
+        Returns: tuple of dataframe at the daily-nation resolution, and geo_id column name
+        """
+        data = self.gmpr.add_geocode(data,
+                                     "fips",
+                                     "nation",
+                                     from_col="PatCountyFIPS")
+        data.drop(columns="PatCountyFIPS", inplace=True)
+        data = data.groupby(["ServiceDate", "nation"]).sum().reset_index()
+
+        return data.groupby("nation"), "nation"
+
     def county_to_hrr(self, data):
         """Aggregate county data to the HRR resolution.
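
The new geo_func table replaces branch-per-geo dispatch: functools.partial bakes the megacounty thresholds into the county entry, so every value in the dict can be called uniformly as mapping_func(data) (see the update_sensor.py diff below). A self-contained sketch of the pattern, with a toy class standing in for GeoMaps:

    from functools import partial

    class Mapper:
        """Toy stand-in for GeoMaps: every dispatch entry takes just `data`."""
        def __init__(self):
            self.geo_func = {
                "county": partial(self.to_megacounty, threshold=100),
                "state": self.to_state,
            }

        def to_megacounty(self, data, threshold):
            return [f"mega:{x}" for x in data if x >= threshold]

        def to_state(self, data):
            return [f"state:{x}" for x in data]

    m = Mapper()
    print(m.geo_func["county"]([50, 150]))  # ['mega:150']
    print(m.geo_func["state"]([50, 150]))   # ['state:50', 'state:150']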

doctor_visits/delphi_doctor_visits/run.py

Lines changed: 6 additions & 6 deletions
@@ -61,14 +61,14 @@ def run_module(params):
     startdate_dt = enddate_dt - timedelta(days=n_backfill_days)
     enddate = str(enddate_dt.date())
     startdate = str(startdate_dt.date())
-    logging.info("drop date:\t\t{dropdate}")
-    logging.info("first sensor date:\t{startdate}")
-    logging.info("last sensor date:\t{enddate}")
-    logging.info("n_backfill_days:\t{n_backfill_days}")
-    logging.info("n_waiting_days:\t{n_waiting_days}")
+    logging.info("drop date:\t\t%s", dropdate)
+    logging.info("first sensor date:\t%s", startdate)
+    logging.info("last sensor date:\t%s", enddate)
+    logging.info("n_backfill_days:\t%s", n_backfill_days)
+    logging.info("n_waiting_days:\t%s", n_waiting_days)
 
     ## geographies
-    geos = ["state", "msa", "hrr", "county"]
+    geos = ["state", "msa", "hrr", "county", "hhs", "nation"]
 
 
     ## print out other vars
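
The logging fix is the notable part: the old calls used f-string placeholders without the f prefix, so the literal text {dropdate} was written to the log. The replacement uses logging's lazy %-style, where the logging module interpolates the values (and only when the level is enabled). A runnable illustration with made-up dates, also showing how n_backfill_days (raised to 70 in the param templates above) sets the first sensor date:

    import logging
    from datetime import datetime, timedelta

    logging.basicConfig(level=logging.INFO)

    n_backfill_days = 70  # matches the new template value
    enddate_dt = datetime(2021, 4, 1)
    startdate_dt = enddate_dt - timedelta(days=n_backfill_days)

    # Bug: no f prefix, so the placeholder is logged verbatim.
    logging.info("first sensor date:\t{startdate_dt}")
    # Fix: lazy %-formatting; the value is substituted by the logging module.
    logging.info("first sensor date:\t%s", startdate_dt.date())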

doctor_visits/delphi_doctor_visits/update_sensor.py

Lines changed: 3 additions & 14 deletions
@@ -78,7 +78,7 @@ def update_sensor(
         startdate: first sensor date (YYYY-mm-dd)
         enddate: last sensor date (YYYY-mm-dd)
         dropdate: data drop date (YYYY-mm-dd)
-        geo: geographic resolution, one of ["county", "state", "msa", "hrr"]
+        geo: geographic resolution, one of ["county", "state", "msa", "hrr", "nation", "hhs"]
         parallel: boolean to run the sensor update in parallel
         weekday: boolean to adjust for weekday effects
         se: boolean to write out standard errors, if true, use an obfuscated name
@@ -132,19 +132,8 @@ def update_sensor(
 
     # get right geography
     geo_map = GeoMaps()
-    if geo.lower() == "county":
-        data_groups, _ = geo_map.county_to_megacounty(
-            data, Config.MIN_RECENT_VISITS, Config.RECENT_LENGTH
-        )
-    elif geo.lower() == "state":
-        data_groups, _ = geo_map.county_to_state(data)
-    elif geo.lower() == "msa":
-        data_groups, _ = geo_map.county_to_msa(data)
-    elif geo.lower() == "hrr":
-        data_groups, _ = geo_map.county_to_hrr(data)
-    else:
-        logging.error(f"{geo} is invalid, pick one of 'county', 'state', 'msa', 'hrr'")
-        return {}
+    mapping_func = geo_map.geo_func[geo.lower()]
+    data_groups, _ = mapping_func(data)
     unique_geo_ids = list(data_groups.groups.keys())
 
     # run sensor fitting code (maybe in parallel)
# run sensor fitting code (maybe in parallel)

doctor_visits/params.json.template

Lines changed: 1 addition & 1 deletion
@@ -5,7 +5,7 @@
   "indicator": {
     "input_file": "./input/SYNEDI_AGG_OUTPATIENT_18052020_1455CDT.csv.gz",
     "drop_date": "",
-    "n_backfill_days": 60,
+    "n_backfill_days": 70,
     "n_waiting_days": 3,
     "weekday": [true, false],
     "se": false,

facebook/delphiFacebook/NAMESPACE

Lines changed: 0 additions & 1 deletion
@@ -17,7 +17,6 @@ export(filter_complete_responses)
 export(filter_data_for_aggregation)
 export(filter_responses)
 export(floor_epiweek)
-export(get_date_range_from_filenames)
 export(get_filenames_in_range)
 export(get_range_prev_full_month)
 export(get_range_prev_full_period)

facebook/delphiFacebook/R/contingency_aggregate.R

Lines changed: 46 additions & 32 deletions
@@ -24,7 +24,7 @@
 #' @return none
 #'
 #' @import data.table
-#' @importFrom dplyr full_join %>%
+#' @importFrom dplyr full_join %>% select all_of
 #' @importFrom purrr reduce
 #'
 #' @export
@@ -44,9 +44,18 @@ produce_aggregates <- function(df, aggregations, cw_list, params) {
   df <- output[[1]]
   aggregations <- output[[2]]
 
+  ## Keep only columns used in indicators, plus supporting columns.
+  group_vars <- unique( unlist(aggregations$group_by) )
+  df <- select(df,
+               all_of(unique(aggregations$metric)),
+               all_of(unique(aggregations$var_weight)),
+               all_of( group_vars[group_vars != "geo_id"] ),
+               zip5,
+               start_dt)
+
   agg_groups <- unique(aggregations[c("group_by", "geo_level")])
 
-  # For each unique combination of groupby_vars and geo level, run aggregation process once
+  # For each unique combination of group_vars and geo level, run aggregation process once
   # and calculate all desired aggregations on the grouping. Rename columns. Save
   # to individual files
   for (group_ind in seq_along(agg_groups$group_by)) {
@@ -158,37 +167,43 @@ post_process_aggs <- function(df, aggregations, cw_list) {
   # - multi-select items are converted to a series of binary columns, one for
   #   each unique level/response code; multi-select used for grouping are left as-is.
   # - multiple choice items are left as-is
-
+
   #### TODO: How do we want to handle multi-select items when used for grouping?
-  agg_groups <- unique(aggregations$group_by)
-  group_cols_to_convert <- unique(do.call(c, agg_groups))
-  group_cols_to_convert <- group_cols_to_convert[startsWith(group_cols_to_convert, "b_")]
-
-  metric_cols_to_convert <- unique(aggregations$metric)
-
-  for (col_var in c(group_cols_to_convert, metric_cols_to_convert)) {
-    if ( is.null(df[[col_var]]) ) {
-      aggregations <- aggregations[aggregations$metric != col_var &
-                                     !mapply(aggregations$group_by,
-                                             FUN=function(x) {col_var %in% x}), ]
-      msg_plain(
-        paste0(
-          col_var, " is not defined. Removing all aggregations that use it. ",
-          nrow(aggregations), " remaining")
-      )
+  group_vars <- unique( unlist(aggregations$group_by) )
+  group_vars <- group_vars[group_vars != "geo_id"]
+
+  metric_cols <- unique(aggregations$metric)
+
+  cols_check_available <- unique(c(group_vars, metric_cols))
+  available <- cols_check_available %in% names(df)
+  cols_not_available <- cols_check_available[ !available ]
+  for (col_var in cols_not_available) {
+    # Remove from aggregations
+    aggregations <- aggregations[aggregations$metric != col_var &
+                                   !mapply(aggregations$group_by,
+                                           FUN=function(x) {col_var %in% x}), ]
+    msg_plain(paste0(
+      col_var, " is not defined. Removing all aggregations that use it. ",
+      nrow(aggregations), " remaining")
+    )
+  }
+
+  cols_available <- cols_check_available[ available ]
+  for (col_var in cols_available) {
+    if ( col_var %in% group_vars & !(col_var %in% metric_cols) & !startsWith(col_var, "b_") ) {
       next
     }
 
     if (startsWith(col_var, "b_")) { # Binary
       output <- code_binary(df, aggregations, col_var)
-    } else if (startsWith(col_var, "ms_")) { # Multiselect
-      output <- code_multiselect(df, aggregations, col_var)
     } else if (startsWith(col_var, "n_")) { # Numeric free response
      output <- code_numeric_freeresponse(df, aggregations, col_var)
-    } else if (startsWith(col_var, "mc_")) { # Multiple choice
+    } else if (startsWith(col_var, "ms_")) { # Multi-select
+      output <- code_multiselect(df, aggregations, col_var)
+    } else {
+      # Multiple choice and variables that are formatted differently
       output <- list(df, aggregations)
     }
-
     df <- output[[1]]
     aggregations <- output[[2]]
   }
@@ -233,28 +248,27 @@ summarize_aggs <- function(df, crosswalk_data, aggregations, geo_level, params)
   ## inefficient; profiling shows the cost to be negligible, so shut it up
   df <- suppressWarnings(inner_join(df, crosswalk_data, by = "zip5"))
 
-  groupby_vars <- aggregations$group_by[[1]]
+  group_vars <- aggregations$group_by[[1]]
 
-  if (all(groupby_vars %in% names(df))) {
-    unique_group_combos <- unique(df[, groupby_vars, with=FALSE])
+  if (all(group_vars %in% names(df))) {
+    unique_group_combos <- unique(df[, group_vars, with=FALSE])
     unique_group_combos <- unique_group_combos[complete.cases(unique_group_combos)]
   } else {
     msg_plain(
       sprintf(
         "not all of groupby columns %s available in data; skipping aggregation",
-        paste(groupby_vars, collapse=", ")
+        paste(group_vars, collapse=", ")
       ))
   }
 
   if ( !exists("unique_group_combos") || nrow(unique_group_combos) == 0 ) {
     return(list())
   }
 
-
   ## Set an index on the groupby var columns so that the groupby step can be
   ## faster; data.table stores the sort order of the column and
   ## uses a binary search to find matching values, rather than a linear scan.
-  setindexv(df, groupby_vars)
+  setindexv(df, group_vars)
 
   calculate_group <- function(ii) {
     target_group <- unique_group_combos[ii]
@@ -287,15 +301,15 @@ summarize_aggs <- function(df, crosswalk_data, aggregations, geo_level, params)
   ## Do post-processing.
   for (row in seq_len(nrow(aggregations))) {
     aggregation <- aggregations$id[row]
-    groupby_vars <- aggregations$group_by[[row]]
+    group_vars <- aggregations$group_by[[row]]
     post_fn <- aggregations$post_fn[[row]]
 
     dfs_out[[aggregation]] <- dfs_out[[aggregation]][
-      rowSums(is.na(dfs_out[[aggregation]][, c("val", "sample_size", groupby_vars)])) == 0,
+      rowSums(is.na(dfs_out[[aggregation]][, c("val", "sample_size", group_vars)])) == 0,
     ]
 
     if (geo_level == "county") {
-      df_megacounties <- megacounty(dfs_out[[aggregation]], params$num_filter, groupby_vars)
+      df_megacounties <- megacounty(dfs_out[[aggregation]], params$num_filter, group_vars)
       dfs_out[[aggregation]] <- bind_rows(dfs_out[[aggregation]], df_megacounties)
     }
