cmu-delphi · krivard · Feb 8, 2022 · Jan 11, 2022 · Jan 14, 2022 · Jan 14, 2022
diff --git a/quidel_covidtest/.pylintrc b/quidel_covidtest/.pylintrc
@@ -4,6 +4,7 @@
 disable=logging-format-interpolation,
     too-many-locals,
     too-many-arguments,
+    too-many-branches,
     # Allow pytest functions to be part of a class.
     no-self-use,
     # Allow pytest classes to have one test.

diff --git a/quidel_covidtest/delphi_quidel_covidtest/constants.py b/quidel_covidtest/delphi_quidel_covidtest/constants.py
@@ -3,7 +3,7 @@
 MIN_OBS = 50  # minimum number of observations in order to compute a proportion.
 POOL_DAYS = 7  # number of days in the past (including today) to pool over
 END_FROM_TODAY_MINUS = 5 # report data until - X days
-# Signal names
+# Signal Types
 SMOOTHED_POSITIVE = "covid_ag_smoothed_pct_positive"
 RAW_POSITIVE = "covid_ag_raw_pct_positive"
 SMOOTHED_TEST_PER_DEVICE = "covid_ag_smoothed_test_per_device"
@@ -22,6 +22,7 @@
     HRR,
 ]
 
+# state should be the last entry in this list
 NONPARENT_GEO_RESOLUTIONS = [
     HHS,
     NATION,
@@ -39,3 +40,12 @@
 #    SMOOTHED_TEST_PER_DEVICE: (True, True),
 #    RAW_TEST_PER_DEVICE: (True, False)
 }
+AGE_GROUPS = [
+    "total",
+    "age_0_4",
+    "age_5_17",
+    "age_18_49",
+    "age_50_64",
+    "age_65plus",
+    "age_0_17",
+]
diff --git a/quidel_covidtest/delphi_quidel_covidtest/data_tools.py b/quidel_covidtest/delphi_quidel_covidtest/data_tools.py
@@ -67,15 +67,14 @@ def _slide_window_sum(arr, k):
     sarr = np.convolve(temp, np.ones(k, dtype=int), 'valid')
     return sarr
 
-
 def _geographical_pooling(tpooled_tests, tpooled_ptests, min_obs):
     """
     Determine how many samples from the parent geography must be borrowed.
 
-    If there are no samples available in the parent, the borrow_prop is 0.  If
-    the parent does not have enough samples, we return a borrow_prop of 1, and
-    the fact that the pooled samples are insufficient are handled in the
-    statistic fitting step.
+    If there are no samples available in the parent, the borrow_prop is 0.
+    If the parent does not have enough samples, we return a borrow_prop of 1.
+    We never borrow more samples from the parent than the number of samples
+    we currently have.
 
     Args:
         tpooled_tests: np.ndarray[float]
@@ -93,10 +92,12 @@ def _geographical_pooling(tpooled_tests, tpooled_ptests, min_obs):
     """
     if (np.any(np.isnan(tpooled_tests)) or np.any(np.isnan(tpooled_ptests))):
         raise ValueError('[parent] tests should be non-negative '
-                         'with no np.nan')
+                          'with no np.nan')
     # STEP 1: "TOP UP" USING PARENT LOCATION
     # Number of observations we need to borrow to "top up"
+    # Can't borrow more than the number of observations we already have.
     borrow_tests = np.maximum(min_obs - tpooled_tests, 0)
+    borrow_tests = np.minimum(borrow_tests, tpooled_tests)
     # There are many cases (a, b > 0):
     # Case 1: a / b => no problem
     # Case 2: a / 0 => np.inf => borrow_prop becomes 1
@@ -108,13 +109,14 @@ def _geographical_pooling(tpooled_tests, tpooled_ptests, min_obs):
     with np.errstate(divide='ignore', invalid='ignore'):
         borrow_prop = borrow_tests / tpooled_ptests
         # If there's nothing to borrow, then ya can't borrow
-        borrow_prop[np.isnan(borrow_prop)] = 0
-        # Can't borrow more than total no. observations.
+        borrow_prop[(np.isnan(borrow_prop))
+                    | (tpooled_tests == 0)
+                    | (tpooled_ptests == 0)] = 0
+        # Can't borrow more than total no. observations in the parent state
         # Relies on the fact that np.inf > 1
         borrow_prop[borrow_prop > 1] = 1
     return borrow_prop
 
-
 def raw_positive_prop(positives, tests, min_obs):
     """
     Calculate the proportion of positive tests without any temporal smoothing.

diff --git a/quidel_covidtest/delphi_quidel_covidtest/generate_sensor.py b/quidel_covidtest/delphi_quidel_covidtest/generate_sensor.py
@@ -6,11 +6,12 @@
                          smoothed_tests_per_device,
                          raw_tests_per_device,
                          remove_null_samples)
+from .geo_maps import add_megacounties
 
-MIN_OBS = 50  # minimum number of observations in order to compute a proportion.
-POOL_DAYS = 7
+from .constants import (MIN_OBS, POOL_DAYS)
 
-def generate_sensor_for_nonparent_geo(state_groups, res_key, smooth, device, first_date, last_date):
+def generate_sensor_for_nonparent_geo(state_groups, res_key, smooth, device,
+                                      first_date, last_date, suffix):
     """
     Fit over geo resolutions that don't use a parent state (nation/hhs/state).
 
@@ -21,6 +22,8 @@ def generate_sensor_for_nonparent_geo(state_groups, res_key, smooth, device, fir
             Consider raw or smooth
         device: bool
             Consider test_per_device or pct_positive
+        suffix: str
+            Age group suffix of the data columns to use
     Returns:
         df: pd.DataFrame
     """
@@ -35,27 +38,27 @@ def generate_sensor_for_nonparent_geo(state_groups, res_key, smooth, device, fir
         # smoothed test per device
         if device & smooth:
             stat, se, sample_size = smoothed_tests_per_device(
-                devices=state_group["numUniqueDevices"].values,
-                tests=state_group['totalTest'].values,
+                devices=state_group[f"numUniqueDevices_{suffix}"].values,
+                tests=state_group[f'totalTest_{suffix}'].values,
                 min_obs=MIN_OBS, pool_days=POOL_DAYS)
         # raw test per device
         elif device & (not smooth):
             stat, se, sample_size = raw_tests_per_device(
-                devices=state_group["numUniqueDevices"].values,
-                tests=state_group['totalTest'].values,
+                devices=state_group[f"numUniqueDevices_{suffix}"].values,
+                tests=state_group[f'totalTest_{suffix}'].values,
                 min_obs=MIN_OBS)
         # smoothed pct positive
         elif (not device) & smooth:
             stat, se, sample_size = smoothed_positive_prop(
-                tests=state_group['totalTest'].values,
-                positives=state_group['positiveTest'].values,
+                tests=state_group[f'totalTest_{suffix}'].values,
+                positives=state_group[f'positiveTest_{suffix}'].values,
                 min_obs=MIN_OBS, pool_days=POOL_DAYS)
             stat = stat * 100
         # raw pct positive
         else:
             stat, se, sample_size = raw_positive_prop(
-                tests=state_group['totalTest'].values,
-                positives=state_group['positiveTest'].values,
+                tests=state_group[f'totalTest_{suffix}'].values,
+                positives=state_group[f'positiveTest_{suffix}'].values,
                 min_obs=MIN_OBS)
             stat = stat * 100
 
@@ -68,7 +71,7 @@ def generate_sensor_for_nonparent_geo(state_groups, res_key, smooth, device, fir
     return remove_null_samples(state_df)
 
 def generate_sensor_for_parent_geo(state_groups, data, res_key, smooth,
-                                   device, first_date, last_date):
+                                   device, first_date, last_date, suffix):
     """
     Fit over geo resolutions that use a parent state (county/hrr/msa).
 
@@ -79,15 +82,16 @@ def generate_sensor_for_parent_geo(state_groups, data, res_key, smooth,
             Consider raw or smooth
         device: bool
             Consider test_per_device or pct_positive
+        suffix: str
+            Age group suffix of the data columns to use
     Returns:
         df: pd.DataFrame
     """
     has_parent = True
     res_df = pd.DataFrame(columns=["geo_id", "val", "se", "sample_size"])
-    res_groups = data.groupby(res_key)
-    loc_list = list(res_groups.groups.keys())
-    for loc in loc_list:
-        res_group = res_groups.get_group(loc)
+    if res_key == "fips": # Add rest-of-state report for county level
+        data = add_megacounties(data, smooth)
+    for loc, res_group in data.groupby(res_key):
         parent_state = res_group['state_id'].values[0]
         try:
             parent_group = state_groups.get_group(parent_state)
@@ -104,41 +108,41 @@ def generate_sensor_for_parent_geo(state_groups, data, res_key, smooth,
             if has_parent:
                 if device:
                     stat, se, sample_size = smoothed_tests_per_device(
-                        devices=res_group["numUniqueDevices"].values,
-                        tests=res_group['totalTest'].values,
+                        devices=res_group[f"numUniqueDevices_{suffix}"].values,
+                        tests=res_group[f'totalTest_{suffix}'].values,
                         min_obs=MIN_OBS, pool_days=POOL_DAYS,
-                        parent_devices=res_group["numUniqueDevices_parent"].values,
-                        parent_tests=res_group["totalTest_parent"].values)
+                        parent_devices=res_group[f"numUniqueDevices_{suffix}_parent"].values,
+                        parent_tests=res_group[f"totalTest_{suffix}_parent"].values)
                 else:
                     stat, se, sample_size = smoothed_positive_prop(
-                        tests=res_group['totalTest'].values,
-                        positives=res_group['positiveTest'].values,
+                        tests=res_group[f'totalTest_{suffix}'].values,
+                        positives=res_group[f'positiveTest_{suffix}'].values,
                         min_obs=MIN_OBS, pool_days=POOL_DAYS,
-                        parent_tests=res_group["totalTest_parent"].values,
-                        parent_positives=res_group['positiveTest_parent'].values)
+                        parent_tests=res_group[f"totalTest_{suffix}_parent"].values,
+                        parent_positives=res_group[f'positiveTest_{suffix}_parent'].values)
                     stat = stat * 100
             else:
                 if device:
                     stat, se, sample_size = smoothed_tests_per_device(
-                        devices=res_group["numUniqueDevices"].values,
-                        tests=res_group['totalTest'].values,
+                        devices=res_group[f"numUniqueDevices_{suffix}"].values,
+                        tests=res_group[f'totalTest_{suffix}'].values,
                         min_obs=MIN_OBS, pool_days=POOL_DAYS)
                 else:
                     stat, se, sample_size = smoothed_positive_prop(
-                        tests=res_group['totalTest'].values,
-                        positives=res_group['positiveTest'].values,
+                        tests=res_group[f'totalTest_{suffix}'].values,
+                        positives=res_group[f'positiveTest_{suffix}'].values,
                         min_obs=MIN_OBS, pool_days=POOL_DAYS)
                     stat = stat * 100
         else:
             if device:
                 stat, se, sample_size = raw_tests_per_device(
-                    devices=res_group["numUniqueDevices"].values,
-                    tests=res_group['totalTest'].values,
+                    devices=res_group[f"numUniqueDevices_{suffix}"].values,
+                    tests=res_group[f'totalTest_{suffix}'].values,
                     min_obs=MIN_OBS)
             else:
                 stat, se, sample_size = raw_positive_prop(
-                    tests=res_group['totalTest'].values,
-                    positives=res_group['positiveTest'].values,
+                    tests=res_group[f'totalTest_{suffix}'].values,
+                    positives=res_group[f'positiveTest_{suffix}'].values,
                     min_obs=MIN_OBS)
                 stat = stat * 100
 

diff --git a/quidel_covidtest/delphi_quidel_covidtest/geo_maps.py b/quidel_covidtest/delphi_quidel_covidtest/geo_maps.py
@@ -1,7 +1,13 @@
 """Contains geographic mapping tools."""
+from itertools import product
+from functools import reduce
+
+import pandas as pd
+
 from delphi_utils import GeoMapper
+from .constants import (AGE_GROUPS, MIN_OBS)
 
-DATA_COLS = ['totalTest', 'numUniqueDevices', 'positiveTest', "population"]
+DATA_COLS = ['totalTest', 'numUniqueDevices', 'positiveTest']
 GMPR = GeoMapper()  # Use geo utils
 GEO_KEY_DICT = {
         "county": "fips",
@@ -12,21 +18,52 @@
         "hhs": "hhs"
 }
 
-
 def geo_map(geo_res, df):
     """Map a geocode to a new value."""
     data = df.copy()
     geo_key = GEO_KEY_DICT[geo_res]
     # Add population for each zipcode
     data = GMPR.add_population_column(data, "zip")
     # zip -> geo_res
-    data = GMPR.replace_geocode(data, "zip", geo_key, data_cols=DATA_COLS)
+    data_cols = ["population"]
+    for col, agegroup in product(DATA_COLS, AGE_GROUPS):
+        data_cols.append("_".join([col, agegroup]))
+
+    data = GMPR.replace_geocode(
+        data, from_code="zip", new_code=geo_key, date_col = "timestamp",
+        data_cols=data_cols)
     if geo_res in ["state", "hhs", "nation"]:
         return data, geo_key
     # Add parent state
     data = add_parent_state(data, geo_res, geo_key)
     return data, geo_key
 
+def add_megacounties(data, smooth=False):
+    """Append megacounty (rest-of-state) rows to county-level data."""
+    assert "fips" in data.columns # Make sure the data is at county level
+
+    # For raw signals, the threshold is MIN_OBS
+    # For smoothed signals, the threshold is MIN_OBS/2
+    if smooth:
+        threshold_visits = MIN_OBS/2
+    else:
+        threshold_visits = MIN_OBS
+    pdList = []
+    for agegroup in AGE_GROUPS:
+        data_cols = [f"{col}_{agegroup}" for col in DATA_COLS]
+        df = GMPR.fips_to_megacounty(data[data_cols + ["timestamp", "fips"]],
+                                     threshold_visits, 1, fips_col="fips",
+                                     thr_col=f"totalTest_{agegroup}",
+                                     date_col="timestamp")
+        df.rename({"megafips": "fips"}, axis=1, inplace=True)
+        megacounties = df[df.fips.str.endswith("000")]
+        pdList.append(megacounties)
+    mega_df = reduce(lambda x, y: pd.merge(
+        x, y, on = ["timestamp", "fips"]), pdList)
+    mega_df = GMPR.add_geocode(mega_df, from_code="fips", new_code="state_id",
+                               from_col="fips", new_col="state_id")
+
+    return pd.concat([data, mega_df])
 
 def add_parent_state(data, geo_res, geo_key):
     """