Skip to content

Update quidel covidtest (Add Age Groups Signals, Add rest-of-state reports) #1467

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 31 commits into from
Feb 8, 2022
Merged
Show file tree
Hide file tree
Changes from 17 commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
20a46d2
add age groups
Jan 11, 2022
801e2f5
update code for adding megacounties
Jan 14, 2022
15f7690
update unit tests
Jan 14, 2022
010491d
get smoothers out of the complicated loop
Jan 14, 2022
d215d6c
fix a linting
Jan 14, 2022
a120175
fix an error
Jan 14, 2022
b199cad
ignore too-many-branches in pylintrc
Jan 14, 2022
e7870e8
fix a linting error
Jan 14, 2022
07642fa
update signal names, add two super age groups
Jan 14, 2022
1ee31ae
fix a linting error
Jan 14, 2022
4eee961
remove 18-64 age group
Jan 18, 2022
a81050c
add whitespace and add comments
Jan 18, 2022
dc06d9c
add tests for ages 0-17
Jan 18, 2022
0c0f9f5
small updates for suggested changes
Jan 18, 2022
1707cfb
add state_id for megacounties
Jan 21, 2022
e1226e1
add tests for state_id
Jan 21, 2022
562773d
Add minimal censored counties test and get error?
dshemetov Jan 21, 2022
ef41f6a
add suggested changes
Jan 21, 2022
c81298b
update unit tests based on the current strategy
Jan 23, 2022
7721290
geo_id should be integers in the unit tests
Jan 23, 2022
b7f94c9
update geographical pooling
Jan 25, 2022
1fdbe49
update unit tests
Jan 25, 2022
52a4aa6
update unit tests in test_run
Jan 25, 2022
9f3f6c3
delete trailing whitespaces
Jan 25, 2022
49af726
Add a few tests to double check county censoring
dshemetov Jan 26, 2022
3218b1e
Remove faux-breakpoint, update test_data, update test_run
dshemetov Jan 26, 2022
5c6d798
fix the test in test_run
Jan 27, 2022
3e9232e
remove the question in comments
Jan 27, 2022
ab25b35
add tests for values
Jan 27, 2022
8fbff93
add archiver section to quidel params
Feb 2, 2022
963bb5a
fix params
Feb 2, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions quidel_covidtest/.pylintrc
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
disable=logging-format-interpolation,
too-many-locals,
too-many-arguments,
too-many-branches,
# Allow pytest functions to be part of a class.
no-self-use,
# Allow pytest classes to have one test.
Expand Down
12 changes: 11 additions & 1 deletion quidel_covidtest/delphi_quidel_covidtest/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
MIN_OBS = 50 # minimum number of observations in order to compute a proportion.
POOL_DAYS = 7 # number of days in the past (including today) to pool over
END_FROM_TODAY_MINUS = 5 # report data until - X days
# Signal names
# Signal Types
SMOOTHED_POSITIVE = "covid_ag_smoothed_pct_positive"
RAW_POSITIVE = "covid_ag_raw_pct_positive"
SMOOTHED_TEST_PER_DEVICE = "covid_ag_smoothed_test_per_device"
Expand All @@ -22,6 +22,7 @@
HRR,
]

# state should be last one
NONPARENT_GEO_RESOLUTIONS = [
HHS,
NATION,
Expand All @@ -39,3 +40,12 @@
# SMOOTHED_TEST_PER_DEVICE: (True, True),
# RAW_TEST_PER_DEVICE: (True, False)
}
# Age-group labels used as suffixes on signal names and on the per-group
# data columns (e.g. "totalTest_age_0_4") built in pull.py.
# "total" covers all ages; "age_0_17" is the union of age_0_4 and age_5_17.
AGE_GROUPS = [
    "total",
    "age_0_4",
    "age_5_17",
    "age_18_49",
    "age_50_64",
    "age_65plus",
    "age_0_17",
]
63 changes: 35 additions & 28 deletions quidel_covidtest/delphi_quidel_covidtest/generate_sensor.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,12 @@
smoothed_tests_per_device,
raw_tests_per_device,
remove_null_samples)
from .geo_maps import add_megacounties

MIN_OBS = 50 # minimum number of observations in order to compute a proportion.
POOL_DAYS = 7
from .constants import (MIN_OBS, POOL_DAYS)

def generate_sensor_for_nonparent_geo(state_groups, res_key, smooth, device, first_date, last_date):
def generate_sensor_for_nonparent_geo(state_groups, res_key, smooth, device,
first_date, last_date, suffix):
"""
Fit over geo resolutions that don't use a parent state (nation/hhs/state).

Expand All @@ -21,6 +22,8 @@ def generate_sensor_for_nonparent_geo(state_groups, res_key, smooth, device, fir
Consider raw or smooth
device: bool
Consider test_per_device or pct_positive
suffix: str
Indicate the age group
Returns:
df: pd.DataFrame
"""
Expand All @@ -35,27 +38,27 @@ def generate_sensor_for_nonparent_geo(state_groups, res_key, smooth, device, fir
# smoothed test per device
if device & smooth:
stat, se, sample_size = smoothed_tests_per_device(
devices=state_group["numUniqueDevices"].values,
tests=state_group['totalTest'].values,
devices=state_group[f"numUniqueDevices_{suffix}"].values,
tests=state_group[f'totalTest_{suffix}'].values,
min_obs=MIN_OBS, pool_days=POOL_DAYS)
# raw test per device
elif device & (not smooth):
stat, se, sample_size = raw_tests_per_device(
devices=state_group["numUniqueDevices"].values,
tests=state_group['totalTest'].values,
devices=state_group[f"numUniqueDevices_{suffix}"].values,
tests=state_group[f'totalTest_{suffix}'].values,
min_obs=MIN_OBS)
# smoothed pct positive
elif (not device) & smooth:
stat, se, sample_size = smoothed_positive_prop(
tests=state_group['totalTest'].values,
positives=state_group['positiveTest'].values,
tests=state_group[f'totalTest_{suffix}'].values,
positives=state_group[f'positiveTest_{suffix}'].values,
min_obs=MIN_OBS, pool_days=POOL_DAYS)
stat = stat * 100
# raw pct positive
else:
stat, se, sample_size = raw_positive_prop(
tests=state_group['totalTest'].values,
positives=state_group['positiveTest'].values,
tests=state_group[f'totalTest_{suffix}'].values,
positives=state_group[f'positiveTest_{suffix}'].values,
min_obs=MIN_OBS)
stat = stat * 100

Expand All @@ -68,7 +71,7 @@ def generate_sensor_for_nonparent_geo(state_groups, res_key, smooth, device, fir
return remove_null_samples(state_df)

def generate_sensor_for_parent_geo(state_groups, data, res_key, smooth,
device, first_date, last_date):
device, first_date, last_date, suffix):
"""
Fit over geo resolutions that use a parent state (county/hrr/msa).

Expand All @@ -79,11 +82,15 @@ def generate_sensor_for_parent_geo(state_groups, data, res_key, smooth,
Consider raw or smooth
device: bool
Consider test_per_device or pct_positive
suffix: str
Indicate the age group
Returns:
df: pd.DataFrame
"""
has_parent = True
res_df = pd.DataFrame(columns=["geo_id", "val", "se", "sample_size"])
if res_key == "fips": # Add rest-of-state report for county level
data = add_megacounties(data, smooth)
res_groups = data.groupby(res_key)
loc_list = list(res_groups.groups.keys())
for loc in loc_list:
Expand All @@ -104,41 +111,41 @@ def generate_sensor_for_parent_geo(state_groups, data, res_key, smooth,
if has_parent:
if device:
stat, se, sample_size = smoothed_tests_per_device(
devices=res_group["numUniqueDevices"].values,
tests=res_group['totalTest'].values,
devices=res_group[f"numUniqueDevices_{suffix}"].values,
tests=res_group[f'totalTest_{suffix}'].values,
min_obs=MIN_OBS, pool_days=POOL_DAYS,
parent_devices=res_group["numUniqueDevices_parent"].values,
parent_tests=res_group["totalTest_parent"].values)
parent_devices=res_group[f"numUniqueDevices_{suffix}_parent"].values,
parent_tests=res_group[f"totalTest_{suffix}_parent"].values)
else:
stat, se, sample_size = smoothed_positive_prop(
tests=res_group['totalTest'].values,
positives=res_group['positiveTest'].values,
tests=res_group[f'totalTest_{suffix}'].values,
positives=res_group[f'positiveTest_{suffix}'].values,
min_obs=MIN_OBS, pool_days=POOL_DAYS,
parent_tests=res_group["totalTest_parent"].values,
parent_positives=res_group['positiveTest_parent'].values)
parent_tests=res_group[f"totalTest_{suffix}_parent"].values,
parent_positives=res_group[f'positiveTest_{suffix}_parent'].values)
stat = stat * 100
else:
if device:
stat, se, sample_size = smoothed_tests_per_device(
devices=res_group["numUniqueDevices"].values,
tests=res_group['totalTest'].values,
devices=res_group[f"numUniqueDevices_{suffix}"].values,
tests=res_group[f'totalTest_{suffix}'].values,
min_obs=MIN_OBS, pool_days=POOL_DAYS)
else:
stat, se, sample_size = smoothed_positive_prop(
tests=res_group['totalTest'].values,
positives=res_group['positiveTest'].values,
tests=res_group[f'totalTest_{suffix}'].values,
positives=res_group[f'positiveTest_{suffix}'].values,
min_obs=MIN_OBS, pool_days=POOL_DAYS)
stat = stat * 100
else:
if device:
stat, se, sample_size = raw_tests_per_device(
devices=res_group["numUniqueDevices"].values,
tests=res_group['totalTest'].values,
devices=res_group[f"numUniqueDevices_{suffix}"].values,
tests=res_group[f'totalTest_{suffix}'].values,
min_obs=MIN_OBS)
else:
stat, se, sample_size = raw_positive_prop(
tests=res_group['totalTest'].values,
positives=res_group['positiveTest'].values,
tests=res_group[f'totalTest_{suffix}'].values,
positives=res_group[f'positiveTest_{suffix}'].values,
min_obs=MIN_OBS)
stat = stat * 100

Expand Down
43 changes: 40 additions & 3 deletions quidel_covidtest/delphi_quidel_covidtest/geo_maps.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,13 @@
"""Contains geographic mapping tools."""
from itertools import product
from functools import reduce

import pandas as pd

from delphi_utils import GeoMapper
from .constants import (AGE_GROUPS, MIN_OBS)

DATA_COLS = ['totalTest', 'numUniqueDevices', 'positiveTest', "population"]
DATA_COLS = ['totalTest', 'numUniqueDevices', 'positiveTest']
GMPR = GeoMapper() # Use geo utils
GEO_KEY_DICT = {
"county": "fips",
Expand All @@ -12,21 +18,52 @@
"hhs": "hhs"
}


def geo_map(geo_res, df):
"""Map a geocode to a new value."""
data = df.copy()
geo_key = GEO_KEY_DICT[geo_res]
# Add population for each zipcode
data = GMPR.add_population_column(data, "zip")
# zip -> geo_res
data = GMPR.replace_geocode(data, "zip", geo_key, data_cols=DATA_COLS)
data_cols = ["population"]
for col, agegroup in product(DATA_COLS, AGE_GROUPS):
data_cols.append("_".join([col, agegroup]))

data = GMPR.replace_geocode(
data, from_code="zip", new_code=geo_key, date_col = "timestamp",
data_cols=data_cols)
if geo_res in ["state", "hhs", "nation"]:
return data, geo_key
# Add parent state
data = add_parent_state(data, geo_res, geo_key)
return data, geo_key

def add_megacounties(data, smooth=False):
    """Append rest-of-state "megacounty" rows to a county-level report.

    For each age group, counties whose test volume falls below the reporting
    threshold are pooled by GeoMapper into one per-state megacounty (fips
    ending in "000"). The per-age-group megacounty frames are merged on
    (timestamp, fips), given a state_id, and concatenated onto the input.

    Parameters
    ----------
    data: pd.DataFrame
        County-level data with a "fips" column, a "timestamp" column, and a
        "<col>_<agegroup>" column for every col in DATA_COLS and every age
        group in AGE_GROUPS.
    smooth: bool
        Whether the report is for smoothed signals.
        # NOTE(review): smoothed signals use half the raw threshold
        # (MIN_OBS/2 vs MIN_OBS) — rationale was questioned in review;
        # confirm before relying on it.

    Returns
    -------
    pd.DataFrame
        The original rows followed by the megacounty rows.
    """
    # Explicit guard instead of `assert` so the check survives `python -O`.
    if "fips" not in data.columns:
        raise ValueError("add_megacounties expects county-level data with a 'fips' column")

    # For raw signals the threshold is MIN_OBS; for smoothed, MIN_OBS/2.
    threshold_visits = MIN_OBS / 2 if smooth else MIN_OBS

    mega_frames = []
    for agegroup in AGE_GROUPS:
        age_cols = [f"{col}_{agegroup}" for col in DATA_COLS]
        pooled = GMPR.fips_to_megacounty(
            data[age_cols + ["timestamp", "fips"]],
            threshold_visits, 1, fips_col="fips",
            thr_col=f"totalTest_{agegroup}",
            date_col="timestamp")
        pooled.rename({"megafips": "fips"}, axis=1, inplace=True)
        # Megacounty fips codes end in "000"; keep only those rows.
        mega_frames.append(pooled[pooled.fips.str.endswith("000")])

    # Combine one frame per age group into a single megacounty frame.
    mega_df = reduce(
        lambda left, right: pd.merge(left, right, on=["timestamp", "fips"]),
        mega_frames)
    # Attach the parent state's id to each megacounty row.
    mega_df = GMPR.add_geocode(mega_df, from_code="fips", new_code="state_id",
                               from_col="fips", new_col="state_id")

    return pd.concat([data, mega_df])

def add_parent_state(data, geo_res, geo_key):
"""
Expand Down
57 changes: 54 additions & 3 deletions quidel_covidtest/delphi_quidel_covidtest/pull.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@
import pandas as pd
import numpy as np

from .constants import AGE_GROUPS

def get_from_s3(start_date, end_date, bucket, logger):
"""
Get raw data from aws s3 bucket.
Expand Down Expand Up @@ -163,21 +165,21 @@ def preprocess_new_data(start_date, end_date, params, test_mode, logger):
overall_pos = df[df["OverallResult"] == "positive"].groupby(
by=["timestamp", "zip"],
as_index=False)['OverallResult'].count()
overall_pos["positiveTest"] = overall_pos["OverallResult"]
overall_pos["positiveTest_total"] = overall_pos["OverallResult"]
overall_pos.drop(labels="OverallResult", axis="columns", inplace=True)

# Compute overallTotal
overall_total = df.groupby(
by=["timestamp", "zip"],
as_index=False)['OverallResult'].count()
overall_total["totalTest"] = overall_total["OverallResult"]
overall_total["totalTest_total"] = overall_total["OverallResult"]
overall_total.drop(labels="OverallResult", axis="columns", inplace=True)

# Compute numUniqueDevices
numUniqueDevices = df.groupby(
by=["timestamp", "zip"],
as_index=False)["SofiaSerNum"].agg({"SofiaSerNum": "nunique"}).rename(
columns={"SofiaSerNum": "numUniqueDevices"}
columns={"SofiaSerNum": "numUniqueDevices_total"}
)

df_merged = overall_total.merge(
Expand All @@ -186,6 +188,55 @@ def preprocess_new_data(start_date, end_date, params, test_mode, logger):
overall_pos, on=["timestamp", "zip"], how="left"
).fillna(0).drop_duplicates()

# Compute Summary info for age groups
df["PatientAge"] = df["PatientAge"].fillna(-1)
df.loc[df["PatientAge"] == "<1", "PatientAge"] = 0.5
df.loc[df["PatientAge"] == ">85", "PatientAge"] = 100
df["PatientAge"] = df["PatientAge"] .astype(float)

# Should match the suffixes of signal names
df["label"] = None
df.loc[df["PatientAge"] < 5, "label"] = "age_0_4"
df.loc[((df["PatientAge"] >= 5)) & (df["PatientAge"] < 18), "label"] = "age_5_17"
df.loc[((df["PatientAge"] >= 18)) & (df["PatientAge"] < 50), "label"] = "age_18_49"
df.loc[((df["PatientAge"] >= 50)) & (df["PatientAge"] < 65), "label"] = "age_50_64"
df.loc[(df["PatientAge"] >= 65), "label"] = "age_65plus"
df.loc[df["PatientAge"] == -1, "label"] = "NA"

for agegroup in AGE_GROUPS[1:]: # Exclude total
if agegroup == "age_0_17":
ages = ["age_0_4", "age_5_17"]
else:
ages = [agegroup]
# Compute overallPositive
group_pos = df.loc[(df["OverallResult"] == "positive")
& (df["label"].isin(ages))].groupby(
by=["timestamp", "zip"],
as_index=False)['OverallResult'].count()
group_pos[f"positiveTest_{agegroup}"] = group_pos["OverallResult"]
group_pos.drop(labels="OverallResult", axis="columns", inplace=True)

# Compute overallTotal
group_total = df.loc[df["label"].isin(ages)].groupby(
by=["timestamp", "zip"],
as_index=False)['OverallResult'].count()
group_total[f"totalTest_{agegroup}"] = group_total["OverallResult"]
group_total.drop(labels="OverallResult", axis="columns", inplace=True)

# Compute numUniqueDevices
group_numUniqueDevices = df.loc[df["label"].isin(ages)].groupby(
by=["timestamp", "zip"],
as_index=False)["SofiaSerNum"].agg({"SofiaSerNum": "nunique"}).rename(
columns={"SofiaSerNum": f"numUniqueDevices_{agegroup}"}
)

df_merged = df_merged.merge(
group_numUniqueDevices, on=["timestamp", "zip"], how="left"
).merge(
group_pos, on=["timestamp", "zip"], how="left"
).merge(
group_total, on=["timestamp", "zip"], how="left"
).fillna(0).drop_duplicates()

return df_merged, time_flag

Expand Down
Loading