Skip to content

Update quidel covidtest (Add Age Groups Signals, Add rest-of-state reports) #1467

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 31 commits into from
Feb 8, 2022
Merged
Show file tree
Hide file tree
Changes from 17 commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
20a46d2
add age groups
Jan 11, 2022
801e2f5
update code for adding megacounties
Jan 14, 2022
15f7690
update unit tests
Jan 14, 2022
010491d
get smoothers out of the complicated loop
Jan 14, 2022
d215d6c
fix a linting
Jan 14, 2022
a120175
fix an error
Jan 14, 2022
b199cad
ignore too-many-branches in pylintrc
Jan 14, 2022
e7870e8
fix a linting error
Jan 14, 2022
07642fa
update signal names, add two super age groups
Jan 14, 2022
1ee31ae
fix a linting error
Jan 14, 2022
4eee961
remove 18-64 age group
Jan 18, 2022
a81050c
add whitespace and add comments
Jan 18, 2022
dc06d9c
add tests for ages 0-17
Jan 18, 2022
0c0f9f5
small updates for suggested changes
Jan 18, 2022
1707cfb
add state_id for megacounties
Jan 21, 2022
e1226e1
add tests for state_id
Jan 21, 2022
562773d
Add minimal censored counties test and get error?
dshemetov Jan 21, 2022
ef41f6a
add suggested changes
Jan 21, 2022
c81298b
update unit tests based on the current strategy
Jan 23, 2022
7721290
geo_id should be integers in the unit tests
Jan 23, 2022
b7f94c9
update geographical pooling
Jan 25, 2022
1fdbe49
update unit tests
Jan 25, 2022
52a4aa6
update unit tests in test_run
Jan 25, 2022
9f3f6c3
delete trailing whitespaces
Jan 25, 2022
49af726
Add a few tests to double check county censoring
dshemetov Jan 26, 2022
3218b1e
Remove faux-breakpoint, update test_data, update test_run
dshemetov Jan 26, 2022
5c6d798
fix the test in test_run
Jan 27, 2022
3e9232e
remove the question in comments
Jan 27, 2022
ab25b35
add tests for values
Jan 27, 2022
8fbff93
add archiver section to quidel params
Feb 2, 2022
963bb5a
fix params
Feb 2, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions quidel_covidtest/.pylintrc
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
disable=logging-format-interpolation,
too-many-locals,
too-many-arguments,
too-many-branches,
# Allow pytest functions to be part of a class.
no-self-use,
# Allow pytest classes to have one test.
Expand Down
12 changes: 11 additions & 1 deletion quidel_covidtest/delphi_quidel_covidtest/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
MIN_OBS = 50 # minimum number of observations in order to compute a proportion.
POOL_DAYS = 7 # number of days in the past (including today) to pool over
END_FROM_TODAY_MINUS = 5 # report data until - X days
# Signal names
# Signal Types
SMOOTHED_POSITIVE = "covid_ag_smoothed_pct_positive"
RAW_POSITIVE = "covid_ag_raw_pct_positive"
SMOOTHED_TEST_PER_DEVICE = "covid_ag_smoothed_test_per_device"
Expand All @@ -22,6 +22,7 @@
HRR,
]

# state should be last one
NONPARENT_GEO_RESOLUTIONS = [
HHS,
NATION,
Expand All @@ -39,3 +40,12 @@
# SMOOTHED_TEST_PER_DEVICE: (True, True),
# RAW_TEST_PER_DEVICE: (True, False)
}
# Age-group labels used as suffixes on signal names and on the per-group
# data columns (e.g. "totalTest_age_0_4") built in pull.py.
# "total" covers all ages; "age_0_17" is the union of age_0_4 and age_5_17.
AGE_GROUPS = [
    "total",
    "age_0_4",
    "age_5_17",
    "age_18_49",
    "age_50_64",
    "age_65plus",
    "age_0_17",
]
63 changes: 35 additions & 28 deletions quidel_covidtest/delphi_quidel_covidtest/generate_sensor.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,12 @@
smoothed_tests_per_device,
raw_tests_per_device,
remove_null_samples)
from .geo_maps import add_megacounties

MIN_OBS = 50 # minimum number of observations in order to compute a proportion.
POOL_DAYS = 7
from .constants import (MIN_OBS, POOL_DAYS)

def generate_sensor_for_nonparent_geo(state_groups, res_key, smooth, device, first_date, last_date):
def generate_sensor_for_nonparent_geo(state_groups, res_key, smooth, device,
first_date, last_date, suffix):
"""
Fit over geo resolutions that don't use a parent state (nation/hhs/state).

Expand All @@ -21,6 +22,8 @@ def generate_sensor_for_nonparent_geo(state_groups, res_key, smooth, device, fir
Consider raw or smooth
device: bool
Consider test_per_device or pct_positive
suffix: str
Indicate the age group
Returns:
df: pd.DataFrame
"""
Expand All @@ -35,27 +38,27 @@ def generate_sensor_for_nonparent_geo(state_groups, res_key, smooth, device, fir
# smoothed test per device
if device & smooth:
stat, se, sample_size = smoothed_tests_per_device(
devices=state_group["numUniqueDevices"].values,
tests=state_group['totalTest'].values,
devices=state_group[f"numUniqueDevices_{suffix}"].values,
tests=state_group[f'totalTest_{suffix}'].values,
min_obs=MIN_OBS, pool_days=POOL_DAYS)
# raw test per device
elif device & (not smooth):
stat, se, sample_size = raw_tests_per_device(
devices=state_group["numUniqueDevices"].values,
tests=state_group['totalTest'].values,
devices=state_group[f"numUniqueDevices_{suffix}"].values,
tests=state_group[f'totalTest_{suffix}'].values,
min_obs=MIN_OBS)
# smoothed pct positive
elif (not device) & smooth:
stat, se, sample_size = smoothed_positive_prop(
tests=state_group['totalTest'].values,
positives=state_group['positiveTest'].values,
tests=state_group[f'totalTest_{suffix}'].values,
positives=state_group[f'positiveTest_{suffix}'].values,
min_obs=MIN_OBS, pool_days=POOL_DAYS)
stat = stat * 100
# raw pct positive
else:
stat, se, sample_size = raw_positive_prop(
tests=state_group['totalTest'].values,
positives=state_group['positiveTest'].values,
tests=state_group[f'totalTest_{suffix}'].values,
positives=state_group[f'positiveTest_{suffix}'].values,
min_obs=MIN_OBS)
stat = stat * 100

Expand All @@ -68,7 +71,7 @@ def generate_sensor_for_nonparent_geo(state_groups, res_key, smooth, device, fir
return remove_null_samples(state_df)

def generate_sensor_for_parent_geo(state_groups, data, res_key, smooth,
device, first_date, last_date):
device, first_date, last_date, suffix):
"""
Fit over geo resolutions that use a parent state (county/hrr/msa).

Expand All @@ -79,11 +82,15 @@ def generate_sensor_for_parent_geo(state_groups, data, res_key, smooth,
Consider raw or smooth
device: bool
Consider test_per_device or pct_positive
suffix: str
Indicate the age group
Returns:
df: pd.DataFrame
"""
has_parent = True
res_df = pd.DataFrame(columns=["geo_id", "val", "se", "sample_size"])
if res_key == "fips": # Add rest-of-state report for county level
data = add_megacounties(data, smooth)
res_groups = data.groupby(res_key)
loc_list = list(res_groups.groups.keys())
for loc in loc_list:
Expand All @@ -104,41 +111,41 @@ def generate_sensor_for_parent_geo(state_groups, data, res_key, smooth,
if has_parent:
if device:
stat, se, sample_size = smoothed_tests_per_device(
devices=res_group["numUniqueDevices"].values,
tests=res_group['totalTest'].values,
devices=res_group[f"numUniqueDevices_{suffix}"].values,
tests=res_group[f'totalTest_{suffix}'].values,
min_obs=MIN_OBS, pool_days=POOL_DAYS,
parent_devices=res_group["numUniqueDevices_parent"].values,
parent_tests=res_group["totalTest_parent"].values)
parent_devices=res_group[f"numUniqueDevices_{suffix}_parent"].values,
parent_tests=res_group[f"totalTest_{suffix}_parent"].values)
else:
stat, se, sample_size = smoothed_positive_prop(
tests=res_group['totalTest'].values,
positives=res_group['positiveTest'].values,
tests=res_group[f'totalTest_{suffix}'].values,
positives=res_group[f'positiveTest_{suffix}'].values,
min_obs=MIN_OBS, pool_days=POOL_DAYS,
parent_tests=res_group["totalTest_parent"].values,
parent_positives=res_group['positiveTest_parent'].values)
parent_tests=res_group[f"totalTest_{suffix}_parent"].values,
parent_positives=res_group[f'positiveTest_{suffix}_parent'].values)
stat = stat * 100
else:
if device:
stat, se, sample_size = smoothed_tests_per_device(
devices=res_group["numUniqueDevices"].values,
tests=res_group['totalTest'].values,
devices=res_group[f"numUniqueDevices_{suffix}"].values,
tests=res_group[f'totalTest_{suffix}'].values,
min_obs=MIN_OBS, pool_days=POOL_DAYS)
else:
stat, se, sample_size = smoothed_positive_prop(
tests=res_group['totalTest'].values,
positives=res_group['positiveTest'].values,
tests=res_group[f'totalTest_{suffix}'].values,
positives=res_group[f'positiveTest_{suffix}'].values,
min_obs=MIN_OBS, pool_days=POOL_DAYS)
stat = stat * 100
else:
if device:
stat, se, sample_size = raw_tests_per_device(
devices=res_group["numUniqueDevices"].values,
tests=res_group['totalTest'].values,
devices=res_group[f"numUniqueDevices_{suffix}"].values,
tests=res_group[f'totalTest_{suffix}'].values,
min_obs=MIN_OBS)
else:
stat, se, sample_size = raw_positive_prop(
tests=res_group['totalTest'].values,
positives=res_group['positiveTest'].values,
tests=res_group[f'totalTest_{suffix}'].values,
positives=res_group[f'positiveTest_{suffix}'].values,
min_obs=MIN_OBS)
stat = stat * 100

Expand Down
43 changes: 40 additions & 3 deletions quidel_covidtest/delphi_quidel_covidtest/geo_maps.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,13 @@
"""Contains geographic mapping tools."""
from itertools import product
from functools import reduce

import pandas as pd

from delphi_utils import GeoMapper
from .constants import (AGE_GROUPS, MIN_OBS)

DATA_COLS = ['totalTest', 'numUniqueDevices', 'positiveTest', "population"]
DATA_COLS = ['totalTest', 'numUniqueDevices', 'positiveTest']
GMPR = GeoMapper() # Use geo utils
GEO_KEY_DICT = {
"county": "fips",
Expand All @@ -12,21 +18,52 @@
"hhs": "hhs"
}


def geo_map(geo_res, df):
"""Map a geocode to a new value."""
data = df.copy()
geo_key = GEO_KEY_DICT[geo_res]
# Add population for each zipcode
data = GMPR.add_population_column(data, "zip")
# zip -> geo_res
data = GMPR.replace_geocode(data, "zip", geo_key, data_cols=DATA_COLS)
data_cols = ["population"]
for col, agegroup in product(DATA_COLS, AGE_GROUPS):
data_cols.append("_".join([col, agegroup]))

data = GMPR.replace_geocode(
data, from_code="zip", new_code=geo_key, date_col = "timestamp",
data_cols=data_cols)
if geo_res in ["state", "hhs", "nation"]:
return data, geo_key
# Add parent state
data = add_parent_state(data, geo_res, geo_key)
return data, geo_key

def add_megacounties(data, smooth=False):
    """Append rest-of-state "megacounty" rows to a county-level report.

    For each age group, counties whose test volume falls below the reporting
    threshold are pooled by GeoMapper into one per-state megacounty (fips
    ending in "000"). The per-age-group megacounty frames are merged on
    (timestamp, fips), given a state_id, and concatenated onto the input.

    Parameters
    ----------
    data: pd.DataFrame
        County-level data with a "fips" column, a "timestamp" column, and a
        "<col>_<agegroup>" column for every col in DATA_COLS and every age
        group in AGE_GROUPS.
    smooth: bool
        Whether the report is for smoothed signals.
        # NOTE(review): smoothed signals use half the raw threshold
        # (MIN_OBS/2 vs MIN_OBS) — rationale was questioned in review;
        # confirm before relying on it.

    Returns
    -------
    pd.DataFrame
        The original rows followed by the megacounty rows.
    """
    # Explicit guard instead of `assert` so the check survives `python -O`.
    if "fips" not in data.columns:
        raise ValueError("add_megacounties expects county-level data with a 'fips' column")

    # For raw signals the threshold is MIN_OBS; for smoothed, MIN_OBS/2.
    threshold_visits = MIN_OBS / 2 if smooth else MIN_OBS

    mega_frames = []
    for agegroup in AGE_GROUPS:
        age_cols = [f"{col}_{agegroup}" for col in DATA_COLS]
        pooled = GMPR.fips_to_megacounty(
            data[age_cols + ["timestamp", "fips"]],
            threshold_visits, 1, fips_col="fips",
            thr_col=f"totalTest_{agegroup}",
            date_col="timestamp")
        pooled.rename({"megafips": "fips"}, axis=1, inplace=True)
        # Megacounty fips codes end in "000"; keep only those rows.
        mega_frames.append(pooled[pooled.fips.str.endswith("000")])

    # Combine one frame per age group into a single megacounty frame.
    mega_df = reduce(
        lambda left, right: pd.merge(left, right, on=["timestamp", "fips"]),
        mega_frames)
    # Attach the parent state's id to each megacounty row.
    mega_df = GMPR.add_geocode(mega_df, from_code="fips", new_code="state_id",
                               from_col="fips", new_col="state_id")

    return pd.concat([data, mega_df])

def add_parent_state(data, geo_res, geo_key):
"""
Expand Down
57 changes: 54 additions & 3 deletions quidel_covidtest/delphi_quidel_covidtest/pull.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@
import pandas as pd
import numpy as np

from .constants import AGE_GROUPS

def get_from_s3(start_date, end_date, bucket, logger):
"""
Get raw data from aws s3 bucket.
Expand Down Expand Up @@ -163,21 +165,21 @@ def preprocess_new_data(start_date, end_date, params, test_mode, logger):
overall_pos = df[df["OverallResult"] == "positive"].groupby(
by=["timestamp", "zip"],
as_index=False)['OverallResult'].count()
overall_pos["positiveTest"] = overall_pos["OverallResult"]
overall_pos["positiveTest_total"] = overall_pos["OverallResult"]
overall_pos.drop(labels="OverallResult", axis="columns", inplace=True)

# Compute overallTotal
overall_total = df.groupby(
by=["timestamp", "zip"],
as_index=False)['OverallResult'].count()
overall_total["totalTest"] = overall_total["OverallResult"]
overall_total["totalTest_total"] = overall_total["OverallResult"]
overall_total.drop(labels="OverallResult", axis="columns", inplace=True)

# Compute numUniqueDevices
numUniqueDevices = df.groupby(
by=["timestamp", "zip"],
as_index=False)["SofiaSerNum"].agg({"SofiaSerNum": "nunique"}).rename(
columns={"SofiaSerNum": "numUniqueDevices"}
columns={"SofiaSerNum": "numUniqueDevices_total"}
)

df_merged = overall_total.merge(
Expand All @@ -186,6 +188,55 @@ def preprocess_new_data(start_date, end_date, params, test_mode, logger):
overall_pos, on=["timestamp", "zip"], how="left"
).fillna(0).drop_duplicates()

# Compute Summary info for age groups
df["PatientAge"] = df["PatientAge"].fillna(-1)
df.loc[df["PatientAge"] == "<1", "PatientAge"] = 0.5
df.loc[df["PatientAge"] == ">85", "PatientAge"] = 100
df["PatientAge"] = df["PatientAge"] .astype(float)

# Should match the suffixes of signal names
df["label"] = None
df.loc[df["PatientAge"] < 5, "label"] = "age_0_4"
df.loc[((df["PatientAge"] >= 5)) & (df["PatientAge"] < 18), "label"] = "age_5_17"
df.loc[((df["PatientAge"] >= 18)) & (df["PatientAge"] < 50), "label"] = "age_18_49"
df.loc[((df["PatientAge"] >= 50)) & (df["PatientAge"] < 65), "label"] = "age_50_64"
df.loc[(df["PatientAge"] >= 65), "label"] = "age_65plus"
df.loc[df["PatientAge"] == -1, "label"] = "NA"

for agegroup in AGE_GROUPS[1:]: # Exclude total
if agegroup == "age_0_17":
ages = ["age_0_4", "age_5_17"]
else:
ages = [agegroup]
# Compute overallPositive
group_pos = df.loc[(df["OverallResult"] == "positive")
& (df["label"].isin(ages))].groupby(
by=["timestamp", "zip"],
as_index=False)['OverallResult'].count()
group_pos[f"positiveTest_{agegroup}"] = group_pos["OverallResult"]
group_pos.drop(labels="OverallResult", axis="columns", inplace=True)

# Compute overallTotal
group_total = df.loc[df["label"].isin(ages)].groupby(
by=["timestamp", "zip"],
as_index=False)['OverallResult'].count()
group_total[f"totalTest_{agegroup}"] = group_total["OverallResult"]
group_total.drop(labels="OverallResult", axis="columns", inplace=True)

# Compute numUniqueDevices
group_numUniqueDevices = df.loc[df["label"].isin(ages)].groupby(
by=["timestamp", "zip"],
as_index=False)["SofiaSerNum"].agg({"SofiaSerNum": "nunique"}).rename(
columns={"SofiaSerNum": f"numUniqueDevices_{agegroup}"}
)

df_merged = df_merged.merge(
group_numUniqueDevices, on=["timestamp", "zip"], how="left"
).merge(
group_pos, on=["timestamp", "zip"], how="left"
).merge(
group_total, on=["timestamp", "zip"], how="left"
).fillna(0).drop_duplicates()

return df_merged, time_flag

Expand Down
Loading