
Commit 2c8f5cb

Merge pull request #706 from cmu-delphi/fix_quidel_covidtest_newformat_off_main
Fix quidel covidtest newformat (off main)
2 parents: e803593 + 2790493 · commit 2c8f5cb

5 files changed: +195 additions, -81 deletions


quidel_covidtest/delphi_quidel_covidtest/data_tools.py

Lines changed: 21 additions & 8 deletions
@@ -70,11 +70,12 @@ def _slide_window_sum(arr, k):
 
 def _geographical_pooling(tpooled_tests, tpooled_ptests, min_obs):
     """
-    Calculate proportion of parent samples (tests) that must be "borrowed" in order to compute the statistic.
+    Determine how many samples from the parent geography must be borrowed.
 
-    If there are no samples available in the parent, the borrow_prop is 0. If the parent does not
-    have enough samples, we return a borrow_prop of 1, and the fact that the
-    pooled samples are insufficient are handled in the statistic fitting step.
+    If there are no samples available in the parent, the borrow_prop is 0. If
+    the parent does not have enough samples, we return a borrow_prop of 1, and
+    the fact that the pooled samples are insufficient is handled in the
+    statistic fitting step.
 
     Args:
         tpooled_tests: np.ndarray[float]
@@ -88,6 +89,7 @@ def _geographical_pooling(tpooled_tests, tpooled_ptests, min_obs):
     Returns:
         np.ndarray[float]
             Same length as tests; proportion of parent observations to borrow.
+
     """
     if (np.any(np.isnan(tpooled_tests)) or np.any(np.isnan(tpooled_ptests))):
         print(tpooled_tests)
@@ -117,7 +119,9 @@ def _geographical_pooling(tpooled_tests, tpooled_ptests, min_obs):
 
 def raw_positive_prop(positives, tests, min_obs):
     """
-    Calculate the proportion of positive tests for a single geographic location, without any temporal smoothing.
+    Calculate the proportion of positive tests without any temporal smoothing.
+
+    This calculation assumes a single geographic location.
 
     If on any day t, tests[t] < min_obs, then we report np.nan.
 
@@ -147,6 +151,7 @@ def raw_positive_prop(positives, tests, min_obs):
             Of the same length as above.
         np.ndarray
            Sample size used to compute estimates.
+
     """
     positives = positives.astype(float)
     tests = tests.astype(float)
@@ -170,7 +175,9 @@ def raw_positive_prop(positives, tests, min_obs):
 def smoothed_positive_prop(positives, tests, min_obs, pool_days,
                            parent_positives=None, parent_tests=None):
     """
-    Calculate the proportion of negative tests for a single geographic location, with temporal smoothing.
+    Calculate the proportion of negative tests, with temporal smoothing.
+
+    This calculation assumes a single geographic location.
 
     For a given day t, if sum(tests[(t-pool_days+1):(t+1)]) < min_obs, then we
     'borrow' min_obs - sum(tests[(t-pool_days+1):(t+1)]) observations from the
@@ -259,7 +266,9 @@ def smoothed_positive_prop(positives, tests, min_obs, pool_days,
 
 def raw_tests_per_device(devices, tests, min_obs):
     """
-    Calculate the tests per device for a single geographic location, without any temporal smoothing.
+    Calculate the tests per device, without temporal smoothing.
+
+    This calculation assumes a single geographic location.
 
     If on any day t, tests[t] < min_obs, then we report np.nan.
     The second and third returned np.ndarray are the standard errors,
@@ -282,6 +291,7 @@ def raw_tests_per_device(devices, tests, min_obs):
            Placeholder for standard errors
        np.ndarray
            Sample size used to compute estimates.
+
     """
     devices = devices.astype(float)
     tests = tests.astype(float)
@@ -302,7 +312,9 @@ def raw_tests_per_device(devices, tests, min_obs):
 def smoothed_tests_per_device(devices, tests, min_obs, pool_days,
                               parent_devices=None, parent_tests=None):
     """
-    Calculate the ratio of tests per device for a single geographic location, with temporal smoothing.
+    Calculate the ratio of tests per device, with temporal smoothing.
+
+    This calculation assumes a single geographic location.
 
     For a given day t, if sum(tests[(t-pool_days+1):(t+1)]) < min_obs, then we
     'borrow' min_obs - sum(tests[(t-pool_days+1):(t+1)]) observations from the
@@ -339,6 +351,7 @@ def smoothed_tests_per_device(devices, tests, min_obs, pool_days,
            Standard errors, currently uniformly np.nan (placeholder).
        np.ndarray
            Effective sample size (after temporal and geographic pooling).
+
     """
     devices = devices.astype(float)
     tests = tests.astype(float)
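
Editor's note: the reworded _geographical_pooling docstring describes the borrowing rule as follows: borrow nothing when the local pool already reaches min_obs, borrow a share of the parent's samples to cover the shortfall, return 1 when even the whole parent pool is insufficient, and return 0 when the parent has no samples. A minimal NumPy sketch of that documented behavior (illustration only; borrow_prop_sketch is a made-up name, not the function in data_tools.py, and the partial-borrowing case is inferred from the "proportion of parent observations to borrow" return description):

import numpy as np

def borrow_prop_sketch(tests, parent_tests, min_obs):
    """Sketch of the borrowing rule described in the docstring above.

    Not the delphi_quidel_covidtest implementation; for illustration only.
    """
    tests = np.asarray(tests, dtype=float)
    parent_tests = np.asarray(parent_tests, dtype=float)
    shortfall = np.maximum(min_obs - tests, 0)    # samples still needed per day
    borrow_prop = np.zeros_like(tests)            # a parent with no samples contributes 0
    has_parent = parent_tests > 0
    # Borrow the missing share from the parent, capped at 1 (everything it has).
    borrow_prop[has_parent] = np.minimum(shortfall[has_parent] / parent_tests[has_parent], 1)
    return borrow_prop

print(borrow_prop_sketch([10, 60, 5], [100, 100, 0], min_obs=50))  # borrows 40%, 0%, 0%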

quidel_covidtest/delphi_quidel_covidtest/pull.py

Lines changed: 69 additions & 69 deletions
@@ -1,60 +1,59 @@
 # -*- coding: utf-8 -*-
-"""Simply downloads email attachments.
-
-Uses this handy package: https://pypi.org/project/imap-tools/
-"""
-import io
+"""Collect and process Quidel export files."""
 from os.path import join
 import os
 from datetime import datetime, timedelta
+import boto3
 
 import pandas as pd
 import numpy as np
 
-from imap_tools import MailBox, A, AND
-
-def get_from_email(start_date, end_date, mail_server,
-                   account, sender, password):
+def get_from_s3(start_date, end_date, bucket):
     """
-    Get raw data from email account.
+    Get raw data from aws s3 bucket.
 
     Args:
         start_date: datetime.datetime
-            pull data from email received from the start date
+            pull data from file tagged with date on/after the start date
         end_date: datetime.datetime
-            pull data from email received on/before the end date
-        mail_server: str
-        account: str
-            email account to receive new data
-        sender: str
-            email account of the sender
-        password: str
-            password of the datadrop email
+            pull data from file tagged with date on/before the end date
+        bucket: s3.Bucket
+            the aws s3 bucket that stores quidel data
     output:
         df: pd.DataFrame
+        time_flag: datetime.datetime
     """
     time_flag = None
-    df = pd.DataFrame(columns=['SofiaSerNum', 'TestDate', 'Facility', 'City',
-                               'State', 'Zip', 'PatientAge', 'Result1', 'Result2',
-                               'OverallResult', 'County', 'FacilityType', 'Assay',
-                               'SCO1', 'SCO2', 'CLN', 'CSN', 'InstrType',
-                               'StorageDate', 'ResultId', 'SarsTestNumber'])
-    with MailBox(mail_server).login(account, password, 'INBOX') as mailbox:
-        for search_date in [start_date + timedelta(days=x)
-                            for x in range((end_date - start_date).days + 1)]:
-            for message in mailbox.fetch(A(AND(date=search_date.date(), from_=sender))):
-                for att in message.attachments:
-                    name = att.filename
-                    # Only consider covid tests
-                    if "Sars" not in name:
-                        continue
-                    print("Pulling data received on %s"%search_date.date())
-                    toread = io.BytesIO()
-                    toread.write(att.payload)
-                    toread.seek(0)  # reset the pointer
-                    newdf = pd.read_excel(toread)  # now read to dataframe
-                    df = df.append(newdf)
-                    time_flag = search_date
+    selected_columns = ['SofiaSerNum', 'TestDate', 'Facility', 'City',
+                        'State', 'Zip', 'PatientAge', 'Result1',
+                        'Result2', 'OverallResult', 'StorageDate',
+                        'fname']
+    df = pd.DataFrame(columns=selected_columns)
+    s3_files = {}
+    for obj in bucket.objects.all():
+        if "-sars" in obj.key:
+            date_string = obj.key.split("/")[1]
+            yy = int(date_string.split("_")[0])
+            mm = int(date_string.split("_")[1])
+            dd = int(date_string.split("_")[2])
+            received_date = datetime(yy, mm, dd)
+            s3_files[received_date] = obj.key
+
+    n_days = (end_date - start_date).days + 1
+    for search_date in [start_date + timedelta(days=x) for x in range(n_days)]:
+        if search_date in s3_files.keys():
+            # Avoid appending duplicate datasets
+            if s3_files[search_date] in set(df["fname"].values):
+                continue
+            print("Pulling data received on %s"%search_date.date())
+            obj = bucket.Object(key=s3_files[search_date])
+            newdf = pd.read_csv(obj.get()["Body"],
+                                parse_dates=["StorageDate", "TestDate"],
+                                low_memory=False)
+            newdf["fname"] = s3_files[search_date]
+            df = df.append(newdf[selected_columns])
+            assert set(df.columns) == set(selected_columns)
+            time_flag = search_date
     return df, time_flag
 
 def fix_zipcode(df):
@@ -99,41 +98,45 @@ def fix_date(df):
     df["timestamp"].values[mask] = df["StorageDate"].values[mask]
     return df
 
-def preprocess_new_data(start_date, end_date, mail_server, account,
-                        sender, password, test_mode):
+def preprocess_new_data(start_date, end_date, params, test_mode):
     """
-    Pull and pre-process Quidel Covid Test data from datadrop email.
+    Pull and pre-process Quidel Covid Test data.
 
     Drop unnecessary columns. Temporarily consider the positive rate
     sensor only which is related to number of total tests and number
     of positive tests.
 
     Args:
        start_date: datetime.datetime
-            pull data from email received from the start date
+            pull data from file tagged with date on/after start date
        end_date: datetime.datetime
-            pull data from email received on/before the end date
-        mail_server: str
-        account: str
-            email account to receive new data
-        sender: str
-            email account of the sender
-        password: str
-            password of the datadrop email
+            pull data from file tagged with date on/before the end date
+        params: dict
+            read from params.json
        test_mode: bool
-            pull raw data from email or not
+            pull raw data from s3 or not
    output:
        df: pd.DataFrame
        time_flag: datetime.date:
            the actual pull end date on which we successfully pull the data
    """
    if test_mode:
-        test_data_dir = "./test_data/test_data.xlsx"
-        df, time_flag = pd.read_excel(test_data_dir), datetime(2020, 8, 17)
+        test_data_dir = "./test_data/test_data.csv"
+        df, time_flag = pd.read_csv(
+            test_data_dir,
+            parse_dates=["StorageDate", "TestDate"]
+        ), datetime(2020, 8, 17)
    else:
-        # Get new data from email
-        df, time_flag = get_from_email(start_date, end_date, mail_server,
-                                       account, sender, password)
+        # connect aws s3 bucket
+        aws_access_key_id = params["aws_credentials"]["aws_access_key_id"]
+        aws_secret_access_key = params["aws_credentials"]["aws_secret_access_key"]
+        bucket_name = params["bucket_name"]
+
+        s3 = boto3.resource('s3', aws_access_key_id=aws_access_key_id,
+                            aws_secret_access_key=aws_secret_access_key)
+        bucket = s3.Bucket(bucket_name)
+        # Get new data from s3
+        df, time_flag = get_from_s3(start_date, end_date, bucket)
 
    # No new data can be pulled
    if time_flag is None:
@@ -187,8 +190,9 @@ def check_intermediate_file(cache_dir, pull_start_date):
     return None, pull_start_date
 
 def pull_quidel_covidtest(params):
-    """
-    Pull the quidel covid test data and ecide whether to combine the new data with stored historical records in ./cache.
+    """Pull the quidel covid test data.
+
+    Conditionally merge new data with historical data from ./cache.
 
     Parameters:
         params: dict
@@ -206,14 +210,10 @@ def pull_quidel_covidtest(params):
            the first date of the report
        datetime.datetime
            the last date of the report
+
    """
    cache_dir = params["cache_dir"]
 
-    mail_server = params["mail_server"]
-    account = params["account"]
-    password = params["password"]
-    sender = params["sender"]
-
    test_mode = (params["mode"] == "test")
 
    # pull new data only that has not been ingested
@@ -226,11 +226,10 @@
    else:
        pull_end_date = datetime.strptime(params["pull_end_date"], '%Y-%m-%d')
 
-    # Pull data from the email at 5 digit zipcode level
+    # Pull data from the file at 5 digit zipcode level
    # Use _end_date to check the most recent date that we received data
    df, _end_date = preprocess_new_data(
-        pull_start_date, pull_end_date, mail_server,
-        account, sender, password, test_mode)
+        pull_start_date, pull_end_date, params, test_mode)
 
    # Utilize previously stored data
    if previous_df is not None:
@@ -268,7 +267,7 @@ def check_export_end_date(input_export_end_date, _end_date,
 def check_export_start_date(export_start_date, export_end_date,
                             export_day_range):
     """
-    Update export_start_date according to the export_end_date so that it could be export_end_date - export_day_range.
+    Ensure that the start date, end date, and day range are mutually consistent.
 
     Parameters:
        export_start_date: str
@@ -281,6 +280,7 @@ def check_export_start_date(export_start_date, export_end_date,
     Returns:
        datetime.datetime
            export data until which date
+
    """
    if export_start_date == "":
        export_start_date = datetime(2020, 5, 26)
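
Editor's note: the new get_from_s3 keys each export by a date embedded in its S3 object key, taking the second path segment and reading an underscore-separated year, month, and day from it. A small standalone sketch of that parsing, with a hypothetical key layout (the real bucket prefix and file names are not shown in this diff):

from datetime import datetime

def parse_received_date(key):
    """Extract the year_month_day date that get_from_s3 reads from an S3 key.

    Mirrors the split logic in the diff above; the sample key below is an
    assumed layout, not a real object name from the Quidel bucket.
    """
    date_string = key.split("/")[1]  # the second path segment holds the date
    year, month, day = (int(part) for part in date_string.split("_")[:3])
    return datetime(year, month, day)

# get_from_s3 only considers keys containing "-sars" (covid tests).
print(parse_received_date("raw_export/2020_08_17_quidel-sars.csv"))  # 2020-08-17 00:00:00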

quidel_covidtest/params.json.template

Lines changed: 5 additions & 4 deletions
@@ -6,10 +6,11 @@
   "export_end_date": "",
   "pull_start_date": "2020-05-26",
   "pull_end_date":"",
-  "mail_server": "imap.exchange.andrew.cmu.edu",
-  "account": "[email protected]",
-  "password": "",
-  "sender": "",
+  "aws_credentials": {
+    "aws_access_key_id": "",
+    "aws_secret_access_key": ""
+  },
+  "bucket_name": "",
   "wip_signal": [""],
   "mode": ""
 }
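
Editor's note: the new aws_credentials and bucket_name fields map directly onto the boto3 setup added in preprocess_new_data above. A minimal sketch of loading a filled-in params.json and opening the bucket (the file path and the listing loop are illustrative; real credential values must be supplied by the operator):

import json

import boto3

# Load a filled-in copy of the template above (path is illustrative).
with open("params.json") as f:
    params = json.load(f)

s3 = boto3.resource(
    "s3",
    aws_access_key_id=params["aws_credentials"]["aws_access_key_id"],
    aws_secret_access_key=params["aws_credentials"]["aws_secret_access_key"],
)
bucket = s3.Bucket(params["bucket_name"])

# List the objects the pull step would consider (covid tests only).
for obj in bucket.objects.all():
    if "-sars" in obj.key:
        print(obj.key)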
