Skip to content

Commit 3637c8e

Browse files
Jingjing Tang
authored and committed
Solved the problems in pylint test
1 parent fd66b79 commit 3637c8e

File tree

3 files changed

+188
-73
lines changed

3 files changed

+188
-73
lines changed

quidel_covidtest/delphi_quidel_covidtest/pull.py

Lines changed: 143 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -2,23 +2,24 @@
22
"""Simply downloads email attachments.
33
Uses this handy package: https://pypi.org/project/imap-tools/
44
"""
5-
from datetime import datetime, timedelta, date
65
import io
6+
from os.path import join
77
import os
8+
from datetime import datetime, timedelta
89

910
import pandas as pd
1011
import numpy as np
1112

1213
from imap_tools import MailBox, A, AND
1314

14-
def get_from_email(start_date: datetime.date, end_date: datetime.date,
15-
mail_server: str, account: str, sender: str, password: str):
15+
def get_from_email(start_date, end_date, mail_server,
16+
account, sender, password):
1617
"""
1718
Get raw data from email account
1819
Args:
19-
start_date: datetime.date
20+
start_date: datetime.datetime
2021
pull data from email received from the start date
21-
end_date: datetime.date
22+
end_date: datetime.datetime
2223
pull data from email received on/before the end date
2324
mail_server: str
2425
account: str
@@ -39,13 +40,13 @@ def get_from_email(start_date: datetime.date, end_date: datetime.date,
3940
with MailBox(mail_server).login(account, password, 'INBOX') as mailbox:
4041
for search_date in [start_date + timedelta(days=x)
4142
for x in range((end_date - start_date).days + 1)]:
42-
for message in mailbox.fetch(A(AND(date=search_date, from_=sender))):
43+
for message in mailbox.fetch(A(AND(date=search_date.date(), from_=sender))):
4344
for att in message.attachments:
4445
name = att.filename
4546
# Only consider covid tests
4647
if "Sars" not in name:
4748
continue
48-
print("Pulling data received on %s"%search_date)
49+
print("Pulling data received on %s"%search_date.date())
4950
toread = io.BytesIO()
5051
toread.write(att.payload)
5152
toread.seek(0) # reset the pointer
@@ -90,18 +91,18 @@ def fix_date(df):
9091
df["timestamp"].values[mask] = df["StorageDate"].values[mask]
9192
return df
9293

93-
def pull_quidel_covidtest(start_date, end_date, mail_server, account,
94-
sender, password, test_mode):
94+
def preprocess_new_data(start_date, end_date, mail_server, account,
95+
sender, password, test_mode):
9596
"""
9697
Pull and pre-process Quidel Covid Test data from datadrop email.
9798
Drop unnecessary columns. Temporarily consider the positive rate
9899
sensor only which is related to number of total tests and number
99100
of positive tests.
100101
101102
Args:
102-
start_date: datetime.date
103+
start_date: datetime.datetime
103104
pull data from email received from the start date
104-
end_date: datetime.date
105+
end_date: datetime.datetime
105106
pull data from email received on/before the end date
106107
mail_server: str
107108
account: str
@@ -119,7 +120,7 @@ def pull_quidel_covidtest(start_date, end_date, mail_server, account,
119120
"""
120121
if test_mode:
121122
test_data_dir = "./test_data/test_data.xlsx"
122-
df, time_flag = pd.read_excel(test_data_dir), date(2020, 8, 17)
123+
df, time_flag = pd.read_excel(test_data_dir), datetime(2020, 8, 17)
123124
else:
124125
# Get new data from email
125126
df, time_flag = get_from_email(start_date, end_date, mail_server,
@@ -166,12 +167,140 @@ def pull_quidel_covidtest(start_date, end_date, mail_server, account,
166167
return df_merged, time_flag
167168

168169
def check_intermediate_file(cache_dir, pull_start_date):
    """
    Check whether there is a cache file containing historical data already

    Cache files are expected to be named pulled_until_YYYYMMDD.csv. Files
    containing ".csv" whose name does not follow that pattern are ignored
    instead of aborting the run. If several cache files are present, the one
    with the latest date in its name is used.

    Args:
        cache_dir: str
            directory where the cache file is stored
        pull_start_date: datetime.datetime
            start date to fall back on when no cache file is found

    Returns:
        DataFrame or None
            content of the cache file, None if there is no cache file
        datetime.datetime
            the day after the date in the cache file name, or the given
            pull_start_date unchanged if there is no cache file
    """
    latest_name, latest_date = None, None
    for filename in os.listdir(cache_dir):
        if ".csv" not in filename:
            continue
        try:
            pulled_until = datetime.strptime(
                filename.split("_")[2].split(".")[0], '%Y%m%d')
        except (IndexError, ValueError):
            # Not a pulled_until_YYYYMMDD.csv cache file
            continue
        if latest_date is None or pulled_until > latest_date:
            latest_name, latest_date = filename, pulled_until

    if latest_name is None:
        return None, pull_start_date

    previous_df = pd.read_csv(os.path.join(cache_dir, latest_name),
                              sep=",", parse_dates=["timestamp"])
    # Data up to latest_date is already cached; resume from the next day
    return previous_df, latest_date + timedelta(days=1)
181+
182+
def pull_quidel_covidtest(params):
    """
    Pull the quidel covid test data. Decide whether to combine the newly
    received data with stored historical records in ./cache

    Parameters:
        params: dict
            including all the information read from params.json. The keys
            used here are cache_dir, mail_server, account, password, sender,
            mode, pull_start_date and pull_end_date

    Returns:
        DataFrame:
            the pre-processed data returned by preprocess_new_data. If a
            cache file exists, its rows are combined with the new ones and
            summed per (timestamp, zip)
        datetime.datetime
            the second value returned by preprocess_new_data: the most
            recent date on which data was received
    """
    cache_dir = params["cache_dir"]

    mail_server = params["mail_server"]
    account = params["account"]
    password = params["password"]
    sender = params["sender"]

    test_mode = (params["mode"] == "test")

    # pull new data only that has not been ingested
    previous_df, pull_start_date = check_intermediate_file(
        cache_dir,
        datetime.strptime(params["pull_start_date"], '%Y-%m-%d'))

    # An empty pull_end_date means "pull everything up to now"
    if params["pull_end_date"] == "":
        pull_end_date = datetime.today()
    else:
        pull_end_date = datetime.strptime(params["pull_end_date"], '%Y-%m-%d')

    # Pull data from the email at 5 digit zipcode level
    # Use _end_date to check the most recent date that we received data
    df, _end_date = preprocess_new_data(
        pull_start_date, pull_end_date, mail_server,
        account, sender, password, test_mode)

    # Utilize previously stored data
    if previous_df is not None:
        # pd.concat replaces DataFrame.append, which was removed in pandas 2.0
        df = pd.concat([previous_df, df]).groupby(
            ["timestamp", "zip"]).sum().reset_index()
    return df, _end_date
233+
234+
def check_export_end_date(input_export_end_date, _end_date,
                          END_FROM_TODAY_MINUS):
    """
    Update the export_end_date according to the data received
    By default, set the export end date to be the last pulling date minus
    END_FROM_TODAY_MINUS days.
    Otherwise, use the required date if it is earlier than the default one.

    Parameters:
        input_export_end_date: str
            read from params, formatted as YYYY-MM-DD. An empty string (or
            None) means that no specific end date is required
        _end_date: datetime.datetime
            updated according to the data received
        END_FROM_TODAY_MINUS: int
            report data until - X days

    Returns:
        datetime.datetime
            export data until which date
    """
    export_end_date = _end_date - timedelta(days=END_FROM_TODAY_MINUS)
    # Nothing requested: fall back to the default end date
    if not input_export_end_date:
        return export_end_date
    requested_end_date = datetime.strptime(input_export_end_date, '%Y-%m-%d')
    # A requested date can only shorten the export window, never extend it
    return min(requested_end_date, export_end_date)
260+
261+
def check_export_start_date(export_start_date, export_end_date,
                            EXPORT_DAY_RANGE):
    """
    Update the export_start_date according to the export_end_date so that
    at most EXPORT_DAY_RANGE days are exported and nothing earlier than
    2020-05-26 is exported

    Parameters:
        export_start_date: str
            Read from params, formatted as YYYY-MM-DD. An empty string (or
            None) means starting from 2020-05-26
        export_end_date: datetime.datetime
            Calculated according to the data received
        EXPORT_DAY_RANGE: int
            Number of days to report

    Returns:
        datetime.datetime
            export data from which date
    """
    # Earliest date this function ever returns
    earliest_start_date = datetime(2020, 5, 26)

    if not export_start_date:
        export_start_date = earliest_start_date
    else:
        export_start_date = datetime.strptime(export_start_date, '%Y-%m-%d')

    # Only export the EXPORT_DAY_RANGE days before export_end_date
    if (export_end_date - export_start_date).days > EXPORT_DAY_RANGE:
        export_start_date = export_end_date - timedelta(days=EXPORT_DAY_RANGE)

    return max(export_start_date, earliest_start_date)
290+
291+
def update_cache_file(df, _end_date, cache_dir):
    """
    Update cache file. Export the new one, then remove the old one

    Parameters:
        df: pd.DataFrame
            Pre-process file at ZipCode level
        _end_date: datetime.datetime
            The most recent date when the raw data is received
        cache_dir: str
            ./cache where the cache file is stored
    """
    # Format only the file name so a "%" in cache_dir cannot break formatting
    new_name = "pulled_until_%s.csv" % _end_date.strftime("%Y%m%d")

    # Write the new cache first: if the export fails, the old cache survives
    df.to_csv(join(cache_dir, new_name), index=False)

    for fn in os.listdir(cache_dir):
        if ".csv" in fn and fn != new_name:
            os.remove(join(cache_dir, fn))

quidel_covidtest/delphi_quidel_covidtest/run.py

Lines changed: 14 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -4,14 +4,16 @@
44
This module should contain a function called `run_module`, that is executed
55
when the module is run with `python -m MODULE_NAME`.
66
"""
7-
from datetime import datetime, date, timedelta
87
from os.path import join
98

109
import pandas as pd
1110
from delphi_utils import read_params
1211

1312
from .geo_maps import (zip_to_msa, zip_to_hrr, zip_to_county, zip_to_state)
14-
from .pull import pull_quidel_covidtest, check_intermediate_file
13+
from .pull import (pull_quidel_covidtest,
14+
check_export_start_date,
15+
check_export_end_date,
16+
update_cache_file)
1517
from .export import export_csv
1618
from .generate_sensor import (generate_sensor_for_states,
1719
generate_sensor_for_other_geores)
@@ -28,57 +30,23 @@ def run_module():
2830
cache_dir = params["cache_dir"]
2931
export_dir = params["export_dir"]
3032
static_file_dir = params["static_file_dir"]
31-
32-
mail_server = params["mail_server"]
33-
account = params["account"]
34-
password = params["password"]
35-
sender = params["sender"]
36-
37-
test_mode = (params["mode"] == "test")
38-
39-
export_start_date = datetime.strptime(params["export_start_date"], '%Y-%m-%d')
40-
41-
# pull new data only that has not been ingested
42-
previous_df, pull_start_date = check_intermediate_file(
43-
cache_dir,
44-
datetime.strptime(params["pull_start_date"], '%Y-%m-%d').date())
45-
46-
if params["pull_end_date"] == "":
47-
pull_end_date = date.today()
48-
else:
49-
pull_end_date = datetime.strptime(params["pull_end_date"], '%Y-%m-%d').date()
50-
33+
export_start_date = params["export_start_date"]
34+
export_end_date = params["export_end_date"]
5135
map_df = pd.read_csv(
5236
join(static_file_dir, "fips_prop_pop.csv"), dtype={"fips": int}
5337
)
5438

55-
# Pull data from the email at 5 digit zipcode level
56-
# Use _end_date to check the most recent date that we received data
57-
df, _end_date = pull_quidel_covidtest(
58-
pull_start_date, pull_end_date, mail_server,
59-
account, sender, password, test_mode)
39+
# Pull data and update export date
40+
df, _end_date = pull_quidel_covidtest(params)
6041
if _end_date is None:
6142
print("The data is up-to-date. Currently, no new data to be ingested.")
6243
return
44+
export_end_date = check_export_end_date(export_end_date, _end_date,
45+
END_FROM_TODAY_MINUS)
46+
export_start_date = check_export_start_date(export_start_date,
47+
export_end_date, EXPORT_DAY_RANGE)
6348

64-
# Utilize previously stored data
65-
if previous_df is not None:
66-
df = previous_df.append(df).groupby(["timestamp", "zip"]).sum().reset_index()
67-
68-
# By default, set the export end date to be the last pulling date - 5 days
69-
export_end_date = _end_date - timedelta(days=END_FROM_TODAY_MINUS)
70-
if params["export_end_date"] != "":
71-
input_export_end_date = datetime.strptime(params["export_end_date"], '%Y-%m-%d').date()
72-
if input_export_end_date < export_end_date:
73-
export_end_date = input_export_end_date
74-
export_end_date = datetime(export_end_date.year, export_end_date.month, export_end_date.day)
75-
76-
# Only export data from -45 days to -5 days
77-
if (export_end_date - export_start_date).days > EXPORT_DAY_RANGE:
78-
export_start_date = export_end_date - timedelta(days=EXPORT_DAY_RANGE)
79-
80-
first_date = df["timestamp"].min()
81-
last_date = df["timestamp"].max()
49+
first_date, last_date = df["timestamp"].min(), df["timestamp"].max()
8250

8351
# State Level
8452
data = df.copy()
@@ -128,4 +96,4 @@ def run_module():
12896

12997
# Export the cache file if the pipeline runs successfully.
13098
# Otherwise, don't update the cache file
131-
df.to_csv(join(cache_dir, "pulled_until_%s.csv") % _end_date.strftime("%Y%m%d"), index=False)
99+
update_cache_file(df, _end_date, cache_dir)

quidel_covidtest/tests/test_pull.py

Lines changed: 31 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -9,9 +9,13 @@
99
fix_zipcode,
1010
fix_date,
1111
pull_quidel_covidtest,
12-
check_intermediate_file
12+
check_intermediate_file,
13+
check_export_end_date,
14+
check_export_start_date
1315
)
1416

17+
END_FROM_TODAY_MINUS = 5
18+
EXPORT_DAY_RANGE = 40
1519

1620
class TestFixData:
1721
def test_fix_zipcode(self):
@@ -36,18 +40,8 @@ class TestingPullData:
3640
def test_pull_quidel_covidtest(self):
3741

3842
params = read_params()
39-
mail_server = params["mail_server"]
40-
account = params["account"]
41-
password = params["password"]
42-
sender = params["sender"]
4343

44-
test_mode = (params["mode"] == "test")
45-
46-
pull_start_date = date(2020, 6, 10)
47-
pull_end_date = date(2020, 6, 12)
48-
49-
df, _ = pull_quidel_covidtest(pull_start_date, pull_end_date, mail_server,
50-
account, sender, password, test_mode)
44+
df, _ = pull_quidel_covidtest(params)
5145

5246
first_date = df["timestamp"].min().date()
5347
last_date = df["timestamp"].max().date()
@@ -67,4 +61,28 @@ def test_check_intermediate_file(self):
6761

6862
previous_df, pull_start_date = check_intermediate_file("./cache/test_cache_without_file", None)
6963
assert previous_df is None
70-
assert pull_start_date is None
64+
assert pull_start_date is None
65+
66+
def test_check_export_end_date(self):
    """
    The export end date is the last pulling date minus END_FROM_TODAY_MINUS
    days unless an earlier date is requested
    """
    _end_date = datetime(2020, 7, 7)
    requested_dates = ["", "2020-07-07", "2020-06-15"]
    expected = [datetime(2020, 7, 2), datetime(2020, 7, 2), datetime(2020, 6,15)]

    tested = [check_export_end_date(requested, _end_date, END_FROM_TODAY_MINUS)
              for requested in requested_dates]

    assert tested == expected
77+
78+
def test_check_export_start_date(self):
    """
    The export start date is kept when it is inside the export range and
    never ends up earlier than 2020-05-26
    """
    export_end_date = datetime(2020, 7, 2)
    requested_dates = ["", "2020-06-20", "2020-04-20"]
    expected = [datetime(2020, 5, 26), datetime(2020, 6, 20), datetime(2020, 5, 26)]

    tested = [check_export_start_date(requested, export_end_date, EXPORT_DAY_RANGE)
              for requested in requested_dates]

    assert tested == expected

0 commit comments

Comments
 (0)