Commit 2e5cb61

Merge pull request #1495 from cmu-delphi/dsew_cp_vaccination
Dsew vaccination
2 parents 8c5a845 + f2f778b commit 2e5cb61

6 files changed, +211 -53 lines changed

ansible/templates/dsew_community_profile-params-prod.json.j2

Lines changed: 3 additions & 1 deletion
@@ -28,7 +28,9 @@
         "naats_total_7dav",
         "naats_positivity_7dav",
         "confirmed_admissions_covid_1d_prop_7dav",
-        "confirmed_admissions_covid_1d_7dav"
+        "confirmed_admissions_covid_1d_7dav",
+        "doses_admin_7dav",
+        "booster_doses_admin_7dav"
     ]
   }
 }
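As a quick sanity check (not part of the commit), the rendered production params should now mention both new signals. The exact key path inside the JSON is not shown in this hunk, so the sketch below just scans the whole rendered document; "params.json" is a hypothetical rendered output of the .j2 template above.

import json

with open("params.json") as f:       # hypothetical rendered output of the Jinja template
    rendered = f.read()
json.loads(rendered)                  # confirm the rendered template is still valid JSON
for sig in ("doses_admin_7dav", "booster_doses_admin_7dav"):
    assert sig in rendered, f"{sig} missing from rendered params"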

dsew_community_profile/delphi_dsew_community_profile/constants.py

Lines changed: 32 additions & 4 deletions
@@ -51,22 +51,50 @@ class Transform:
     "total": {
         "is_rate" : False,
         "api_name": "naats_total_7dav",
-        "make_prop": False
+        "make_prop": False,
+        "cumulative" : False
     },
     "positivity": {
         "is_rate" : True,
         "api_name": "naats_positivity_7dav",
-        "make_prop": False
+        "make_prop": False,
+        "cumulative" : False
     },
     "confirmed covid-19 admissions": {
         "is_rate" : False,
         "api_name": "confirmed_admissions_covid_1d_7dav",
         "make_prop": True,
-        "api_prop_name": "confirmed_admissions_covid_1d_prop_7dav"
+        "api_prop_name": "confirmed_admissions_covid_1d_prop_7dav",
+        "cumulative" : False
+    },
+    "fully vaccinated": {
+        "is_rate" : False,
+        "api_name": "people_full_vaccinated",
+        "make_prop": False,
+        "cumulative" : True
+    },
+    "booster dose since": {
+        "is_rate" : False,
+        "api_name": "people_booster_doses",
+        "make_prop": False,
+        "cumulative" : True
+    },
+    "booster doses administered": {
+        "is_rate" : False,
+        "api_name": "booster_doses_admin_7dav",
+        "make_prop": False,
+        "cumulative" : False
+    },
+    "doses administered": {
+        "is_rate" : False,
+        "api_name": "doses_admin_7dav",
+        "make_prop": False,
+        "cumulative" : False
     }
 }

-COUNTS_7D_SIGNALS = {key for key, value in SIGNALS.items() if not value["is_rate"]}
+COUNTS_7D_SIGNALS = {key for key, value in SIGNALS.items() \
+    if not((value["is_rate"]) or (value["cumulative"]))}

 def make_signal_name(key, is_prop=False):
     """Convert a signal key to the corresponding signal name for the API.

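The new "cumulative" flag matters for the divide-by-7 step later in pull.py: only true 7-day counts should be averaged. A minimal standalone sketch of the amended COUNTS_7D_SIGNALS rule, using an abbreviated stand-in for the SIGNALS dict above:

# Abbreviated stand-in for SIGNALS, for illustration only.
SIGNALS = {
    "total":              {"is_rate": False, "cumulative": False},
    "positivity":         {"is_rate": True,  "cumulative": False},
    "fully vaccinated":   {"is_rate": False, "cumulative": True},
    "doses administered": {"is_rate": False, "cumulative": False},
}

COUNTS_7D_SIGNALS = {key for key, value in SIGNALS.items()
                     if not (value["is_rate"] or value["cumulative"])}

print(COUNTS_7D_SIGNALS)  # {'total', 'doses administered'}: rates and cumulative totals are excluded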
dsew_community_profile/delphi_dsew_community_profile/pull.py

Lines changed: 145 additions & 34 deletions
@@ -32,6 +32,16 @@
     rf'HOSPITAL UTILIZATION: (.*) WEEK \({DATE_RANGE_EXP}\)'
 )

+# example: "COVID-19 VACCINATION DATA: LAST WEEK (January 5-11)"
+RE_DATE_FROM_VAC_HEADER_WEEK= re.compile(
+    rf'COVID-19 VACCINATION DATA: (.*) WEEK \({DATE_RANGE_EXP}\)'
+)
+
+# example: 'COVID-19 VACCINATION DATA: CUMULATIVE (January 11)'
+RE_DATE_FROM_VAC_HEADER_CUMULATIVE= re.compile(
+    rf'COVID-19 VACCINATION DATA: CUMULATIVE (.*)\({DATE_EXP}\)'
+)
+
 # example: "NAAT positivity rate - last 7 days (may be an underestimate due to delayed reporting)"
 # example: "Total NAATs - last 7 days (may be an underestimate due to delayed reporting)"
 RE_COLUMN_FROM_HEADER = re.compile('- (.*) 7 days')
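A rough illustration of what the two new overheader patterns match. DATE_EXP and DATE_RANGE_EXP are defined earlier in pull.py and are not part of this diff, so the simplified stand-ins below are assumptions for the example only:

import re

# Simplified stand-ins for DATE_EXP / DATE_RANGE_EXP (assumptions, not the real patterns).
DATE_EXP = r'(\w+) (\d{1,2})'                              # e.g. "January 11"
DATE_RANGE_EXP = r'(\w+) (\d{1,2})-(?:(\w+) )?(\d{1,2})'   # e.g. "January 5-11"

RE_WEEK = re.compile(rf'COVID-19 VACCINATION DATA: (.*) WEEK \({DATE_RANGE_EXP}\)')
RE_CUMULATIVE = re.compile(rf'COVID-19 VACCINATION DATA: CUMULATIVE (.*)\({DATE_EXP}\)')

week = RE_WEEK.match("COVID-19 VACCINATION DATA: LAST WEEK (January 5-11)")
print(week.group(1))             # 'LAST'
cumulative = RE_CUMULATIVE.match("COVID-19 VACCINATION DATA: CUMULATIVE (January 11)")
print(cumulative.groups()[-2:])  # ('January', '11')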
@@ -44,15 +54,27 @@ class DatasetTimes:
     positivity_reference_date: datetime.date
     total_reference_date: datetime.date
     hosp_reference_date: datetime.date
+    vac_reference_date: datetime.date
+    cumulative_vac_reference_date: datetime.date

     @staticmethod
     def from_header(header, publish_date):
         """Convert reference dates in overheader to DatasetTimes."""
-        def as_date(sub_result):
-            month = sub_result[2] if sub_result[2] else sub_result[0]
-            assert month, f"Bad month in header: {header}\nsub_result: {sub_result}"
-            month_numeric = datetime.datetime.strptime(month, "%B").month
-            day = sub_result[3]
+        positivity_reference_date = None
+        total_reference_date = None
+        hosp_reference_date = None
+        vac_reference_date = None
+        cumulative_vac_reference_date= None
+        def as_date(sub_result, is_single_date):
+            if is_single_date:
+                month = sub_result[0]
+                day = sub_result[1]
+                month_numeric = datetime.datetime.strptime(month, "%B").month
+            else:
+                month = sub_result[2] if sub_result[2] else sub_result[0]
+                assert month, f"Bad month in header: {header}\nsub_result: {sub_result}"
+                month_numeric = datetime.datetime.strptime(month, "%B").month
+                day = sub_result[3]
             year = publish_date.year
             # year boundary
             if month_numeric > publish_date.month:
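The year-boundary handling above assigns the publish year to a header date unless the header's month is later than the publish month, which indicates data from the previous calendar year (for example, a December reference range in a January report). A simplified, self-contained sketch of that rule, not the indicator's own helper:

import datetime

def reference_date(month_name, day, publish_date):
    """Simplified stand-in for the year-boundary logic in as_date above."""
    month_numeric = datetime.datetime.strptime(month_name, "%B").month
    year = publish_date.year
    if month_numeric > publish_date.month:  # e.g. December data in a January report
        year -= 1
    return datetime.date(year, month_numeric, int(day))

print(reference_date("December", 28, datetime.date(2022, 1, 5)))   # 2021-12-28
print(reference_date("January", 11, datetime.date(2022, 1, 14)))   # 2022-01-11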
@@ -62,51 +84,64 @@ def as_date(sub_result):
         if RE_DATE_FROM_TEST_HEADER.match(header):
             findall_result = RE_DATE_FROM_TEST_HEADER.findall(header)[0]
             column = findall_result[0].lower()
-            positivity_reference_date = as_date(findall_result[1:5])
+            positivity_reference_date = as_date(findall_result[1:5], False)
             if findall_result[6]:
                 # Reports published starting 2021-03-17 specify different reference
                 # dates for positivity and total test volume
-                total_reference_date = as_date(findall_result[6:10])
+                total_reference_date = as_date(findall_result[6:10], False)
             else:
                 total_reference_date = positivity_reference_date
-
-            hosp_reference_date = None
         elif RE_DATE_FROM_HOSP_HEADER.match(header):
             findall_result = RE_DATE_FROM_HOSP_HEADER.findall(header)[0]
             column = findall_result[0].lower()
-            hosp_reference_date = as_date(findall_result[1:5])
-
-            total_reference_date = None
-            positivity_reference_date = None
+            hosp_reference_date = as_date(findall_result[1:5], False)
+        elif RE_DATE_FROM_VAC_HEADER_WEEK.match(header):
+            findall_result = RE_DATE_FROM_VAC_HEADER_WEEK.findall(header)[0]
+            column = findall_result[0].lower()
+            vac_reference_date = as_date(findall_result[1:5], False)
+        elif RE_DATE_FROM_VAC_HEADER_CUMULATIVE.match(header):
+            findall_result = RE_DATE_FROM_VAC_HEADER_CUMULATIVE.findall(header)[0]
+            column = findall_result[0].lower()
+            cumulative_vac_reference_date = as_date(findall_result[1:], True)
         else:
             raise ValueError(f"Couldn't find reference date in header '{header}'")
-
         return DatasetTimes(column, positivity_reference_date,
-                            total_reference_date, hosp_reference_date)
+                            total_reference_date, hosp_reference_date,
+                            cumulative_vac_reference_date, vac_reference_date)
     def __getitem__(self, key):
         """Use DatasetTimes like a dictionary."""
+        ref_list = list(SIGNALS.keys())
         if key.lower()=="positivity":
             return self.positivity_reference_date
         if key.lower()=="total":
             return self.total_reference_date
         if key.lower()=="confirmed covid-19 admissions":
             return self.hosp_reference_date
+        if key.lower() in ["doses administered","booster doses administered"]:
+            return self.cumulative_vac_reference_date
+        if key.lower() in ["fully vaccinated","booster dose since"]:
+            return self.vac_reference_date
         raise ValueError(
             f"Bad reference date type request '{key}'; " + \
-            "need 'total', 'positivity', or 'confirmed covid-19 admissions'"
+            "need one of: " + " ,".join(ref_list)
         )
     def __setitem__(self, key, newvalue):
         """Use DatasetTimes like a dictionary."""
+        ref_list = list(SIGNALS.keys())
         if key.lower()=="positivity":
             self.positivity_reference_date = newvalue
         if key.lower()=="total":
             self.total_reference_date = newvalue
         if key.lower()=="confirmed covid-19 admissions":
             self.hosp_reference_date = newvalue
-        else:
+        if key.lower() in ["doses administered","booster doses administered"]:
+            self.cumulative_vac_reference_date = newvalue
+        if key.lower() in ["fully vaccinated","booster dose since"]:
+            self.vac_reference_date = newvalue
+        if key.lower() not in ref_list:
             raise ValueError(
                 f"Bad reference date type request '{key}'; " + \
-                "need 'total', 'positivity', or 'confirmed covid-19 admissions'"
+                "need one of: " + " ,".join(ref_list)
             )
     def __eq__(self, other):
         """Check equality by value."""
@@ -164,14 +199,21 @@ def skip_overheader(header):
         # include "VIRAL (RT-PCR) LAB TESTING: [LAST|PREVIOUS] WEEK (August 24-30, ..."
         # include "HOSPITAL UTILIZATION: LAST WEEK (January 2-8)"
         return not (isinstance(header, str) and \
-                    (header.startswith("TESTING:") or \
+                    (((header.startswith("TESTING:") or \
                      header.startswith("VIRAL (RT-PCR) LAB TESTING:") or \
-                     header.startswith("HOSPITAL UTILIZATION:")) and \
+                     header.startswith("HOSPITAL UTILIZATION: ")) and \
                     # exclude "TESTING: % CHANGE FROM PREVIOUS WEEK" \
                     # exclude "TESTING: DEMOGRAPHIC DATA" \
                     # exclude "HOSPITAL UTILIZATION: CHANGE FROM PREVIOUS WEEK" \
                     # exclude "HOSPITAL UTILIZATION: DEMOGRAPHIC DATA" \
-                    header.find("WEEK (") > 0)
+                    header.find("WEEK (") > 0) or \
+                    # include "COVID-19 VACCINATION DATA: CUMULATIVE (January 25)"
+                    # include "COVID-19 VACCINATION DATA: LAST WEEK (January 25-31)"
+                    (header.startswith("COVID-19 VACCINATION DATA: CUMULATIVE") or
+                     header.startswith("COVID-19 VACCINATION DATA: LAST WEEK") \
+                    )))
+
+
     def _parse_times_for_sheet(self, sheet):
         """Record reference dates for this sheet."""
         # grab reference dates from overheaders
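To make the include/exclude comments above concrete, here is a simplified stand-in for the (negated) skip_overheader rule, run against a few sample overheaders; it illustrates the intended behavior and is not the method itself:

def keep_overheader(header):
    """Simplified stand-in for 'not skip_overheader(header)' as amended above."""
    weekly = (header.startswith(("TESTING:", "VIRAL (RT-PCR) LAB TESTING:",
                                 "HOSPITAL UTILIZATION: "))
              and "WEEK (" in header)
    vaccination = header.startswith(("COVID-19 VACCINATION DATA: CUMULATIVE",
                                     "COVID-19 VACCINATION DATA: LAST WEEK"))
    return weekly or vaccination

for h in ["HOSPITAL UTILIZATION: LAST WEEK (January 2-8)",
          "COVID-19 VACCINATION DATA: CUMULATIVE (January 25)",
          "TESTING: % CHANGE FROM PREVIOUS WEEK"]:
    print(keep_overheader(h), h)
# True  HOSPITAL UTILIZATION: LAST WEEK (January 2-8)
# True  COVID-19 VACCINATION DATA: CUMULATIVE (January 25)
# False TESTING: % CHANGE FROM PREVIOUS WEEK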
@@ -198,21 +240,32 @@ def _parse_times_for_sheet(self, sheet):
                 self.times[dt.column][sig] = dt[sig]
             else:
                 self.times[dt.column] = dt
-        assert len(self.times) == 2, \
-            f"No times extracted from overheaders:\n{NEWLINE.join(str(s) for s in overheaders)}"
+
+        if self.publish_date <= datetime.date(2021, 1, 11):
+            # No vaccination data available, so we only have hospitalization and testing overheaders
+            assert len(self.times) == 2, \
+                f"No times extracted from overheaders:\n{NEWLINE.join(str(s) for s in overheaders)}"
+        else:
+            assert len(self.times) == 3, \
+                f"No times extracted from overheaders:\n{NEWLINE.join(str(s) for s in overheaders)}"

     @staticmethod
     def retain_header(header):
         """Ignore irrelevant headers."""
-        return all([
+        return ((all([
             # include "Total NAATs - [last|previous] 7 days ..."
             # include "Total RT-PCR diagnostic tests - [last|previous] 7 days ..."
             # include "NAAT positivity rate - [last|previous] 7 days ..."
             # include "Viral (RT-PCR) lab test positivity rate - [last|previous] 7 days ..."
+            # include "Booster doses administered - [last|previous] 7 days ..."
+            # include "Doses administered - [last|previous] 7 days ..."
             (header.startswith("Total NAATs") or
              header.startswith("NAAT positivity rate") or
              header.startswith("Total RT-PCR") or
-             header.startswith("Viral (RT-PCR)")),
+             header.startswith("Viral (RT-PCR)") or
+             header.startswith("Booster") or
+             header.startswith("Doses administered -")
+             ),
             # exclude "NAAT positivity rate - absolute change ..."
             header.find("7 days") > 0,
             # exclude "NAAT positivity rate - last 7 days - ages <5"
@@ -227,7 +280,25 @@ def retain_header(header):
             header.find(" age") < 0,
             # exclude "Confirmed COVID-19 admissions per 100 inpatient beds - last 7 days"
             header.find(" beds") < 0,
-        ])
+        ])) or (all([
+            # include "People who are fully vaccinated"
+            # include "People who have received a booster dose since August 13, 2021"
+            header.startswith("People who"),
+            # exclude "People who are fully vaccinated as % of total population"
+            # exclude "People who have received a booster dose as % of fully vaccinated population"
+            header.find("%") < 0,
+            # exclude "People who are fully vaccinated - ages 5-11" ...
+            # exclude "People who have received a booster dose - ages 65+" ...
+            header.find(" age") < 0,
+            # exclude "People who are fully vaccinated - 12-17" ...
+            header.find("-") < 0,
+
+        ]) or all([
+            # include "People with full course administered"
+            header.startswith("People with full course"),
+            # exclude "People with full course administered as % of adult population"
+            header.find("%") < 0,
+        ])))
     def _parse_sheet(self, sheet):
         """Extract data frame for this sheet."""
         df = pd.read_excel(
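A simplified stand-in for the new vaccination branch of retain_header, run against headers taken from the include/exclude comments above (illustration only, not the indicator's code):

def keep_vaccination_column(header):
    """Simplified sketch of the new 'People who ...' branch of retain_header."""
    return (header.startswith("People who")
            and "%" not in header
            and " age" not in header
            and "-" not in header) or (
            header.startswith("People with full course")
            and "%" not in header)

print(keep_vaccination_column("People who are fully vaccinated"))                          # True
print(keep_vaccination_column("People who are fully vaccinated as % of total population")) # False
print(keep_vaccination_column("People who have received a booster dose - ages 65+"))       # False
print(keep_vaccination_column("People with full course administered"))                     # True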
@@ -238,24 +309,68 @@ def _parse_sheet(self, sheet):
         )
         if sheet.row_filter:
             df = df.loc[sheet.row_filter(df)]
+
+
+        def select_fn(h):
+            """Allow for default to the 7-day in the name of the dataframe column."""
+            try:
+                return (RE_COLUMN_FROM_HEADER.findall(h)[0], h, h.lower())
+            except IndexError:
+                return ("", h, h.lower())
+
         select = [
-            (RE_COLUMN_FROM_HEADER.findall(h)[0], h, h.lower())
+            select_fn(h)
             for h in list(df.columns)
             if self.retain_header(h)
         ]

         for sig in SIGNALS:
+            ## Check if field is known to be missing
             # Hospital admissions not available at the county or CBSA level prior to Jan 8, 2021.
-            if (sheet.level == "msa" or sheet.level == "county") \
+            is_hosp_adm_before_jan8 = (sheet.level == "msa" or sheet.level == "county") \
                 and self.publish_date < datetime.date(2021, 1, 8) \
-                and sig == "confirmed covid-19 admissions":
+                and sig == "confirmed covid-19 admissions"
+            # Booster data not available before November 1 2021.
+            is_booster_before_nov1 = self.publish_date < datetime.date(2021, 11, 1) \
+                and (sig in ["booster dose since", "booster doses administered"])
+            # Booster and weekly doses administered not available below the state level.
+            is_booster_below_state = ((sheet.level != "hhs" and sheet.level != "state") \
+                and (sig in ["doses administered", \
+                "booster doses administered", "booster dose since"]))
+            # Weekly doses administered not available on or before Apr 29, 2021.
+            is_dose_admin_apr29 = self.publish_date <= datetime.date(2021, 4, 29) \
+                and sig == "doses administered"
+            # People fully vaccinated not available on or before Apr 11, 2021 at the CBSA level.
+            is_fully_vax_msa_before_apr11 = (sheet.level == "msa" or sheet.level == "county") \
+                and self.publish_date <= datetime.date(2021, 4, 11) \
+                and sig == "fully vaccinated"
+            # People fully vaccinated not available before Jan 15, 2021 at any geo level.
+            is_fully_vax_before_jan14 = self.publish_date <= datetime.date(2021, 1, 14) \
+                and sig == "fully vaccinated"
+
+            if any([is_hosp_adm_before_jan8,
+                    is_booster_before_nov1,
+                    is_booster_below_state,
+                    is_dose_admin_apr29,
+                    is_fully_vax_msa_before_apr11,
+                    is_fully_vax_before_jan14
+                    ]):
                 self.dfs[(sheet.level, sig, NOT_PROP)] = pd.DataFrame(
                     columns = ["geo_id", "timestamp", "val", \
                         "se", "sample_size", "publish_date"]
                 )
                 continue

             sig_select = [s for s in select if s[-1].find(sig) >= 0]
+            # The name of the cumulative vaccination was changed after 03/09/2021
+            # when J&J vaccines were added.
+            if (sig == "fully vaccinated") and (len(sig_select)==0):
+                sig_select = [s for s in select if s[-1].find("people with full course") >= 0]
+            # Since "doses administered" is a substring of another desired header,
+            # "booster doses administered", we need to more strictly check if "doses administered"
+            # occurs at the beginning of a header to find the correct match.
+            if sig == "doses administered":
+                sig_select = [s for s in select if s[-1].startswith(sig)]
             assert len(sig_select) > 0, \
                 f"No {sig} in any of {select}\n\nAll headers:\n{NEWLINE.join(list(df.columns))}"

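A toy illustration of why "doses administered" switches from a substring search to a prefix match: a plain find() would also pick up the booster column. The tuples mimic the (short name, header, lowercased header) entries built by select_fn above.

select = [
    ("last", "Doses administered - last 7 days", "doses administered - last 7 days"),
    ("last", "Booster doses administered - last 7 days", "booster doses administered - last 7 days"),
]

substring_match = [s for s in select if s[-1].find("doses administered") >= 0]
prefix_match = [s for s in select if s[-1].startswith("doses administered")]
print(len(substring_match))  # 2 -- ambiguous: both columns match
print(len(prefix_match))     # 1 -- only the intended column matches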
@@ -270,11 +385,10 @@ def _parse_sheet(self, sheet):
                 })
                 for si in sig_select
             ])
-
         for sig in COUNTS_7D_SIGNALS:
+            assert (sheet.level, sig, NOT_PROP) in self.dfs.keys()
             self.dfs[(sheet.level, sig, NOT_PROP)]["val"] /= 7 # 7-day total -> 7-day average

-
 def as_cached_filename(params, config):
     """Formulate a filename to uniquely identify this report in the input cache."""
     # eg "Community Profile Report 20220128.xlsx"
@@ -299,7 +413,6 @@ def fetch_listing(params):
         )
         for el in listing if el['filename'].endswith("xlsx")
     ]
-
     if params['indicator']['reports'] == 'new':
         # drop files we already have in the input cache
         listing = [el for el in listing if not os.path.exists(el['cached_filename'])]
@@ -364,7 +477,6 @@ def fetch_new_reports(params, logger=None):

     # download and parse individual reports
     datasets = download_and_parse(listing, logger)
-
     # collect like signals together, keeping most recent publish date
     ret = {}
     for sig, lst in datasets.items():
@@ -381,7 +493,6 @@ def fetch_new_reports(params, logger=None):

         if len(latest_sig_df.index) > 0:
             latest_sig_df = latest_sig_df.reset_index(drop=True)
-
             assert all(latest_sig_df.groupby(
                 ["timestamp", "geo_id"]
             ).size(
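A minimal sketch of the duplicate check being asserted above: after keeping the most recent publish date, each (timestamp, geo_id) pair should appear exactly once.

import pandas as pd

latest_sig_df = pd.DataFrame({
    "timestamp": ["2022-01-11", "2022-01-11"],
    "geo_id": ["pa", "ny"],
    "val": [1.0, 2.0],
})
# Passes because every (timestamp, geo_id) combination occurs exactly once.
assert all(latest_sig_df.groupby(["timestamp", "geo_id"]).size() == 1)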

dsew_community_profile/input_cache/.gitignore

Lines changed: 0 additions & 1 deletion
This file was deleted.
