    rf'HOSPITAL UTILIZATION: (.*) WEEK \({DATE_RANGE_EXP}\)'
)

+# example: "COVID-19 VACCINATION DATA: LAST WEEK (January 5-11)"
+RE_DATE_FROM_VAC_HEADER_WEEK = re.compile(
+    rf'COVID-19 VACCINATION DATA: (.*) WEEK \({DATE_RANGE_EXP}\)'
+)
+
+# example: 'COVID-19 VACCINATION DATA: CUMULATIVE (January 11)'
+RE_DATE_FROM_VAC_HEADER_CUMULATIVE = re.compile(
+    rf'COVID-19 VACCINATION DATA: CUMULATIVE (.*)\({DATE_EXP}\)'
+)
+
# example: "NAAT positivity rate - last 7 days (may be an underestimate due to delayed reporting)"
# example: "Total NAATs - last 7 days (may be an underestimate due to delayed reporting)"
RE_COLUMN_FROM_HEADER = re.compile('- (.*) 7 days')
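
An illustrative aside (not part of the commit): a minimal sketch of how the two new vaccination-header patterns are meant to behave, with shortened names. DATE_RANGE_EXP and DATE_EXP are defined elsewhere in this module, so the simplified stand-ins below are assumptions, not the real expressions.

import re

# Assumed stand-ins for the module's real date expressions:
DATE_EXP = r'(\w*) (\d{1,2})'                           # e.g. "January 11"
DATE_RANGE_EXP = rf'{DATE_EXP}-(?:(\w*) )?(\d{{1,2}})'  # e.g. "January 5-11"

RE_WEEK = re.compile(rf'COVID-19 VACCINATION DATA: (.*) WEEK \({DATE_RANGE_EXP}\)')
RE_CUMULATIVE = re.compile(rf'COVID-19 VACCINATION DATA: CUMULATIVE (.*)\({DATE_EXP}\)')

RE_WEEK.findall("COVID-19 VACCINATION DATA: LAST WEEK (January 5-11)")
# -> [('LAST', 'January', '5', '', '11')]
RE_CUMULATIVE.findall("COVID-19 VACCINATION DATA: CUMULATIVE (January 11)")
# -> [('', 'January', '11')]
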
@@ -44,15 +54,27 @@ class DatasetTimes:
    positivity_reference_date: datetime.date
    total_reference_date: datetime.date
    hosp_reference_date: datetime.date
+    vac_reference_date: datetime.date
+    cumulative_vac_reference_date: datetime.date

    @staticmethod
    def from_header(header, publish_date):
        """Convert reference dates in overheader to DatasetTimes."""
-        def as_date(sub_result):
-            month = sub_result[2] if sub_result[2] else sub_result[0]
-            assert month, f"Bad month in header: {header}\nsub_result: {sub_result}"
-            month_numeric = datetime.datetime.strptime(month, "%B").month
-            day = sub_result[3]
+        positivity_reference_date = None
+        total_reference_date = None
+        hosp_reference_date = None
+        vac_reference_date = None
+        cumulative_vac_reference_date = None
+        def as_date(sub_result, is_single_date):
+            if is_single_date:
+                month = sub_result[0]
+                day = sub_result[1]
+                month_numeric = datetime.datetime.strptime(month, "%B").month
+            else:
+                month = sub_result[2] if sub_result[2] else sub_result[0]
+                assert month, f"Bad month in header: {header}\nsub_result: {sub_result}"
+                month_numeric = datetime.datetime.strptime(month, "%B").month
+                day = sub_result[3]
            year = publish_date.year
            # year boundary
            if month_numeric > publish_date.month:
@@ -62,51 +84,64 @@ def as_date(sub_result):
        if RE_DATE_FROM_TEST_HEADER.match(header):
            findall_result = RE_DATE_FROM_TEST_HEADER.findall(header)[0]
            column = findall_result[0].lower()
-            positivity_reference_date = as_date(findall_result[1:5])
+            positivity_reference_date = as_date(findall_result[1:5], False)
            if findall_result[6]:
                # Reports published starting 2021-03-17 specify different reference
                # dates for positivity and total test volume
-                total_reference_date = as_date(findall_result[6:10])
+                total_reference_date = as_date(findall_result[6:10], False)
            else:
                total_reference_date = positivity_reference_date
-
-            hosp_reference_date = None
        elif RE_DATE_FROM_HOSP_HEADER.match(header):
            findall_result = RE_DATE_FROM_HOSP_HEADER.findall(header)[0]
            column = findall_result[0].lower()
-            hosp_reference_date = as_date(findall_result[1:5])
-
-            total_reference_date = None
-            positivity_reference_date = None
+            hosp_reference_date = as_date(findall_result[1:5], False)
+        elif RE_DATE_FROM_VAC_HEADER_WEEK.match(header):
+            findall_result = RE_DATE_FROM_VAC_HEADER_WEEK.findall(header)[0]
+            column = findall_result[0].lower()
+            vac_reference_date = as_date(findall_result[1:5], False)
+        elif RE_DATE_FROM_VAC_HEADER_CUMULATIVE.match(header):
+            findall_result = RE_DATE_FROM_VAC_HEADER_CUMULATIVE.findall(header)[0]
+            column = findall_result[0].lower()
+            cumulative_vac_reference_date = as_date(findall_result[1:], True)
        else:
            raise ValueError(f"Couldn't find reference date in header '{header}'")
-
        return DatasetTimes(column, positivity_reference_date,
-                            total_reference_date, hosp_reference_date)
+                            total_reference_date, hosp_reference_date,
+                            cumulative_vac_reference_date, vac_reference_date)
    def __getitem__(self, key):
        """Use DatasetTimes like a dictionary."""
+        ref_list = list(SIGNALS.keys())
        if key.lower() == "positivity":
            return self.positivity_reference_date
        if key.lower() == "total":
            return self.total_reference_date
        if key.lower() == "confirmed covid-19 admissions":
            return self.hosp_reference_date
+        if key.lower() in ["doses administered", "booster doses administered"]:
+            return self.cumulative_vac_reference_date
+        if key.lower() in ["fully vaccinated", "booster dose since"]:
+            return self.vac_reference_date
        raise ValueError(
            f"Bad reference date type request '{key}'; " + \
-            "need 'total', 'positivity', or 'confirmed covid-19 admissions'"
+            "need one of: " + ", ".join(ref_list)
        )
    def __setitem__(self, key, newvalue):
        """Use DatasetTimes like a dictionary."""
+        ref_list = list(SIGNALS.keys())
        if key.lower() == "positivity":
            self.positivity_reference_date = newvalue
        if key.lower() == "total":
            self.total_reference_date = newvalue
        if key.lower() == "confirmed covid-19 admissions":
            self.hosp_reference_date = newvalue
-        else:
+        if key.lower() in ["doses administered", "booster doses administered"]:
+            self.cumulative_vac_reference_date = newvalue
+        if key.lower() in ["fully vaccinated", "booster dose since"]:
+            self.vac_reference_date = newvalue
+        if key.lower() not in ref_list:
            raise ValueError(
                f"Bad reference date type request '{key}'; " + \
-                "need 'total', 'positivity', or 'confirmed covid-19 admissions'"
+                "need one of: " + ", ".join(ref_list)
            )
    def __eq__(self, other):
        """Check equality by value."""
@@ -164,14 +199,21 @@ def skip_overheader(header):
        # include "VIRAL (RT-PCR) LAB TESTING: [LAST|PREVIOUS] WEEK (August 24-30, ..."
        # include "HOSPITAL UTILIZATION: LAST WEEK (January 2-8)"
        return not (isinstance(header, str) and \
-                    (header.startswith("TESTING:") or \
+                    (((header.startswith("TESTING:") or \
                     header.startswith("VIRAL (RT-PCR) LAB TESTING:") or \
-                    header.startswith("HOSPITAL UTILIZATION:")) and \
+                    header.startswith("HOSPITAL UTILIZATION: ")) and \
                    # exclude "TESTING: % CHANGE FROM PREVIOUS WEEK" \
                    # exclude "TESTING: DEMOGRAPHIC DATA" \
                    # exclude "HOSPITAL UTILIZATION: CHANGE FROM PREVIOUS WEEK" \
                    # exclude "HOSPITAL UTILIZATION: DEMOGRAPHIC DATA" \
-                    header.find("WEEK (") > 0)
+                    header.find("WEEK (") > 0) or \
+                    # include "COVID-19 VACCINATION DATA: CUMULATIVE (January 25)"
+                    # include "COVID-19 VACCINATION DATA: LAST WEEK (January 25-31)"
+                    (header.startswith("COVID-19 VACCINATION DATA: CUMULATIVE") or
+                     header.startswith("COVID-19 VACCINATION DATA: LAST WEEK") \
+                    )))
+
+
    def _parse_times_for_sheet(self, sheet):
        """Record reference dates for this sheet."""
        # grab reference dates from overheaders
@@ -198,21 +240,32 @@ def _parse_times_for_sheet(self, sheet):
                    self.times[dt.column][sig] = dt[sig]
            else:
                self.times[dt.column] = dt
-        assert len(self.times) == 2, \
-            f"No times extracted from overheaders:\n{NEWLINE.join(str(s) for s in overheaders)}"
+
+        if self.publish_date <= datetime.date(2021, 1, 11):
+            # No vaccination data available, so we only have hospitalization and testing overheaders
+            assert len(self.times) == 2, \
+                f"No times extracted from overheaders:\n{NEWLINE.join(str(s) for s in overheaders)}"
+        else:
+            assert len(self.times) == 3, \
+                f"No times extracted from overheaders:\n{NEWLINE.join(str(s) for s in overheaders)}"

    @staticmethod
    def retain_header(header):
        """Ignore irrelevant headers."""
-        return all([
+        return ((all([
            # include "Total NAATs - [last|previous] 7 days ..."
            # include "Total RT-PCR diagnostic tests - [last|previous] 7 days ..."
            # include "NAAT positivity rate - [last|previous] 7 days ..."
            # include "Viral (RT-PCR) lab test positivity rate - [last|previous] 7 days ..."
+            # include "Booster doses administered - [last|previous] 7 days ..."
+            # include "Doses administered - [last|previous] 7 days ..."
            (header.startswith("Total NAATs") or
             header.startswith("NAAT positivity rate") or
             header.startswith("Total RT-PCR") or
-             header.startswith("Viral (RT-PCR)")),
+             header.startswith("Viral (RT-PCR)") or
+             header.startswith("Booster") or
+             header.startswith("Doses administered -")
+            ),
            # exclude "NAAT positivity rate - absolute change ..."
            header.find("7 days") > 0,
            # exclude "NAAT positivity rate - last 7 days - ages <5"
@@ -227,7 +280,25 @@ def retain_header(header):
            header.find(" age") < 0,
            # exclude "Confirmed COVID-19 admissions per 100 inpatient beds - last 7 days"
            header.find(" beds") < 0,
-        ])
+        ])) or (all([
+            # include "People who are fully vaccinated"
+            # include "People who have received a booster dose since August 13, 2021"
+            header.startswith("People who"),
+            # exclude "People who are fully vaccinated as % of total population"
+            # exclude "People who have received a booster dose as % of fully vaccinated population"
+            header.find("%") < 0,
+            # exclude "People who are fully vaccinated - ages 5-11" ...
+            # exclude "People who have received a booster dose - ages 65+" ...
+            header.find(" age") < 0,
+            # exclude "People who are fully vaccinated - 12-17" ...
+            header.find("-") < 0,
+
+        ]) or all([
+            # include "People with full course administered"
+            header.startswith("People with full course"),
+            # exclude "People with full course administered as % of adult population"
+            header.find("%") < 0,
+        ])))
    def _parse_sheet(self, sheet):
        """Extract data frame for this sheet."""
        df = pd.read_excel(
@@ -238,24 +309,68 @@ def _parse_sheet(self, sheet):
        )
        if sheet.row_filter:
            df = df.loc[sheet.row_filter(df)]
+
+
+        def select_fn(h):
+            """Allow for default to the 7-day in the name of the dataframe column."""
+            try:
+                return (RE_COLUMN_FROM_HEADER.findall(h)[0], h, h.lower())
+            except IndexError:
+                return ("", h, h.lower())
+
        select = [
-            (RE_COLUMN_FROM_HEADER.findall(h)[0], h, h.lower())
+            select_fn(h)
            for h in list(df.columns)
            if self.retain_header(h)
        ]

        for sig in SIGNALS:
+            ## Check if field is known to be missing
            # Hospital admissions not available at the county or CBSA level prior to Jan 8, 2021.
-            if (sheet.level == "msa" or sheet.level == "county") \
+            is_hosp_adm_before_jan8 = (sheet.level == "msa" or sheet.level == "county") \
                and self.publish_date < datetime.date(2021, 1, 8) \
-                and sig == "confirmed covid-19 admissions":
+                and sig == "confirmed covid-19 admissions"
+            # Booster data not available before November 1 2021.
+            is_booster_before_nov1 = self.publish_date < datetime.date(2021, 11, 1) \
+                and (sig in ["booster dose since", "booster doses administered"])
+            # Booster and weekly doses administered not available below the state level.
+            is_booster_below_state = ((sheet.level != "hhs" and sheet.level != "state") \
+                and (sig in ["doses administered", \
+                    "booster doses administered", "booster dose since"]))
+            # Weekly doses administered not available on or before Apr 29, 2021.
+            is_dose_admin_apr29 = self.publish_date <= datetime.date(2021, 4, 29) \
+                and sig == "doses administered"
+            # People fully vaccinated not available on or before Apr 11, 2021 at the CBSA level.
+            is_fully_vax_msa_before_apr11 = (sheet.level == "msa" or sheet.level == "county") \
+                and self.publish_date <= datetime.date(2021, 4, 11) \
+                and sig == "fully vaccinated"
+            # People fully vaccinated not available before Jan 15, 2021 at any geo level.
+            is_fully_vax_before_jan14 = self.publish_date <= datetime.date(2021, 1, 14) \
+                and sig == "fully vaccinated"
+
+            if any([is_hosp_adm_before_jan8,
+                    is_booster_before_nov1,
+                    is_booster_below_state,
+                    is_dose_admin_apr29,
+                    is_fully_vax_msa_before_apr11,
+                    is_fully_vax_before_jan14
+                    ]):
                self.dfs[(sheet.level, sig, NOT_PROP)] = pd.DataFrame(
                    columns=["geo_id", "timestamp", "val", \
                        "se", "sample_size", "publish_date"]
                )
                continue

            sig_select = [s for s in select if s[-1].find(sig) >= 0]
+            # The name of the cumulative vaccination was changed after 03/09/2021
+            # when J&J vaccines were added.
+            if (sig == "fully vaccinated") and (len(sig_select) == 0):
+                sig_select = [s for s in select if s[-1].find("people with full course") >= 0]
+            # Since "doses administered" is a substring of another desired header,
+            # "booster doses administered", we need to more strictly check if "doses administered"
+            # occurs at the beginning of a header to find the correct match.
+            if sig == "doses administered":
+                sig_select = [s for s in select if s[-1].startswith(sig)]
            assert len(sig_select) > 0, \
                f"No {sig} in any of {select}\n\nAll headers:\n{NEWLINE.join(list(df.columns))}"
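
A toy aside (illustrative only; the headers below are assumed, not taken from a real report) showing why "doses administered" needs the stricter startswith() match added above — a plain substring search would also select the booster column:

select = [
    ("last", "Doses administered - last 7 days", "doses administered - last 7 days"),
    ("last", "Booster doses administered - last 7 days", "booster doses administered - last 7 days"),
]
sig = "doses administered"
loose = [s for s in select if s[-1].find(sig) >= 0]    # picks up both columns
strict = [s for s in select if s[-1].startswith(sig)]  # picks up only the non-booster column
assert len(loose) == 2 and len(strict) == 1
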
@@ -270,11 +385,10 @@ def _parse_sheet(self, sheet):
                })
                for si in sig_select
            ])
-
        for sig in COUNTS_7D_SIGNALS:
+            assert (sheet.level, sig, NOT_PROP) in self.dfs.keys()
            self.dfs[(sheet.level, sig, NOT_PROP)]["val"] /= 7  # 7-day total -> 7-day average

-
def as_cached_filename(params, config):
    """Formulate a filename to uniquely identify this report in the input cache."""
    # eg "Community Profile Report 20220128.xlsx"
@@ -299,7 +413,6 @@ def fetch_listing(params):
        )
        for el in listing if el['filename'].endswith("xlsx")
    ]
-
    if params['indicator']['reports'] == 'new':
        # drop files we already have in the input cache
        listing = [el for el in listing if not os.path.exists(el['cached_filename'])]
@@ -364,7 +477,6 @@ def fetch_new_reports(params, logger=None):

    # download and parse individual reports
    datasets = download_and_parse(listing, logger)
-
    # collect like signals together, keeping most recent publish date
    ret = {}
    for sig, lst in datasets.items():
@@ -381,7 +493,6 @@ def fetch_new_reports(params, logger=None):

        if len(latest_sig_df.index) > 0:
            latest_sig_df = latest_sig_df.reset_index(drop=True)
-
            assert all(latest_sig_df.groupby(
                ["timestamp", "geo_id"]
            ).size(