From 749a6c8fabeca041809a5d1491b9dc0e3772a5d9 Mon Sep 17 00:00:00 2001 From: Ananya-Joshi Date: Tue, 25 Jan 2022 16:26:16 -0500 Subject: [PATCH 01/33] first attempt at adding booster signals --- dsew_community_profile/Makefile | 1 - .../constants.py | 5 +++ .../delphi_dsew_community_profile/pull.py | 32 ++++++++++++++++--- 3 files changed, 32 insertions(+), 6 deletions(-) diff --git a/dsew_community_profile/Makefile b/dsew_community_profile/Makefile index bdea33afd..0ea2bac14 100644 --- a/dsew_community_profile/Makefile +++ b/dsew_community_profile/Makefile @@ -27,4 +27,3 @@ test: clean: rm -rf env - rm -f params.json diff --git a/dsew_community_profile/delphi_dsew_community_profile/constants.py b/dsew_community_profile/delphi_dsew_community_profile/constants.py index 51c62b5ea..00f051878 100644 --- a/dsew_community_profile/delphi_dsew_community_profile/constants.py +++ b/dsew_community_profile/delphi_dsew_community_profile/constants.py @@ -59,7 +59,12 @@ class Transform: "confirmed covid-19 admissions": { "is_rate" : False, "api_name": "confirmed_admissions_covid_1d_7dav" + }, + "booster": { + "is_rate" : False, + "api_name": "booster_7dav" } + } COUNTS_7D_SIGNALS = {key for key, value in SIGNALS.items() if not value["is_rate"]} diff --git a/dsew_community_profile/delphi_dsew_community_profile/pull.py b/dsew_community_profile/delphi_dsew_community_profile/pull.py index a65b26a07..11198b619 100644 --- a/dsew_community_profile/delphi_dsew_community_profile/pull.py +++ b/dsew_community_profile/delphi_dsew_community_profile/pull.py @@ -90,9 +90,11 @@ def __getitem__(self, key): return self.total_reference_date if key.lower()=="confirmed covid-19 admissions": return self.hosp_reference_date + if key.lower()=="booster": + return self.hosp_reference_date raise ValueError( f"Bad reference date type request '{key}'; " + \ - "need 'total', 'positivity', or 'confirmed covid-19 admissions'" + "need 'total', 'positivity', 'booster', or 'confirmed covid-19 admissions'" ) def __setitem__(self, key, newvalue): """Use DatasetTimes like a dictionary.""" @@ -102,10 +104,12 @@ def __setitem__(self, key, newvalue): self.total_reference_date = newvalue if key.lower()=="confirmed covid-19 admissions": self.hosp_reference_date = newvalue + elif key.lower()=="booster": + self.hosp_reference_date = newvalue else: raise ValueError( f"Bad reference date type request '{key}'; " + \ - "need 'total', 'positivity', or 'confirmed covid-19 admissions'" + "need 'total', 'positivity', 'booster', or 'confirmed covid-19 admissions'" ) def __eq__(self, other): """Check equality by value.""" @@ -211,7 +215,9 @@ def retain_header(header): (header.startswith("Total NAATs") or header.startswith("NAAT positivity rate") or header.startswith("Total RT-PCR") or - header.startswith("Viral (RT-PCR)")), + header.startswith("Viral (RT-PCR)") or + header.startswith("Booster") + ), # exclude "NAAT positivity rate - absolute change ..." header.find("7 days") > 0, # exclude "NAAT positivity rate - last 7 days - ages <5" @@ -242,7 +248,7 @@ def _parse_sheet(self, sheet): for h in list(df.columns) if self.retain_header(h) ] - + for sig in SIGNALS: # Hospital admissions not available at the county or CBSA level prior to Jan 8, 2021. if (sheet.level == "msa" or sheet.level == "county") \ @@ -254,7 +260,22 @@ def _parse_sheet(self, sheet): ) continue + + if ((sheet.level != "hhs" and sheet.level != "state") \ + and sig == "booster"): + self.dfs[(sheet.level, sig)] = pd.DataFrame( + columns = ["geo_id", "timestamp", "val", \ + "se", "sample_size", "publish_date"] + ) + continue + sig_select = [s for s in select if s[-1].find(sig) >= 0] + + + + + + assert len(sig_select) > 0, \ f"No {sig} in any of {select}\n\nAll headers:\n{NEWLINE.join(list(df.columns))}" @@ -269,11 +290,12 @@ def _parse_sheet(self, sheet): }) for si in sig_select ]) - + for sig in COUNTS_7D_SIGNALS: self.dfs[(sheet.level, sig)]["val"] /= 7 # 7-day total -> 7-day average + def as_cached_filename(params, config): """Formulate a filename to uniquely identify this report in the input cache.""" return os.path.join( From 7abc758dac340aba9fcfab049d4698680792cc56 Mon Sep 17 00:00:00 2001 From: Ananya-Joshi Date: Tue, 25 Jan 2022 16:51:25 -0500 Subject: [PATCH 02/33] temporary changes --- dsew_community_profile/delphi_dsew_community_profile/pull.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/dsew_community_profile/delphi_dsew_community_profile/pull.py b/dsew_community_profile/delphi_dsew_community_profile/pull.py index 11198b619..b4295ed59 100644 --- a/dsew_community_profile/delphi_dsew_community_profile/pull.py +++ b/dsew_community_profile/delphi_dsew_community_profile/pull.py @@ -212,6 +212,7 @@ def retain_header(header): # include "Total RT-PCR diagnostic tests - [last|previous] 7 days ..." # include "NAAT positivity rate - [last|previous] 7 days ..." # include "Viral (RT-PCR) lab test positivity rate - [last|previous] 7 days ..." + # include "Booster doses administerd - [last|previous] 7 days ..." (header.startswith("Total NAATs") or header.startswith("NAAT positivity rate") or header.startswith("Total RT-PCR") or @@ -305,6 +306,7 @@ def as_cached_filename(params, config): def fetch_listing(params): """Generate the list of report files to process.""" + print(requests.get(DOWNLOAD_LISTING).json()) listing = requests.get(DOWNLOAD_LISTING).json()['metadata']['attachments'] # drop the pdf files From 5b5f19ee7601351a28a8325d6e65b8fb467fa4bc Mon Sep 17 00:00:00 2001 From: Ananya-Joshi Date: Wed, 26 Jan 2022 17:13:47 -0500 Subject: [PATCH 03/33] lint changes --- .../delphi_dsew_community_profile/pull.py | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/dsew_community_profile/delphi_dsew_community_profile/pull.py b/dsew_community_profile/delphi_dsew_community_profile/pull.py index b4295ed59..efe41d0b3 100644 --- a/dsew_community_profile/delphi_dsew_community_profile/pull.py +++ b/dsew_community_profile/delphi_dsew_community_profile/pull.py @@ -249,7 +249,6 @@ def _parse_sheet(self, sheet): for h in list(df.columns) if self.retain_header(h) ] - for sig in SIGNALS: # Hospital admissions not available at the county or CBSA level prior to Jan 8, 2021. if (sheet.level == "msa" or sheet.level == "county") \ @@ -271,12 +270,6 @@ def _parse_sheet(self, sheet): continue sig_select = [s for s in select if s[-1].find(sig) >= 0] - - - - - - assert len(sig_select) > 0, \ f"No {sig} in any of {select}\n\nAll headers:\n{NEWLINE.join(list(df.columns))}" @@ -291,12 +284,9 @@ def _parse_sheet(self, sheet): }) for si in sig_select ]) - for sig in COUNTS_7D_SIGNALS: self.dfs[(sheet.level, sig)]["val"] /= 7 # 7-day total -> 7-day average - - def as_cached_filename(params, config): """Formulate a filename to uniquely identify this report in the input cache.""" return os.path.join( @@ -306,7 +296,6 @@ def as_cached_filename(params, config): def fetch_listing(params): """Generate the list of report files to process.""" - print(requests.get(DOWNLOAD_LISTING).json()) listing = requests.get(DOWNLOAD_LISTING).json()['metadata']['attachments'] # drop the pdf files @@ -383,7 +372,7 @@ def fetch_new_reports(params, logger=None): # download and parse individual reports datasets = download_and_parse(listing, logger) - + print(datasets, datasets.items()) # collect like signals together, keeping most recent publish date ret = {} for sig, lst in datasets.items(): From e51f756429d480b59dd6bb7e0c32ecf2bed8f964 Mon Sep 17 00:00:00 2001 From: Ananya-Joshi Date: Mon, 31 Jan 2022 15:29:12 -0500 Subject: [PATCH 04/33] Changes after Katie's Review --- dsew_community_profile/Makefile | 1 + dsew_community_profile/delphi_dsew_community_profile/pull.py | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/dsew_community_profile/Makefile b/dsew_community_profile/Makefile index 0ea2bac14..bdea33afd 100644 --- a/dsew_community_profile/Makefile +++ b/dsew_community_profile/Makefile @@ -27,3 +27,4 @@ test: clean: rm -rf env + rm -f params.json diff --git a/dsew_community_profile/delphi_dsew_community_profile/pull.py b/dsew_community_profile/delphi_dsew_community_profile/pull.py index efe41d0b3..75670e698 100644 --- a/dsew_community_profile/delphi_dsew_community_profile/pull.py +++ b/dsew_community_profile/delphi_dsew_community_profile/pull.py @@ -372,7 +372,6 @@ def fetch_new_reports(params, logger=None): # download and parse individual reports datasets = download_and_parse(listing, logger) - print(datasets, datasets.items()) # collect like signals together, keeping most recent publish date ret = {} for sig, lst in datasets.items(): From 470a48f1aa27659e152b4c38729e183c95403153 Mon Sep 17 00:00:00 2001 From: Ananya-Joshi Date: Wed, 2 Feb 2022 11:36:23 -0500 Subject: [PATCH 05/33] Added 4 indicators for vaccination --- .../constants.py | 17 ++++- .../delphi_dsew_community_profile/pull.py | 64 +++++++++++++++---- dsew_community_profile/input_cache/.gitignore | 1 - dsew_community_profile/params.json.template | 4 +- 4 files changed, 67 insertions(+), 19 deletions(-) delete mode 100644 dsew_community_profile/input_cache/.gitignore diff --git a/dsew_community_profile/delphi_dsew_community_profile/constants.py b/dsew_community_profile/delphi_dsew_community_profile/constants.py index 00f051878..29f0235be 100644 --- a/dsew_community_profile/delphi_dsew_community_profile/constants.py +++ b/dsew_community_profile/delphi_dsew_community_profile/constants.py @@ -60,11 +60,22 @@ class Transform: "is_rate" : False, "api_name": "confirmed_admissions_covid_1d_7dav" }, - "booster": { + "fully vaccinated": { "is_rate" : False, - "api_name": "booster_7dav" + "api_name": "full_vaccinated_7dav" + }, + "booster dose since": { + "is_rate" : False, + "api_name": "booster_doses_7dav" + }, + "booster doses administered": { + "is_rate" : False, + "api_name": "total_booster_7dav" + }, + "doses administered": { + "is_rate" : False, + "api_name": "total_doses_7dav" } - } COUNTS_7D_SIGNALS = {key for key, value in SIGNALS.items() if not value["is_rate"]} diff --git a/dsew_community_profile/delphi_dsew_community_profile/pull.py b/dsew_community_profile/delphi_dsew_community_profile/pull.py index 75670e698..7a6e90f3a 100644 --- a/dsew_community_profile/delphi_dsew_community_profile/pull.py +++ b/dsew_community_profile/delphi_dsew_community_profile/pull.py @@ -84,32 +84,32 @@ def as_date(sub_result): total_reference_date, hosp_reference_date) def __getitem__(self, key): """Use DatasetTimes like a dictionary.""" + ref_list = list(SIGNALS.keys()) + _ = [ref_list.remove(x) for x in ["positivity", "total"]] if key.lower()=="positivity": return self.positivity_reference_date if key.lower()=="total": return self.total_reference_date - if key.lower()=="confirmed covid-19 admissions": - return self.hosp_reference_date - if key.lower()=="booster": + if key.lower() in ref_list: return self.hosp_reference_date raise ValueError( f"Bad reference date type request '{key}'; " + \ - "need 'total', 'positivity', 'booster', or 'confirmed covid-19 admissions'" + "need one of: " + " ,".join(ref_list) ) def __setitem__(self, key, newvalue): """Use DatasetTimes like a dictionary.""" + ref_list = list(SIGNALS.keys()) + _ = [ref_list.remove(x) for x in ["positivity", "total"]] if key.lower()=="positivity": self.positivity_reference_date = newvalue if key.lower()=="total": self.total_reference_date = newvalue - if key.lower()=="confirmed covid-19 admissions": - self.hosp_reference_date = newvalue - elif key.lower()=="booster": + if key.lower() in ref_list: self.hosp_reference_date = newvalue else: raise ValueError( f"Bad reference date type request '{key}'; " + \ - "need 'total', 'positivity', 'booster', or 'confirmed covid-19 admissions'" + "need one of: " + " ,".join(ref_list) ) def __eq__(self, other): """Check equality by value.""" @@ -207,17 +207,19 @@ def _parse_times_for_sheet(self, sheet): @staticmethod def retain_header(header): """Ignore irrelevant headers.""" - return all([ + return ((all([ # include "Total NAATs - [last|previous] 7 days ..." # include "Total RT-PCR diagnostic tests - [last|previous] 7 days ..." # include "NAAT positivity rate - [last|previous] 7 days ..." # include "Viral (RT-PCR) lab test positivity rate - [last|previous] 7 days ..." # include "Booster doses administerd - [last|previous] 7 days ..." + # include "Doses administered - [last|previous] 7 days ..." (header.startswith("Total NAATs") or header.startswith("NAAT positivity rate") or header.startswith("Total RT-PCR") or header.startswith("Viral (RT-PCR)") or - header.startswith("Booster") + header.startswith("Booster") or + header.startswith("Doses administered -") ), # exclude "NAAT positivity rate - absolute change ..." header.find("7 days") > 0, @@ -233,7 +235,17 @@ def retain_header(header): header.find(" age") < 0, # exclude "Confirmed COVID-19 admissions per 100 inpatient beds - last 7 days" header.find(" beds") < 0, - ]) + ])) or all([ + # include "People who are fully vaccinated" + # include "People who have received a booster dose since August 13, 2021" + header.startswith("People who"), + # exclude "People who are fully vaccinated as % of total population" + # exclude "People who have received a booster dose as % of fully vaccinated population" + header.find("%") < 0, + # exclude "People who are fully vaccinated - ages 5-11" ... + # exclude "People who have received a booster dose - ages 65+" ... + header.find(" age") < 0, + ])) def _parse_sheet(self, sheet): """Extract data frame for this sheet.""" df = pd.read_excel( @@ -244,11 +256,21 @@ def _parse_sheet(self, sheet): ) if sheet.row_filter: df = df.loc[sheet.row_filter(df)] + + + def select_fn(h): + """Allow for default to the 7-day in the name of the dataframe column.""" + try: + return (RE_COLUMN_FROM_HEADER.findall(h)[0], h, h.lower()) + except IndexError: + return ("last", h, h.lower()) + select = [ - (RE_COLUMN_FROM_HEADER.findall(h)[0], h, h.lower()) + select_fn(h) for h in list(df.columns) if self.retain_header(h) ] + for sig in SIGNALS: # Hospital admissions not available at the county or CBSA level prior to Jan 8, 2021. if (sheet.level == "msa" or sheet.level == "county") \ @@ -261,8 +283,20 @@ def _parse_sheet(self, sheet): continue + + # Booster data not available before November 2021. + if self.publish_date < datetime.date(2021, 11, 1) \ + and (sig in ["booster dose since", "booster doses administered"]) : + self.dfs[(sheet.level, sig)] = pd.DataFrame( + columns = ["geo_id", "timestamp", "val", \ + "se", "sample_size", "publish_date"] + ) + continue + + # Booster and weekly doses administered not available below the state level. if ((sheet.level != "hhs" and sheet.level != "state") \ - and sig == "booster"): + and (sig in ["doses administered", \ + "booster doses administered", "booster dose since"])): self.dfs[(sheet.level, sig)] = pd.DataFrame( columns = ["geo_id", "timestamp", "val", \ "se", "sample_size", "publish_date"] @@ -270,6 +304,9 @@ def _parse_sheet(self, sheet): continue sig_select = [s for s in select if s[-1].find(sig) >= 0] + + if sig == "doses administered": + sig_select = [s for s in select if s[-1].startswith(sig)] assert len(sig_select) > 0, \ f"No {sig} in any of {select}\n\nAll headers:\n{NEWLINE.join(list(df.columns))}" @@ -388,7 +425,6 @@ def fetch_new_reports(params, logger=None): if len(latest_sig_df.index) > 0: latest_sig_df = latest_sig_df.reset_index(drop=True) - assert all(latest_sig_df.groupby( ["timestamp", "geo_id"] ).size( diff --git a/dsew_community_profile/input_cache/.gitignore b/dsew_community_profile/input_cache/.gitignore deleted file mode 100644 index 7c1222033..000000000 --- a/dsew_community_profile/input_cache/.gitignore +++ /dev/null @@ -1 +0,0 @@ -*.xlsx diff --git a/dsew_community_profile/params.json.template b/dsew_community_profile/params.json.template index 3a64d71ab..4415eae7a 100644 --- a/dsew_community_profile/params.json.template +++ b/dsew_community_profile/params.json.template @@ -11,7 +11,9 @@ "export_signals": [ "confirmed covid-19 admissions", "total", - "positivity" + "positivity", + "doses", + "people" ] }, "validation": { From f9b62b53486181102cc8fa9d9467c310f30d29f5 Mon Sep 17 00:00:00 2001 From: Ananya-Joshi Date: Wed, 2 Feb 2022 21:29:46 -0500 Subject: [PATCH 06/33] working with the new overheaders --- .../delphi_dsew_community_profile/pull.py | 95 ++++++++++++++++--- dsew_community_profile/params.json.template | 6 +- 2 files changed, 87 insertions(+), 14 deletions(-) diff --git a/dsew_community_profile/delphi_dsew_community_profile/pull.py b/dsew_community_profile/delphi_dsew_community_profile/pull.py index 7a6e90f3a..bd513fe4b 100644 --- a/dsew_community_profile/delphi_dsew_community_profile/pull.py +++ b/dsew_community_profile/delphi_dsew_community_profile/pull.py @@ -31,6 +31,21 @@ rf'HOSPITAL UTILIZATION: (.*) WEEK \({DATE_RANGE_EXP}\)' ) +# example: "COVID-19 VACCINATION DATA: LAST WEEK (January 5-11)" +RE_DATE_FROM_VAC_HEADER_WEEK= re.compile( + rf'COVID-19 VACCINATION DATA: (.*) WEEK \({DATE_RANGE_EXP}\)' +) + + +RE_BAD_DATE_FROM_VAC_HEADER_WEEK= re.compile( + rf'COVID-19 VACCINATION DATA: DEMOGRAPHIC DATA LAST WEEK' +) + +# example: 'COVID-19 VACCINATION DATA: CUMULATIVE (January 11)' +RE_DATE_FROM_VAC_HEADER_DAY= re.compile( + rf'COVID-19 VACCINATION DATA: CUMULATIVE (.*)\({DATE_EXP}\)' +) + # example: "NAAT positivity rate - last 7 days (may be an underestimate due to delayed reporting)" # example: "Total NAATs - last 7 days (may be an underestimate due to delayed reporting)" RE_COLUMN_FROM_HEADER = re.compile('- (.*) 7 days') @@ -43,10 +58,20 @@ class DatasetTimes: positivity_reference_date: datetime.date total_reference_date: datetime.date hosp_reference_date: datetime.date + vac_reference_date: datetime.date + vac_reference_day: datetime.date @staticmethod def from_header(header, publish_date): """Convert reference dates in overheader to DatasetTimes.""" + def as_day(sub_result): + month = sub_result[0] + assert month, f"Bad month in header: {header}\nsub_result: {sub_result}" + month_numeric = datetime.datetime.strptime(month, "%B").month + day = sub_result[1] + year = publish_date.year + return datetime.datetime.strptime(f"{year}-{month}-{day}", "%Y-%B-%d").date() + def as_date(sub_result): month = sub_result[2] if sub_result[2] else sub_result[0] assert month, f"Bad month in header: {header}\nsub_result: {sub_result}" @@ -68,30 +93,54 @@ def as_date(sub_result): total_reference_date = as_date(findall_result[6:10]) else: total_reference_date = positivity_reference_date - hosp_reference_date = None + vac_reference_date = None + vac_reference_day = None elif RE_DATE_FROM_HOSP_HEADER.match(header): findall_result = RE_DATE_FROM_HOSP_HEADER.findall(header)[0] + print(findall_result) column = findall_result[0].lower() hosp_reference_date = as_date(findall_result[1:5]) - total_reference_date = None positivity_reference_date = None + vac_reference_date = None + vac_reference_day = None + elif RE_DATE_FROM_VAC_HEADER_WEEK.match(header): + findall_result = RE_DATE_FROM_VAC_HEADER_WEEK.findall(header)[0] + column = findall_result[0].lower() + vac_reference_date = as_date(findall_result[1:5]) + total_reference_date = None + positivity_reference_date = None + hosp_reference_date = None + vac_reference_day = None + elif RE_DATE_FROM_VAC_HEADER_DAY.match(header): + findall_result = RE_DATE_FROM_VAC_HEADER_DAY.findall(header)[0] + column = findall_result[0].lower() + vac_reference_day = as_day(findall_result[1:]) + total_reference_date = None + positivity_reference_date = None + hosp_reference_date = None + vac_reference_date = None else: raise ValueError(f"Couldn't find reference date in header '{header}'") + print("ret vals: ", column, positivity_reference_date, + total_reference_date, hosp_reference_date, vac_reference_day, vac_reference_date) return DatasetTimes(column, positivity_reference_date, - total_reference_date, hosp_reference_date) + total_reference_date, hosp_reference_date, vac_reference_day, vac_reference_date) def __getitem__(self, key): """Use DatasetTimes like a dictionary.""" ref_list = list(SIGNALS.keys()) - _ = [ref_list.remove(x) for x in ["positivity", "total"]] if key.lower()=="positivity": return self.positivity_reference_date if key.lower()=="total": return self.total_reference_date - if key.lower() in ref_list: + if key.lower()=="confirmed covid-19 admissions": return self.hosp_reference_date + if key.lower() in ["doses administered","booster doses administered"]: + return self.vac_reference_day + if key.lower() in ["fully vaccinated","booster dose since"]: + return self.vac_reference_date raise ValueError( f"Bad reference date type request '{key}'; " + \ "need one of: " + " ,".join(ref_list) @@ -99,14 +148,19 @@ def __getitem__(self, key): def __setitem__(self, key, newvalue): """Use DatasetTimes like a dictionary.""" ref_list = list(SIGNALS.keys()) - _ = [ref_list.remove(x) for x in ["positivity", "total"]] if key.lower()=="positivity": self.positivity_reference_date = newvalue if key.lower()=="total": self.total_reference_date = newvalue if key.lower() in ref_list: self.hosp_reference_date = newvalue - else: + if key.lower()=="confirmed covid-19 admissions": + self.hosp_reference_date = newvalue + if key.lower() in ["doses administered","booster doses administered"]: + self.vac_reference_day = newvalue + if key.lower() in ["fully vaccinated","booster dose since"]: + self.vac_reference_date = newvalue + if key.lower() not in ref_list: raise ValueError( f"Bad reference date type request '{key}'; " + \ "need one of: " + " ,".join(ref_list) @@ -166,15 +220,27 @@ def skip_overheader(header): # include "TESTING: [LAST|PREVIOUS] WEEK (October 24-30, Test Volume October 20-26)" # include "VIRAL (RT-PCR) LAB TESTING: [LAST|PREVIOUS] WEEK (August 24-30, ..." # include "HOSPITAL UTILIZATION: LAST WEEK (January 2-8)" + return not (isinstance(header, str) and \ - (header.startswith("TESTING:") or \ + (((header.startswith("TESTING:") or \ header.startswith("VIRAL (RT-PCR) LAB TESTING:") or \ - header.startswith("HOSPITAL UTILIZATION:")) and \ + header.startswith("HOSPITAL UTILIZATION: ")) and \ # exclude "TESTING: % CHANGE FROM PREVIOUS WEEK" \ # exclude "TESTING: DEMOGRAPHIC DATA" \ # exclude "HOSPITAL UTILIZATION: CHANGE FROM PREVIOUS WEEK" \ # exclude "HOSPITAL UTILIZATION: DEMOGRAPHIC DATA" \ - header.find("WEEK (") > 0) + # exclude "COVID-19 VACCINATION DATA: DEMOGRAPHIC DATA % CHANGE FROM PREVIOUS WEEK" \ + # exclude "COVID-19 VACCINATION DATA: DEMOGRAPHIC DATA CUMULATIVE" + header.find("WEEK (") > 0) or \ + # include "COVID-19 VACCINATION DATA: CUMULATIVE (January 25)" + # include "COVID-19 VACCINATION DATA: DEMOGRAPHIC DATA LAST WEEK" + (header.startswith("COVID-19 VACCINATION DATA: CUMULATIVE") or + header.startswith("COVID-19 VACCINATION DATA: LAST WEEK") \ + ))) + + + + def _parse_times_for_sheet(self, sheet): """Record reference dates for this sheet.""" # grab reference dates from overheaders @@ -201,7 +267,12 @@ def _parse_times_for_sheet(self, sheet): self.times[dt.column][sig] = dt[sig] else: self.times[dt.column] = dt - assert len(self.times) == 2, \ + print(self.times) + print("these are self.times") + for key, value in self.times.items(): + print(key, value) + print("done") + assert len(self.times) == 3, \ f"No times extracted from overheaders:\n{NEWLINE.join(str(s) for s in overheaders)}" @staticmethod @@ -263,7 +334,7 @@ def select_fn(h): try: return (RE_COLUMN_FROM_HEADER.findall(h)[0], h, h.lower()) except IndexError: - return ("last", h, h.lower()) + return ("", h, h.lower()) select = [ select_fn(h) diff --git a/dsew_community_profile/params.json.template b/dsew_community_profile/params.json.template index 4415eae7a..3469bc2d7 100644 --- a/dsew_community_profile/params.json.template +++ b/dsew_community_profile/params.json.template @@ -12,8 +12,10 @@ "confirmed covid-19 admissions", "total", "positivity", - "doses", - "people" + "doses administered", + "booster doses administered", + "fully vaccinated", + "booster dose since", ] }, "validation": { From 499a5f4be0aa50bb8dd2c386c31af676bbf44cc2 Mon Sep 17 00:00:00 2001 From: Ananya-Joshi Date: Wed, 2 Feb 2022 21:50:20 -0500 Subject: [PATCH 07/33] changes to tests and lint --- .../delphi_dsew_community_profile/pull.py | 19 ++++++--------- dsew_community_profile/tests/test_pull.py | 24 ++++++++++--------- 2 files changed, 20 insertions(+), 23 deletions(-) diff --git a/dsew_community_profile/delphi_dsew_community_profile/pull.py b/dsew_community_profile/delphi_dsew_community_profile/pull.py index bd513fe4b..70641fd94 100644 --- a/dsew_community_profile/delphi_dsew_community_profile/pull.py +++ b/dsew_community_profile/delphi_dsew_community_profile/pull.py @@ -38,7 +38,7 @@ RE_BAD_DATE_FROM_VAC_HEADER_WEEK= re.compile( - rf'COVID-19 VACCINATION DATA: DEMOGRAPHIC DATA LAST WEEK' + r'COVID-19 VACCINATION DATA: DEMOGRAPHIC DATA LAST WEEK' ) # example: 'COVID-19 VACCINATION DATA: CUMULATIVE (January 11)' @@ -66,8 +66,6 @@ def from_header(header, publish_date): """Convert reference dates in overheader to DatasetTimes.""" def as_day(sub_result): month = sub_result[0] - assert month, f"Bad month in header: {header}\nsub_result: {sub_result}" - month_numeric = datetime.datetime.strptime(month, "%B").month day = sub_result[1] year = publish_date.year return datetime.datetime.strptime(f"{year}-{month}-{day}", "%Y-%B-%d").date() @@ -94,7 +92,7 @@ def as_date(sub_result): else: total_reference_date = positivity_reference_date hosp_reference_date = None - vac_reference_date = None + vac_reference_date = None vac_reference_day = None elif RE_DATE_FROM_HOSP_HEADER.match(header): findall_result = RE_DATE_FROM_HOSP_HEADER.findall(header)[0] @@ -103,7 +101,7 @@ def as_date(sub_result): hosp_reference_date = as_date(findall_result[1:5]) total_reference_date = None positivity_reference_date = None - vac_reference_date = None + vac_reference_date = None vac_reference_day = None elif RE_DATE_FROM_VAC_HEADER_WEEK.match(header): findall_result = RE_DATE_FROM_VAC_HEADER_WEEK.findall(header)[0] @@ -160,7 +158,7 @@ def __setitem__(self, key, newvalue): self.vac_reference_day = newvalue if key.lower() in ["fully vaccinated","booster dose since"]: self.vac_reference_date = newvalue - if key.lower() not in ref_list: + if key.lower() not in ref_list: raise ValueError( f"Bad reference date type request '{key}'; " + \ "need one of: " + " ,".join(ref_list) @@ -220,7 +218,6 @@ def skip_overheader(header): # include "TESTING: [LAST|PREVIOUS] WEEK (October 24-30, Test Volume October 20-26)" # include "VIRAL (RT-PCR) LAB TESTING: [LAST|PREVIOUS] WEEK (August 24-30, ..." # include "HOSPITAL UTILIZATION: LAST WEEK (January 2-8)" - return not (isinstance(header, str) and \ (((header.startswith("TESTING:") or \ header.startswith("VIRAL (RT-PCR) LAB TESTING:") or \ @@ -229,18 +226,16 @@ def skip_overheader(header): # exclude "TESTING: DEMOGRAPHIC DATA" \ # exclude "HOSPITAL UTILIZATION: CHANGE FROM PREVIOUS WEEK" \ # exclude "HOSPITAL UTILIZATION: DEMOGRAPHIC DATA" \ - # exclude "COVID-19 VACCINATION DATA: DEMOGRAPHIC DATA % CHANGE FROM PREVIOUS WEEK" \ - # exclude "COVID-19 VACCINATION DATA: DEMOGRAPHIC DATA CUMULATIVE" + # exclude "COVID-19 VACCINATION DATA: DEMOGRAPHIC DATA %"\ + # exclude "COVID-19 VACCINATION DATA: DEMOGRAPHIC DATA CUMULATIVE" header.find("WEEK (") > 0) or \ # include "COVID-19 VACCINATION DATA: CUMULATIVE (January 25)" # include "COVID-19 VACCINATION DATA: DEMOGRAPHIC DATA LAST WEEK" - (header.startswith("COVID-19 VACCINATION DATA: CUMULATIVE") or + (header.startswith("COVID-19 VACCINATION DATA: CUMULATIVE") or header.startswith("COVID-19 VACCINATION DATA: LAST WEEK") \ ))) - - def _parse_times_for_sheet(self, sheet): """Record reference dates for this sheet.""" # grab reference dates from overheaders diff --git a/dsew_community_profile/tests/test_pull.py b/dsew_community_profile/tests/test_pull.py index 60f0fa5dd..88fdb439e 100644 --- a/dsew_community_profile/tests/test_pull.py +++ b/dsew_community_profile/tests/test_pull.py @@ -16,35 +16,37 @@ class TestPull: def test_DatasetTimes(self): examples = [ - example(DatasetTimes("xyzzy", date(2021, 10, 30), date(2021, 10, 20), date(2021, 10, 22)), - DatasetTimes("xyzzy", date(2021, 10, 30), date(2021, 10, 20), date(2021, 10, 22))), + example(DatasetTimes("xyzzy", date(2021, 10, 30), date(2021, 10, 20), date(2021, 10, 22), date(2021, 10, 23), date(2021, 10, 24)), + DatasetTimes("xyzzy", date(2021, 10, 30), date(2021, 10, 20), date(2021, 10, 22), date(2021, 10, 23), date(2021, 10, 24))), ] for ex in examples: assert ex.given == ex.expected, "Equality" - dt = DatasetTimes("xyzzy", date(2021, 10, 30), date(2021, 10, 20), date(2021, 10, 22)) + dt = DatasetTimes("xyzzy", date(2021, 10, 30), date(2021, 10, 20), date(2021, 10, 22), date(2021, 10, 23), date(2021, 10, 24)) assert dt["positivity"] == date(2021, 10, 30), "positivity" assert dt["total"] == date(2021, 10, 20), "total" assert dt["confirmed covid-19 admissions"] == date(2021, 10, 22), "confirmed covid-19 admissions" + assert dt["doses administered"] == date(2021, 10, 24), "doses administered" + assert dt["fully vaccinated"] == date(2021, 10, 23), "fully vaccinated" with pytest.raises(ValueError): dt["xyzzy"] def test_DatasetTimes_from_header(self): examples = [ example("TESTING: LAST WEEK (October 24-30, Test Volume October 20-26)", - DatasetTimes("last", date(2021, 10, 30), date(2021, 10, 26), None)), + DatasetTimes("last", date(2021, 10, 30), date(2021, 10, 26), None, None, None)), example("TESTING: PREVIOUS WEEK (October 24-30, Test Volume October 20-26)", - DatasetTimes("previous", date(2021, 10, 30), date(2021, 10, 26), None)), + DatasetTimes("previous", date(2021, 10, 30), date(2021, 10, 26), None, None, None)), example("TESTING: LAST WEEK (October 24-November 30, Test Volume October 20-26)", - DatasetTimes("last", date(2021, 11, 30), date(2021, 10, 26), None)), + DatasetTimes("last", date(2021, 11, 30), date(2021, 10, 26), None, None, None)), example("VIRAL (RT-PCR) LAB TESTING: LAST WEEK (June 7-13, Test Volume June 3-9 )", - DatasetTimes("last", date(2021, 6, 13), date(2021, 6, 9), None)), + DatasetTimes("last", date(2021, 6, 13), date(2021, 6, 9), None, None, None)), example("VIRAL (RT-PCR) LAB TESTING: LAST WEEK (March 7-13)", - DatasetTimes("last", date(2021, 3, 13), date(2021, 3, 13), None)), + DatasetTimes("last", date(2021, 3, 13), date(2021, 3, 13), None, None, None)), example("HOSPITAL UTILIZATION: LAST WEEK (June 2-8)", - DatasetTimes("last", None, None, date(2021, 6, 8))), + DatasetTimes("last", None, None, date(2021, 6, 8), None, None)), example("HOSPITAL UTILIZATION: LAST WEEK (June 28-July 8)", - DatasetTimes("last", None, None, date(2021, 7, 8))) + DatasetTimes("last", None, None, date(2021, 7, 8), None, None)) ] for ex in examples: assert DatasetTimes.from_header(ex.given, date(2021, 12, 31)) == ex.expected, ex.given @@ -52,7 +54,7 @@ def test_DatasetTimes_from_header(self): # test year boundary examples = [ example("TESTING: LAST WEEK (October 24-30, Test Volume October 20-26)", - DatasetTimes("last", date(2020, 10, 30), date(2020, 10, 26), None)), + DatasetTimes("last", date(2020, 10, 30), date(2020, 10, 26), None, None, None)), ] for ex in examples: assert DatasetTimes.from_header(ex.given, date(2021, 1, 1)) == ex.expected, ex.given From 44f2101216838b8a1632d4b14fb7aa76bee515f1 Mon Sep 17 00:00:00 2001 From: Ananya-Joshi Date: Wed, 2 Feb 2022 21:53:47 -0500 Subject: [PATCH 08/33] removed comma in the json file --- dsew_community_profile/params.json.template | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dsew_community_profile/params.json.template b/dsew_community_profile/params.json.template index 9d3b5bd4c..6d8d8f113 100644 --- a/dsew_community_profile/params.json.template +++ b/dsew_community_profile/params.json.template @@ -15,7 +15,7 @@ "doses administered", "booster doses administered", "fully vaccinated", - "booster dose since", + "booster dose since" ] }, "validation": { From 53482f35e028fb0ea243abcf5d64a8d19be4bfed Mon Sep 17 00:00:00 2001 From: Ananya-Joshi Date: Thu, 3 Feb 2022 08:38:46 -0500 Subject: [PATCH 09/33] removed print statements --- dsew_community_profile/delphi_dsew_community_profile/pull.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/dsew_community_profile/delphi_dsew_community_profile/pull.py b/dsew_community_profile/delphi_dsew_community_profile/pull.py index ea10efc3c..0b300de92 100644 --- a/dsew_community_profile/delphi_dsew_community_profile/pull.py +++ b/dsew_community_profile/delphi_dsew_community_profile/pull.py @@ -263,11 +263,6 @@ def _parse_times_for_sheet(self, sheet): self.times[dt.column][sig] = dt[sig] else: self.times[dt.column] = dt - print(self.times) - print("these are self.times") - for key, value in self.times.items(): - print(key, value) - print("done") assert len(self.times) == 3, \ f"No times extracted from overheaders:\n{NEWLINE.join(str(s) for s in overheaders)}" From 7c0564bd843a62e6d09d146ee7ffa2c783dc5f57 Mon Sep 17 00:00:00 2001 From: Ananya-Joshi Date: Thu, 3 Feb 2022 08:40:07 -0500 Subject: [PATCH 10/33] remove print --- dsew_community_profile/delphi_dsew_community_profile/pull.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/dsew_community_profile/delphi_dsew_community_profile/pull.py b/dsew_community_profile/delphi_dsew_community_profile/pull.py index 0b300de92..53c224a12 100644 --- a/dsew_community_profile/delphi_dsew_community_profile/pull.py +++ b/dsew_community_profile/delphi_dsew_community_profile/pull.py @@ -97,7 +97,6 @@ def as_date(sub_result): vac_reference_day = None elif RE_DATE_FROM_HOSP_HEADER.match(header): findall_result = RE_DATE_FROM_HOSP_HEADER.findall(header)[0] - print(findall_result) column = findall_result[0].lower() hosp_reference_date = as_date(findall_result[1:5]) total_reference_date = None @@ -122,9 +121,6 @@ def as_date(sub_result): vac_reference_date = None else: raise ValueError(f"Couldn't find reference date in header '{header}'") - - print("ret vals: ", column, positivity_reference_date, - total_reference_date, hosp_reference_date, vac_reference_day, vac_reference_date) return DatasetTimes(column, positivity_reference_date, total_reference_date, hosp_reference_date, vac_reference_day, vac_reference_date) def __getitem__(self, key): From 6a935b4c46d4dd6977b88f4880a3115e679f15fe Mon Sep 17 00:00:00 2001 From: Ananya-Joshi Date: Thu, 3 Feb 2022 16:37:55 -0500 Subject: [PATCH 11/33] removed cumulative from 7 day --- .../delphi_dsew_community_profile/constants.py | 15 ++++++++++----- .../delphi_dsew_community_profile/pull.py | 3 ++- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/dsew_community_profile/delphi_dsew_community_profile/constants.py b/dsew_community_profile/delphi_dsew_community_profile/constants.py index 1c3e73b91..eca4d8bd6 100644 --- a/dsew_community_profile/delphi_dsew_community_profile/constants.py +++ b/dsew_community_profile/delphi_dsew_community_profile/constants.py @@ -66,23 +66,28 @@ class Transform: }, "fully vaccinated": { "is_rate" : False, - "api_name": "full_vaccinated_7dav" + "api_name": "total_full_vaccinated", + "make_prop": False }, "booster dose since": { "is_rate" : False, - "api_name": "booster_doses_7dav" + "api_name": "total_booster_doses", + "make_prop": False }, "booster doses administered": { "is_rate" : False, - "api_name": "total_booster_7dav" + "api_name": "booster_doses_admin_7dav", + "make_prop": False }, "doses administered": { "is_rate" : False, - "api_name": "total_doses_7dav" + "api_name": "doses_admin_7dav", + "make_prop": False } } -COUNTS_7D_SIGNALS = {key for key, value in SIGNALS.items() if not value["is_rate"]} +COUNTS_7D_SIGNALS = {key for key, value in SIGNALS.items() \ + if not((value["is_rate"]) or ("total" in value["api_name"]))} def make_signal_name(key, is_prop=False): """Convert a signal key to the corresponding signal name for the API. diff --git a/dsew_community_profile/delphi_dsew_community_profile/pull.py b/dsew_community_profile/delphi_dsew_community_profile/pull.py index 53c224a12..db0bd06bc 100644 --- a/dsew_community_profile/delphi_dsew_community_profile/pull.py +++ b/dsew_community_profile/delphi_dsew_community_profile/pull.py @@ -380,7 +380,8 @@ def select_fn(h): for si in sig_select ]) for sig in COUNTS_7D_SIGNALS: - self.dfs[(sheet.level, sig, NOT_PROP)]["val"] /= 7 # 7-day total -> 7-day average + if (sheet.level, sig, NOT_PROP) in self.dfs.keys(): + self.dfs[(sheet.level, sig, NOT_PROP)]["val"] /= 7 # 7-day total -> 7-day average def as_cached_filename(params, config): """Formulate a filename to uniquely identify this report in the input cache.""" From 95bb2524b54d8cf3b8e4524bcd9bd08a8a2b7d40 Mon Sep 17 00:00:00 2001 From: Ananya-Joshi Date: Sat, 5 Feb 2022 22:05:44 -0500 Subject: [PATCH 12/33] removed unused line --- dsew_community_profile/delphi_dsew_community_profile/pull.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/dsew_community_profile/delphi_dsew_community_profile/pull.py b/dsew_community_profile/delphi_dsew_community_profile/pull.py index db0bd06bc..e94ffe673 100644 --- a/dsew_community_profile/delphi_dsew_community_profile/pull.py +++ b/dsew_community_profile/delphi_dsew_community_profile/pull.py @@ -37,11 +37,6 @@ rf'COVID-19 VACCINATION DATA: (.*) WEEK \({DATE_RANGE_EXP}\)' ) - -RE_BAD_DATE_FROM_VAC_HEADER_WEEK= re.compile( - r'COVID-19 VACCINATION DATA: DEMOGRAPHIC DATA LAST WEEK' -) - # example: 'COVID-19 VACCINATION DATA: CUMULATIVE (January 11)' RE_DATE_FROM_VAC_HEADER_DAY= re.compile( rf'COVID-19 VACCINATION DATA: CUMULATIVE (.*)\({DATE_EXP}\)' From 731ed9f3a95dd34e57a022db81ca50d56df6f5ae Mon Sep 17 00:00:00 2001 From: Ananya-Joshi Date: Sat, 5 Feb 2022 22:12:29 -0500 Subject: [PATCH 13/33] added cumulative flag to remove some signals from COUNTS_7D_SIGNALS --- .../constants.py | 23 ++++++++++++------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/dsew_community_profile/delphi_dsew_community_profile/constants.py b/dsew_community_profile/delphi_dsew_community_profile/constants.py index eca4d8bd6..2d18823eb 100644 --- a/dsew_community_profile/delphi_dsew_community_profile/constants.py +++ b/dsew_community_profile/delphi_dsew_community_profile/constants.py @@ -51,43 +51,50 @@ class Transform: "total": { "is_rate" : False, "api_name": "naats_total_7dav", - "make_prop": False + "make_prop": False, + "cumulative" : False }, "positivity": { "is_rate" : True, "api_name": "naats_positivity_7dav", - "make_prop": False + "make_prop": False, + "cumulative" : False }, "confirmed covid-19 admissions": { "is_rate" : False, "api_name": "confirmed_admissions_covid_1d_7dav", "make_prop": True, - "api_prop_name": "confirmed_admissions_covid_1d_prop_7dav" + "api_prop_name": "confirmed_admissions_covid_1d_prop_7dav", + "cumulative" : False }, "fully vaccinated": { "is_rate" : False, "api_name": "total_full_vaccinated", - "make_prop": False + "make_prop": False, + "cumulative" : True }, "booster dose since": { "is_rate" : False, "api_name": "total_booster_doses", - "make_prop": False + "make_prop": False, + "cumulative" : True }, "booster doses administered": { "is_rate" : False, "api_name": "booster_doses_admin_7dav", - "make_prop": False + "make_prop": False, + "cumulative" : False }, "doses administered": { "is_rate" : False, "api_name": "doses_admin_7dav", - "make_prop": False + "make_prop": False, + "cumulative" : False } } COUNTS_7D_SIGNALS = {key for key, value in SIGNALS.items() \ - if not((value["is_rate"]) or ("total" in value["api_name"]))} + if not((value["is_rate"]) or (value["cumulative"]))} def make_signal_name(key, is_prop=False): """Convert a signal key to the corresponding signal name for the API. From 94522c2f72344aca2ebcf0236d979ad2cd9d66a3 Mon Sep 17 00:00:00 2001 From: Ananya-Joshi Date: Sat, 5 Feb 2022 22:14:25 -0500 Subject: [PATCH 14/33] Change the header specification Co-authored-by: nmdefries <42820733+nmdefries@users.noreply.github.com> --- dsew_community_profile/delphi_dsew_community_profile/pull.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/dsew_community_profile/delphi_dsew_community_profile/pull.py b/dsew_community_profile/delphi_dsew_community_profile/pull.py index e94ffe673..247f9a963 100644 --- a/dsew_community_profile/delphi_dsew_community_profile/pull.py +++ b/dsew_community_profile/delphi_dsew_community_profile/pull.py @@ -218,11 +218,9 @@ def skip_overheader(header): # exclude "TESTING: DEMOGRAPHIC DATA" \ # exclude "HOSPITAL UTILIZATION: CHANGE FROM PREVIOUS WEEK" \ # exclude "HOSPITAL UTILIZATION: DEMOGRAPHIC DATA" \ - # exclude "COVID-19 VACCINATION DATA: DEMOGRAPHIC DATA %"\ - # exclude "COVID-19 VACCINATION DATA: DEMOGRAPHIC DATA CUMULATIVE" header.find("WEEK (") > 0) or \ # include "COVID-19 VACCINATION DATA: CUMULATIVE (January 25)" - # include "COVID-19 VACCINATION DATA: DEMOGRAPHIC DATA LAST WEEK" + # include "COVID-19 VACCINATION DATA: LAST WEEK (January 25-31)" (header.startswith("COVID-19 VACCINATION DATA: CUMULATIVE") or header.startswith("COVID-19 VACCINATION DATA: LAST WEEK") \ ))) From 3eb0381077528d8523c5f243d1c531dd9422b135 Mon Sep 17 00:00:00 2001 From: Ananya-Joshi Date: Sat, 5 Feb 2022 22:29:43 -0500 Subject: [PATCH 15/33] changed if statement to assert in COUNTS_7D_SIGNALS creation --- .../delphi_dsew_community_profile/pull.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/dsew_community_profile/delphi_dsew_community_profile/pull.py b/dsew_community_profile/delphi_dsew_community_profile/pull.py index 247f9a963..313cd7c36 100644 --- a/dsew_community_profile/delphi_dsew_community_profile/pull.py +++ b/dsew_community_profile/delphi_dsew_community_profile/pull.py @@ -38,7 +38,7 @@ ) # example: 'COVID-19 VACCINATION DATA: CUMULATIVE (January 11)' -RE_DATE_FROM_VAC_HEADER_DAY= re.compile( +RE_DATE_FROM_VAC_HEADER_CUMULATIVE= re.compile( rf'COVID-19 VACCINATION DATA: CUMULATIVE (.*)\({DATE_EXP}\)' ) @@ -106,8 +106,8 @@ def as_date(sub_result): positivity_reference_date = None hosp_reference_date = None vac_reference_day = None - elif RE_DATE_FROM_VAC_HEADER_DAY.match(header): - findall_result = RE_DATE_FROM_VAC_HEADER_DAY.findall(header)[0] + elif RE_DATE_FROM_VAC_HEADER_CUMULATIVE.match(header): + findall_result = RE_DATE_FROM_VAC_HEADER_CUMULATIVE.findall(header)[0] column = findall_result[0].lower() vac_reference_day = as_day(findall_result[1:]) total_reference_date = None @@ -338,7 +338,7 @@ def select_fn(h): # Booster data not available before November 2021. if self.publish_date < datetime.date(2021, 11, 1) \ and (sig in ["booster dose since", "booster doses administered"]) : - self.dfs[(sheet.level, sig)] = pd.DataFrame( + self.dfs[(sheet.level, sig, NOT_PROP)] = pd.DataFrame( columns = ["geo_id", "timestamp", "val", \ "se", "sample_size", "publish_date"] ) @@ -348,7 +348,7 @@ def select_fn(h): if ((sheet.level != "hhs" and sheet.level != "state") \ and (sig in ["doses administered", \ "booster doses administered", "booster dose since"])): - self.dfs[(sheet.level, sig)] = pd.DataFrame( + self.dfs[(sheet.level, sig, NOT_PROP)] = pd.DataFrame( columns = ["geo_id", "timestamp", "val", \ "se", "sample_size", "publish_date"] ) @@ -373,8 +373,8 @@ def select_fn(h): for si in sig_select ]) for sig in COUNTS_7D_SIGNALS: - if (sheet.level, sig, NOT_PROP) in self.dfs.keys(): - self.dfs[(sheet.level, sig, NOT_PROP)]["val"] /= 7 # 7-day total -> 7-day average + assert((sheet.level, sig, NOT_PROP) in self.dfs.keys()) + self.dfs[(sheet.level, sig, NOT_PROP)]["val"] /= 7 # 7-day total -> 7-day average def as_cached_filename(params, config): """Formulate a filename to uniquely identify this report in the input cache.""" From 8a1e3577dddb20af77c5e2f1213b0375f9e729d4 Mon Sep 17 00:00:00 2001 From: Ananya-Joshi Date: Sat, 5 Feb 2022 22:33:03 -0500 Subject: [PATCH 16/33] changing as_day to be part of as_date using Nat's backward compatibility suggestion --- .../delphi_dsew_community_profile/pull.py | 23 +++++++++---------- 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/dsew_community_profile/delphi_dsew_community_profile/pull.py b/dsew_community_profile/delphi_dsew_community_profile/pull.py index 313cd7c36..641098320 100644 --- a/dsew_community_profile/delphi_dsew_community_profile/pull.py +++ b/dsew_community_profile/delphi_dsew_community_profile/pull.py @@ -60,17 +60,16 @@ class DatasetTimes: @staticmethod def from_header(header, publish_date): """Convert reference dates in overheader to DatasetTimes.""" - def as_day(sub_result): - month = sub_result[0] - day = sub_result[1] - year = publish_date.year - return datetime.datetime.strptime(f"{year}-{month}-{day}", "%Y-%B-%d").date() - - def as_date(sub_result): - month = sub_result[2] if sub_result[2] else sub_result[0] - assert month, f"Bad month in header: {header}\nsub_result: {sub_result}" - month_numeric = datetime.datetime.strptime(month, "%B").month - day = sub_result[3] + def as_date(sub_result, is_single_date): + if is_single_date: + month = sub_result[0] + day = sub_result[1] + month_numeric = datetime.datetime.strptime(month, "%B").month + else: + month = sub_result[2] if sub_result[2] else sub_result[0] + assert month, f"Bad month in header: {header}\nsub_result: {sub_result}" + month_numeric = datetime.datetime.strptime(month, "%B").month + day = sub_result[3] year = publish_date.year # year boundary if month_numeric > publish_date.month: @@ -109,7 +108,7 @@ def as_date(sub_result): elif RE_DATE_FROM_VAC_HEADER_CUMULATIVE.match(header): findall_result = RE_DATE_FROM_VAC_HEADER_CUMULATIVE.findall(header)[0] column = findall_result[0].lower() - vac_reference_day = as_day(findall_result[1:]) + vac_reference_day = as_date(findall_result[1:], True) total_reference_date = None positivity_reference_date = None hosp_reference_date = None From 0f93ea7728eebcb2e6d77a5a82ae1d55c8301333 Mon Sep 17 00:00:00 2001 From: Ananya-Joshi Date: Sat, 5 Feb 2022 22:41:45 -0500 Subject: [PATCH 17/33] lint --- .../delphi_dsew_community_profile/pull.py | 41 ++++++++----------- 1 file changed, 16 insertions(+), 25 deletions(-) diff --git a/dsew_community_profile/delphi_dsew_community_profile/pull.py b/dsew_community_profile/delphi_dsew_community_profile/pull.py index 641098320..b94e31f08 100644 --- a/dsew_community_profile/delphi_dsew_community_profile/pull.py +++ b/dsew_community_profile/delphi_dsew_community_profile/pull.py @@ -55,11 +55,16 @@ class DatasetTimes: total_reference_date: datetime.date hosp_reference_date: datetime.date vac_reference_date: datetime.date - vac_reference_day: datetime.date + cumulative_vac_reference_date: datetime.date @staticmethod def from_header(header, publish_date): """Convert reference dates in overheader to DatasetTimes.""" + positivity_reference_date = None + total_reference_date = None + positivity_reference_date = None + hosp_reference_date = None + cumulative_vac_reference_date= None def as_date(sub_result, is_single_date): if is_single_date: month = sub_result[0] @@ -79,44 +84,30 @@ def as_date(sub_result, is_single_date): if RE_DATE_FROM_TEST_HEADER.match(header): findall_result = RE_DATE_FROM_TEST_HEADER.findall(header)[0] column = findall_result[0].lower() - positivity_reference_date = as_date(findall_result[1:5]) + positivity_reference_date = as_date(findall_result[1:5], False) if findall_result[6]: # Reports published starting 2021-03-17 specify different reference # dates for positivity and total test volume - total_reference_date = as_date(findall_result[6:10]) + total_reference_date = as_date(findall_result[6:10], False) else: total_reference_date = positivity_reference_date - hosp_reference_date = None - vac_reference_date = None - vac_reference_day = None elif RE_DATE_FROM_HOSP_HEADER.match(header): findall_result = RE_DATE_FROM_HOSP_HEADER.findall(header)[0] column = findall_result[0].lower() - hosp_reference_date = as_date(findall_result[1:5]) - total_reference_date = None - positivity_reference_date = None - vac_reference_date = None - vac_reference_day = None + hosp_reference_date = as_date(findall_result[1:5], False) elif RE_DATE_FROM_VAC_HEADER_WEEK.match(header): findall_result = RE_DATE_FROM_VAC_HEADER_WEEK.findall(header)[0] column = findall_result[0].lower() - vac_reference_date = as_date(findall_result[1:5]) - total_reference_date = None - positivity_reference_date = None - hosp_reference_date = None - vac_reference_day = None + vac_reference_date = as_date(findall_result[1:5], False) elif RE_DATE_FROM_VAC_HEADER_CUMULATIVE.match(header): findall_result = RE_DATE_FROM_VAC_HEADER_CUMULATIVE.findall(header)[0] column = findall_result[0].lower() - vac_reference_day = as_date(findall_result[1:], True) - total_reference_date = None - positivity_reference_date = None - hosp_reference_date = None - vac_reference_date = None + cumulative_vac_reference_date = as_date(findall_result[1:], True) else: raise ValueError(f"Couldn't find reference date in header '{header}'") return DatasetTimes(column, positivity_reference_date, - total_reference_date, hosp_reference_date, vac_reference_day, vac_reference_date) + total_reference_date, hosp_reference_date, + cumulative_vac_reference_date, vac_reference_date) def __getitem__(self, key): """Use DatasetTimes like a dictionary.""" ref_list = list(SIGNALS.keys()) @@ -127,7 +118,7 @@ def __getitem__(self, key): if key.lower()=="confirmed covid-19 admissions": return self.hosp_reference_date if key.lower() in ["doses administered","booster doses administered"]: - return self.vac_reference_day + return self.cumulative_vac_reference_date if key.lower() in ["fully vaccinated","booster dose since"]: return self.vac_reference_date raise ValueError( @@ -146,7 +137,7 @@ def __setitem__(self, key, newvalue): if key.lower()=="confirmed covid-19 admissions": self.hosp_reference_date = newvalue if key.lower() in ["doses administered","booster doses administered"]: - self.vac_reference_day = newvalue + self.cumulative_vac_reference_date = newvalue if key.lower() in ["fully vaccinated","booster dose since"]: self.vac_reference_date = newvalue if key.lower() not in ref_list: @@ -372,7 +363,7 @@ def select_fn(h): for si in sig_select ]) for sig in COUNTS_7D_SIGNALS: - assert((sheet.level, sig, NOT_PROP) in self.dfs.keys()) + assert (sheet.level, sig, NOT_PROP) in self.dfs.keys() self.dfs[(sheet.level, sig, NOT_PROP)]["val"] /= 7 # 7-day total -> 7-day average def as_cached_filename(params, config): From d2f28b61b6c11a5d45493ad2e1c38443a84871e1 Mon Sep 17 00:00:00 2001 From: Ananya-Joshi Date: Sat, 5 Feb 2022 22:48:48 -0500 Subject: [PATCH 18/33] changes to skip overheader test to pass --- .../delphi_dsew_community_profile/pull.py | 2 +- dsew_community_profile/tests/test_pull.py | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/dsew_community_profile/delphi_dsew_community_profile/pull.py b/dsew_community_profile/delphi_dsew_community_profile/pull.py index b94e31f08..50c836a2e 100644 --- a/dsew_community_profile/delphi_dsew_community_profile/pull.py +++ b/dsew_community_profile/delphi_dsew_community_profile/pull.py @@ -62,8 +62,8 @@ def from_header(header, publish_date): """Convert reference dates in overheader to DatasetTimes.""" positivity_reference_date = None total_reference_date = None - positivity_reference_date = None hosp_reference_date = None + vac_reference_date = None cumulative_vac_reference_date= None def as_date(sub_result, is_single_date): if is_single_date: diff --git a/dsew_community_profile/tests/test_pull.py b/dsew_community_profile/tests/test_pull.py index d242706ad..1af270709 100644 --- a/dsew_community_profile/tests/test_pull.py +++ b/dsew_community_profile/tests/test_pull.py @@ -80,6 +80,12 @@ def test_Dataset_skip_overheader(self): example("HOSPITAL UTILIZATION: CHANGE FROM PREVIOUS WEEK", True), example("HOSPITAL UTILIZATION: DEMOGRAPHIC DATA", + True), + example("COVID-19 VACCINATION DATA: CUMULATIVE (January 25)", + False), + example("COVID-19 VACCINATION DATA: LAST WEEK (January 25-31)", + False), + example("COVID-19 VACCINATION DATA: DEMOGRAPHIC DATA", True) ] for ex in examples: From 733bdd6195ee6432260a5018aab35a505c49da50 Mon Sep 17 00:00:00 2001 From: Ananya-Joshi Date: Mon, 7 Feb 2022 09:06:12 -0500 Subject: [PATCH 19/33] added more tests for the new vaccination overheaders --- dsew_community_profile/tests/test_pull.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/dsew_community_profile/tests/test_pull.py b/dsew_community_profile/tests/test_pull.py index 1af270709..7b6c8acba 100644 --- a/dsew_community_profile/tests/test_pull.py +++ b/dsew_community_profile/tests/test_pull.py @@ -46,7 +46,11 @@ def test_DatasetTimes_from_header(self): example("HOSPITAL UTILIZATION: LAST WEEK (June 2-8)", DatasetTimes("last", None, None, date(2021, 6, 8), None, None)), example("HOSPITAL UTILIZATION: LAST WEEK (June 28-July 8)", - DatasetTimes("last", None, None, date(2021, 7, 8), None, None)) + DatasetTimes("last", None, None, date(2021, 7, 8), None, None)), + example("COVID-19 VACCINATION DATA: CUMULATIVE (January 25)", + DatasetTimes("", None, None, None, date(2021, 1, 25), None)), + example("COVID-19 VACCINATION DATA: LAST WEEK (January 25-31)", + DatasetTimes("last", None, None, None, None, date(2021, 1, 25))) ] for ex in examples: assert DatasetTimes.from_header(ex.given, date(2021, 12, 31)) == ex.expected, ex.given From 42a2b805b5129ae6197eb6dc97ba9b3ab14c6461 Mon Sep 17 00:00:00 2001 From: Ananya-Joshi Date: Mon, 7 Feb 2022 09:25:00 -0500 Subject: [PATCH 20/33] added smoothened signals to the json template and to the ansible template params --- ansible/templates/dsew_community_profile-params-prod.json.j2 | 4 +++- dsew_community_profile/params.json.template | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/ansible/templates/dsew_community_profile-params-prod.json.j2 b/ansible/templates/dsew_community_profile-params-prod.json.j2 index ec3e254c3..13b8476a3 100644 --- a/ansible/templates/dsew_community_profile-params-prod.json.j2 +++ b/ansible/templates/dsew_community_profile-params-prod.json.j2 @@ -28,7 +28,9 @@ "naats_total_7dav", "naats_positivity_7dav", "confirmed_admissions_covid_1d_prop_7dav", - "confirmed_admissions_covid_1d_7dav" + "confirmed_admissions_covid_1d_7dav", + "doses_admin_7dav", + "booster_doses_admin_7dav" ] } } diff --git a/dsew_community_profile/params.json.template b/dsew_community_profile/params.json.template index 6d8d8f113..460024b53 100644 --- a/dsew_community_profile/params.json.template +++ b/dsew_community_profile/params.json.template @@ -38,7 +38,9 @@ "naats_total_7dav", "naats_positivity_7dav", "confirmed_admissions_covid_1d_prop_7dav", - "confirmed_admissions_covid_1d_7dav" + "confirmed_admissions_covid_1d_7dav", + "doses_admin_7dav", + "booster_doses_admin_7dav" ] } } From bcf6f0b458f74f5c3cb4047421de009b78bec1ea Mon Sep 17 00:00:00 2001 From: Ananya-Joshi Date: Mon, 7 Feb 2022 10:04:45 -0500 Subject: [PATCH 21/33] changed api name for cumulative signals --- .../delphi_dsew_community_profile/constants.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dsew_community_profile/delphi_dsew_community_profile/constants.py b/dsew_community_profile/delphi_dsew_community_profile/constants.py index 2d18823eb..00deffee6 100644 --- a/dsew_community_profile/delphi_dsew_community_profile/constants.py +++ b/dsew_community_profile/delphi_dsew_community_profile/constants.py @@ -69,13 +69,13 @@ class Transform: }, "fully vaccinated": { "is_rate" : False, - "api_name": "total_full_vaccinated", + "api_name": "people_full_vaccinated", "make_prop": False, "cumulative" : True }, "booster dose since": { "is_rate" : False, - "api_name": "total_booster_doses", + "api_name": "people_booster_doses", "make_prop": False, "cumulative" : True }, From 2ed2c1784998779a31e1313ca31ca0e5f6064ae2 Mon Sep 17 00:00:00 2001 From: Ananya-Joshi Date: Mon, 7 Feb 2022 11:07:05 -0500 Subject: [PATCH 22/33] Update dsew_community_profile/delphi_dsew_community_profile/pull.py Co-authored-by: nmdefries <42820733+nmdefries@users.noreply.github.com> --- dsew_community_profile/delphi_dsew_community_profile/pull.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/dsew_community_profile/delphi_dsew_community_profile/pull.py b/dsew_community_profile/delphi_dsew_community_profile/pull.py index 50c836a2e..d14586d5f 100644 --- a/dsew_community_profile/delphi_dsew_community_profile/pull.py +++ b/dsew_community_profile/delphi_dsew_community_profile/pull.py @@ -346,6 +346,9 @@ def select_fn(h): sig_select = [s for s in select if s[-1].find(sig) >= 0] + # Since "doses administered" is a substring of another desired header, + # "booster doses administered", we need to more strictly check if "doses administered" + # occurs at the beginning of a header to find the correct match. if sig == "doses administered": sig_select = [s for s in select if s[-1].startswith(sig)] assert len(sig_select) > 0, \ From f39b3158a0b9fe3c4d05adebe3000439f2b253c9 Mon Sep 17 00:00:00 2001 From: Ananya-Joshi Date: Mon, 7 Feb 2022 11:08:08 -0500 Subject: [PATCH 23/33] Update dsew_community_profile/delphi_dsew_community_profile/pull.py Co-authored-by: nmdefries <42820733+nmdefries@users.noreply.github.com> --- dsew_community_profile/delphi_dsew_community_profile/pull.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/dsew_community_profile/delphi_dsew_community_profile/pull.py b/dsew_community_profile/delphi_dsew_community_profile/pull.py index d14586d5f..319c22a1f 100644 --- a/dsew_community_profile/delphi_dsew_community_profile/pull.py +++ b/dsew_community_profile/delphi_dsew_community_profile/pull.py @@ -132,8 +132,6 @@ def __setitem__(self, key, newvalue): self.positivity_reference_date = newvalue if key.lower()=="total": self.total_reference_date = newvalue - if key.lower() in ref_list: - self.hosp_reference_date = newvalue if key.lower()=="confirmed covid-19 admissions": self.hosp_reference_date = newvalue if key.lower() in ["doses administered","booster doses administered"]: From fa853abdbc346933f0a28540995e6140b6790fd1 Mon Sep 17 00:00:00 2001 From: Ananya-Joshi Date: Wed, 9 Feb 2022 18:37:34 -0500 Subject: [PATCH 24/33] added new end date for vaccine signals --- .../delphi_dsew_community_profile/pull.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/dsew_community_profile/delphi_dsew_community_profile/pull.py b/dsew_community_profile/delphi_dsew_community_profile/pull.py index 319c22a1f..38ec49462 100644 --- a/dsew_community_profile/delphi_dsew_community_profile/pull.py +++ b/dsew_community_profile/delphi_dsew_community_profile/pull.py @@ -322,6 +322,15 @@ def select_fn(h): continue + # Vaccine data not available before May 2021. + if self.publish_date < datetime.date(2021, 5, 1) \ + and (sig in ["fully vaccinated", "doses administered"]) : + self.dfs[(sheet.level, sig, NOT_PROP)] = pd.DataFrame( + columns = ["geo_id", "timestamp", "val", \ + "se", "sample_size", "publish_date"] + ) + continue + # Booster data not available before November 2021. if self.publish_date < datetime.date(2021, 11, 1) \ From 108968af4552fb331bde426198485afd1f105c4a Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Mon, 14 Feb 2022 17:37:51 -0500 Subject: [PATCH 25/33] return empty df for doses before apr 29 --- .../delphi_dsew_community_profile/pull.py | 28 ++++++++----------- 1 file changed, 12 insertions(+), 16 deletions(-) diff --git a/dsew_community_profile/delphi_dsew_community_profile/pull.py b/dsew_community_profile/delphi_dsew_community_profile/pull.py index 319c22a1f..ab17123fb 100644 --- a/dsew_community_profile/delphi_dsew_community_profile/pull.py +++ b/dsew_community_profile/delphi_dsew_community_profile/pull.py @@ -311,35 +311,31 @@ def select_fn(h): ] for sig in SIGNALS: + empty_formatted_df = pd.DataFrame( + columns = ["geo_id", "timestamp", "val", \ + "se", "sample_size", "publish_date"] + ) # Hospital admissions not available at the county or CBSA level prior to Jan 8, 2021. if (sheet.level == "msa" or sheet.level == "county") \ and self.publish_date < datetime.date(2021, 1, 8) \ and sig == "confirmed covid-19 admissions": - self.dfs[(sheet.level, sig, NOT_PROP)] = pd.DataFrame( - columns = ["geo_id", "timestamp", "val", \ - "se", "sample_size", "publish_date"] - ) + self.dfs[(sheet.level, sig, NOT_PROP)] = empty_formatted_df continue - - - # Booster data not available before November 2021. if self.publish_date < datetime.date(2021, 11, 1) \ and (sig in ["booster dose since", "booster doses administered"]) : - self.dfs[(sheet.level, sig, NOT_PROP)] = pd.DataFrame( - columns = ["geo_id", "timestamp", "val", \ - "se", "sample_size", "publish_date"] - ) + self.dfs[(sheet.level, sig, NOT_PROP)] = empty_formatted_df continue - # Booster and weekly doses administered not available below the state level. if ((sheet.level != "hhs" and sheet.level != "state") \ and (sig in ["doses administered", \ "booster doses administered", "booster dose since"])): - self.dfs[(sheet.level, sig, NOT_PROP)] = pd.DataFrame( - columns = ["geo_id", "timestamp", "val", \ - "se", "sample_size", "publish_date"] - ) + self.dfs[(sheet.level, sig, NOT_PROP)] = empty_formatted_df + continue + # Weekly doses administered not available before Apr 29, 2021. + if self.publish_date <= datetime.date(2021, 4, 29) \ + and sig == "doses administered": + self.dfs[(sheet.level, sig, NOT_PROP)] = empty_formatted_df continue sig_select = [s for s in select if s[-1].find(sig) >= 0] From 66de89e022b94b03695b7a7333bc5606d12736b9 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Mon, 14 Feb 2022 18:28:25 -0500 Subject: [PATCH 26/33] add check for fully_vaccinated by date --- .../delphi_dsew_community_profile/pull.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/dsew_community_profile/delphi_dsew_community_profile/pull.py b/dsew_community_profile/delphi_dsew_community_profile/pull.py index ab17123fb..8f9721ea7 100644 --- a/dsew_community_profile/delphi_dsew_community_profile/pull.py +++ b/dsew_community_profile/delphi_dsew_community_profile/pull.py @@ -337,6 +337,17 @@ def select_fn(h): and sig == "doses administered": self.dfs[(sheet.level, sig, NOT_PROP)] = empty_formatted_df continue + # People fully vaccinated not available before Apr 11, 2021 at the CBSA level. + if (sheet.level == "msa" or sheet.level == "county") \ + and self.publish_date <= datetime.date(2021, 4, 11) \ + and sig == "fully vaccinated": + self.dfs[(sheet.level, sig, NOT_PROP)] = empty_formatted_df + continue + # People fully vaccinated not available before March 08, 2021 at any geo level. + if self.publish_date <= datetime.date(2021, 3, 8) \ + and sig == "fully vaccinated": + self.dfs[(sheet.level, sig, NOT_PROP)] = empty_formatted_df + continue sig_select = [s for s in select if s[-1].find(sig) >= 0] From a736155c5b53a33c525287c8e4900ae157628cb1 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Mon, 14 Feb 2022 18:39:30 -0500 Subject: [PATCH 27/33] in early jan 2021, no vax info available so decrement times in assert --- .../delphi_dsew_community_profile/pull.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/dsew_community_profile/delphi_dsew_community_profile/pull.py b/dsew_community_profile/delphi_dsew_community_profile/pull.py index 8f9721ea7..8866c8dc1 100644 --- a/dsew_community_profile/delphi_dsew_community_profile/pull.py +++ b/dsew_community_profile/delphi_dsew_community_profile/pull.py @@ -240,8 +240,14 @@ def _parse_times_for_sheet(self, sheet): self.times[dt.column][sig] = dt[sig] else: self.times[dt.column] = dt - assert len(self.times) == 3, \ - f"No times extracted from overheaders:\n{NEWLINE.join(str(s) for s in overheaders)}" + + if self.publish_date <= datetime.date(2021, 1, 11): + # No vaccination data available, so we only have hospitalization and testing overheaders + assert len(self.times) == 2, \ + f"No times extracted from overheaders:\n{NEWLINE.join(str(s) for s in overheaders)}" + else: + assert len(self.times) == 3, \ + f"No times extracted from overheaders:\n{NEWLINE.join(str(s) for s in overheaders)}" @staticmethod def retain_header(header): From a8eada627ecb864e8e1be508e1834dfb392e494f Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Mon, 14 Feb 2022 18:46:21 -0500 Subject: [PATCH 28/33] condense checks on signal availability --- .../delphi_dsew_community_profile/pull.py | 53 +++++++++---------- 1 file changed, 25 insertions(+), 28 deletions(-) diff --git a/dsew_community_profile/delphi_dsew_community_profile/pull.py b/dsew_community_profile/delphi_dsew_community_profile/pull.py index 8866c8dc1..fc5e154a9 100644 --- a/dsew_community_profile/delphi_dsew_community_profile/pull.py +++ b/dsew_community_profile/delphi_dsew_community_profile/pull.py @@ -257,7 +257,7 @@ def retain_header(header): # include "Total RT-PCR diagnostic tests - [last|previous] 7 days ..." # include "NAAT positivity rate - [last|previous] 7 days ..." # include "Viral (RT-PCR) lab test positivity rate - [last|previous] 7 days ..." - # include "Booster doses administerd - [last|previous] 7 days ..." + # include "Booster doses administered - [last|previous] 7 days ..." # include "Doses administered - [last|previous] 7 days ..." (header.startswith("Total NAATs") or header.startswith("NAAT positivity rate") or @@ -317,42 +317,39 @@ def select_fn(h): ] for sig in SIGNALS: - empty_formatted_df = pd.DataFrame( - columns = ["geo_id", "timestamp", "val", \ - "se", "sample_size", "publish_date"] - ) # Hospital admissions not available at the county or CBSA level prior to Jan 8, 2021. - if (sheet.level == "msa" or sheet.level == "county") \ + is_hosp_adm_before_jan8 = (sheet.level == "msa" or sheet.level == "county") \ and self.publish_date < datetime.date(2021, 1, 8) \ - and sig == "confirmed covid-19 admissions": - self.dfs[(sheet.level, sig, NOT_PROP)] = empty_formatted_df - continue + and sig == "confirmed covid-19 admissions" # Booster data not available before November 2021. - if self.publish_date < datetime.date(2021, 11, 1) \ - and (sig in ["booster dose since", "booster doses administered"]) : - self.dfs[(sheet.level, sig, NOT_PROP)] = empty_formatted_df - continue + is_booster_before_nov1 = self.publish_date < datetime.date(2021, 11, 1) \ + and (sig in ["booster dose since", "booster doses administered"]) # Booster and weekly doses administered not available below the state level. - if ((sheet.level != "hhs" and sheet.level != "state") \ + is_booster_below_state = ((sheet.level != "hhs" and sheet.level != "state") \ and (sig in ["doses administered", \ - "booster doses administered", "booster dose since"])): - self.dfs[(sheet.level, sig, NOT_PROP)] = empty_formatted_df - continue + "booster doses administered", "booster dose since"])) # Weekly doses administered not available before Apr 29, 2021. - if self.publish_date <= datetime.date(2021, 4, 29) \ - and sig == "doses administered": - self.dfs[(sheet.level, sig, NOT_PROP)] = empty_formatted_df - continue + is_dose_admin_apr29 = self.publish_date <= datetime.date(2021, 4, 29) \ + and sig == "doses administered" # People fully vaccinated not available before Apr 11, 2021 at the CBSA level. - if (sheet.level == "msa" or sheet.level == "county") \ + is_fully_vax_msa_before_apr11 = (sheet.level == "msa" or sheet.level == "county") \ and self.publish_date <= datetime.date(2021, 4, 11) \ - and sig == "fully vaccinated": - self.dfs[(sheet.level, sig, NOT_PROP)] = empty_formatted_df - continue + and sig == "fully vaccinated" # People fully vaccinated not available before March 08, 2021 at any geo level. - if self.publish_date <= datetime.date(2021, 3, 8) \ - and sig == "fully vaccinated": - self.dfs[(sheet.level, sig, NOT_PROP)] = empty_formatted_df + is_fully_vax_before_mar8 = self.publish_date <= datetime.date(2021, 3, 8) \ + and sig == "fully vaccinated" + + if any([is_hosp_adm_before_jan8, + is_booster_before_nov1, + is_booster_below_state, + is_dose_admin_apr29, + is_fully_vax_msa_before_apr11, + is_fully_vax_before_mar8 + ]): + self.dfs[(sheet.level, sig, NOT_PROP)] = pd.DataFrame( + columns = ["geo_id", "timestamp", "val", \ + "se", "sample_size", "publish_date"] + ) continue sig_select = [s for s in select if s[-1].find(sig) >= 0] From 760d2c2a40bbdc92247ab2c4e5b8ae6f458556aa Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Mon, 14 Feb 2022 18:47:36 -0500 Subject: [PATCH 29/33] comment --- dsew_community_profile/delphi_dsew_community_profile/pull.py | 1 + 1 file changed, 1 insertion(+) diff --git a/dsew_community_profile/delphi_dsew_community_profile/pull.py b/dsew_community_profile/delphi_dsew_community_profile/pull.py index fc5e154a9..e6221bbd4 100644 --- a/dsew_community_profile/delphi_dsew_community_profile/pull.py +++ b/dsew_community_profile/delphi_dsew_community_profile/pull.py @@ -317,6 +317,7 @@ def select_fn(h): ] for sig in SIGNALS: + ## Check if field is known to be missing # Hospital admissions not available at the county or CBSA level prior to Jan 8, 2021. is_hosp_adm_before_jan8 = (sheet.level == "msa" or sheet.level == "county") \ and self.publish_date < datetime.date(2021, 1, 8) \ From 8d9527e1897b5f6edea645435eb9f0ca7c6c80dd Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Tue, 15 Feb 2022 11:38:18 -0500 Subject: [PATCH 30/33] exclude different format of by-age fully vaccinated --- dsew_community_profile/delphi_dsew_community_profile/pull.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/dsew_community_profile/delphi_dsew_community_profile/pull.py b/dsew_community_profile/delphi_dsew_community_profile/pull.py index e6221bbd4..6ce698aba 100644 --- a/dsew_community_profile/delphi_dsew_community_profile/pull.py +++ b/dsew_community_profile/delphi_dsew_community_profile/pull.py @@ -290,6 +290,8 @@ def retain_header(header): # exclude "People who are fully vaccinated - ages 5-11" ... # exclude "People who have received a booster dose - ages 65+" ... header.find(" age") < 0, + # exclude "People who are fully vaccinated - 12-17" ... + header.find("-") < 0, ])) def _parse_sheet(self, sheet): """Extract data frame for this sheet.""" From 83cca2d8e43e69a89aadb23a5e0c92897b937751 Mon Sep 17 00:00:00 2001 From: Ananya-Joshi Date: Wed, 16 Feb 2022 13:20:09 -0500 Subject: [PATCH 31/33] added original vaccination signal (pre JJ) and lint --- .../delphi_dsew_community_profile/pull.py | 21 +++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/dsew_community_profile/delphi_dsew_community_profile/pull.py b/dsew_community_profile/delphi_dsew_community_profile/pull.py index 6ce698aba..e7ef014b9 100644 --- a/dsew_community_profile/delphi_dsew_community_profile/pull.py +++ b/dsew_community_profile/delphi_dsew_community_profile/pull.py @@ -280,7 +280,7 @@ def retain_header(header): header.find(" age") < 0, # exclude "Confirmed COVID-19 admissions per 100 inpatient beds - last 7 days" header.find(" beds") < 0, - ])) or all([ + ])) or (all([ # include "People who are fully vaccinated" # include "People who have received a booster dose since August 13, 2021" header.startswith("People who"), @@ -292,7 +292,13 @@ def retain_header(header): header.find(" age") < 0, # exclude "People who are fully vaccinated - 12-17" ... header.find("-") < 0, - ])) + + ]) or all([ + # include "People with full course administered" + header.startswith("People with full course"), + # exclude "People with full course administered as % of adult population" + header.find("%") < 0, + ]))) def _parse_sheet(self, sheet): """Extract data frame for this sheet.""" df = pd.read_excel( @@ -338,8 +344,8 @@ def select_fn(h): is_fully_vax_msa_before_apr11 = (sheet.level == "msa" or sheet.level == "county") \ and self.publish_date <= datetime.date(2021, 4, 11) \ and sig == "fully vaccinated" - # People fully vaccinated not available before March 08, 2021 at any geo level. - is_fully_vax_before_mar8 = self.publish_date <= datetime.date(2021, 3, 8) \ + # People fully vaccinated not available before Jan 15, 2021 at any geo level. + is_fully_vax_before_mar8 = self.publish_date <= datetime.date(2021, 1, 14) \ and sig == "fully vaccinated" if any([is_hosp_adm_before_jan8, @@ -356,7 +362,10 @@ def select_fn(h): continue sig_select = [s for s in select if s[-1].find(sig) >= 0] - + # The name of the cumulative vaccination was changed after 03/09/2021 + # when J&J vaccines were added. + if (sig == "fully vaccinated") and (len(sig_select)==0): + sig_select = [s for s in select if s[-1].find("people with full course") >= 0] # Since "doses administered" is a substring of another desired header, # "booster doses administered", we need to more strictly check if "doses administered" # occurs at the beginning of a header to find the correct match. @@ -404,7 +413,6 @@ def fetch_listing(params): ) for el in listing if el['filename'].endswith("xlsx") ] - if params['indicator']['reports'] == 'new': # drop files we already have in the input cache listing = [el for el in listing if not os.path.exists(el['cached_filename'])] @@ -485,6 +493,7 @@ def fetch_new_reports(params, logger=None): if len(latest_sig_df.index) > 0: latest_sig_df = latest_sig_df.reset_index(drop=True) + latest_sig_df.to_csv("problem.csv") assert all(latest_sig_df.groupby( ["timestamp", "geo_id"] ).size( From f126de7fb0915da66bf932732dda07ba2aebaeb8 Mon Sep 17 00:00:00 2001 From: Ananya-Joshi Date: Wed, 16 Feb 2022 21:35:47 -0500 Subject: [PATCH 32/33] clean-up and ready for review --- dsew_community_profile/delphi_dsew_community_profile/pull.py | 1 - 1 file changed, 1 deletion(-) diff --git a/dsew_community_profile/delphi_dsew_community_profile/pull.py b/dsew_community_profile/delphi_dsew_community_profile/pull.py index e7ef014b9..41488c4e4 100644 --- a/dsew_community_profile/delphi_dsew_community_profile/pull.py +++ b/dsew_community_profile/delphi_dsew_community_profile/pull.py @@ -493,7 +493,6 @@ def fetch_new_reports(params, logger=None): if len(latest_sig_df.index) > 0: latest_sig_df = latest_sig_df.reset_index(drop=True) - latest_sig_df.to_csv("problem.csv") assert all(latest_sig_df.groupby( ["timestamp", "geo_id"] ).size( From f2f778b76bdef8db1f9d42f424c6b333bf969e1c Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Thu, 17 Feb 2022 10:06:02 -0500 Subject: [PATCH 33/33] small comments + name change --- .../delphi_dsew_community_profile/pull.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/dsew_community_profile/delphi_dsew_community_profile/pull.py b/dsew_community_profile/delphi_dsew_community_profile/pull.py index 41488c4e4..6807f0c3c 100644 --- a/dsew_community_profile/delphi_dsew_community_profile/pull.py +++ b/dsew_community_profile/delphi_dsew_community_profile/pull.py @@ -330,22 +330,22 @@ def select_fn(h): is_hosp_adm_before_jan8 = (sheet.level == "msa" or sheet.level == "county") \ and self.publish_date < datetime.date(2021, 1, 8) \ and sig == "confirmed covid-19 admissions" - # Booster data not available before November 2021. + # Booster data not available before November 1 2021. is_booster_before_nov1 = self.publish_date < datetime.date(2021, 11, 1) \ and (sig in ["booster dose since", "booster doses administered"]) # Booster and weekly doses administered not available below the state level. is_booster_below_state = ((sheet.level != "hhs" and sheet.level != "state") \ and (sig in ["doses administered", \ "booster doses administered", "booster dose since"])) - # Weekly doses administered not available before Apr 29, 2021. + # Weekly doses administered not available on or before Apr 29, 2021. is_dose_admin_apr29 = self.publish_date <= datetime.date(2021, 4, 29) \ and sig == "doses administered" - # People fully vaccinated not available before Apr 11, 2021 at the CBSA level. + # People fully vaccinated not available on or before Apr 11, 2021 at the CBSA level. is_fully_vax_msa_before_apr11 = (sheet.level == "msa" or sheet.level == "county") \ and self.publish_date <= datetime.date(2021, 4, 11) \ and sig == "fully vaccinated" # People fully vaccinated not available before Jan 15, 2021 at any geo level. - is_fully_vax_before_mar8 = self.publish_date <= datetime.date(2021, 1, 14) \ + is_fully_vax_before_jan14 = self.publish_date <= datetime.date(2021, 1, 14) \ and sig == "fully vaccinated" if any([is_hosp_adm_before_jan8, @@ -353,7 +353,7 @@ def select_fn(h): is_booster_below_state, is_dose_admin_apr29, is_fully_vax_msa_before_apr11, - is_fully_vax_before_mar8 + is_fully_vax_before_jan14 ]): self.dfs[(sheet.level, sig, NOT_PROP)] = pd.DataFrame( columns = ["geo_id", "timestamp", "val", \