diff --git a/dsew_community_profile/delphi_dsew_community_profile/pull.py b/dsew_community_profile/delphi_dsew_community_profile/pull.py index 38ec49462..6ce698aba 100644 --- a/dsew_community_profile/delphi_dsew_community_profile/pull.py +++ b/dsew_community_profile/delphi_dsew_community_profile/pull.py @@ -240,8 +240,14 @@ def _parse_times_for_sheet(self, sheet): self.times[dt.column][sig] = dt[sig] else: self.times[dt.column] = dt - assert len(self.times) == 3, \ - f"No times extracted from overheaders:\n{NEWLINE.join(str(s) for s in overheaders)}" + + if self.publish_date <= datetime.date(2021, 1, 11): + # No vaccination data available, so we only have hospitalization and testing overheaders + assert len(self.times) == 2, \ + f"No times extracted from overheaders:\n{NEWLINE.join(str(s) for s in overheaders)}" + else: + assert len(self.times) == 3, \ + f"No times extracted from overheaders:\n{NEWLINE.join(str(s) for s in overheaders)}" @staticmethod def retain_header(header): @@ -251,7 +257,7 @@ def retain_header(header): # include "Total RT-PCR diagnostic tests - [last|previous] 7 days ..." # include "NAAT positivity rate - [last|previous] 7 days ..." # include "Viral (RT-PCR) lab test positivity rate - [last|previous] 7 days ..." - # include "Booster doses administerd - [last|previous] 7 days ..." + # include "Booster doses administered - [last|previous] 7 days ..." # include "Doses administered - [last|previous] 7 days ..." (header.startswith("Total NAATs") or header.startswith("NAAT positivity rate") or @@ -284,6 +290,8 @@ def retain_header(header): # exclude "People who are fully vaccinated - ages 5-11" ... # exclude "People who have received a booster dose - ages 65+" ... header.find(" age") < 0, + # exclude "People who are fully vaccinated - 12-17" ... + header.find("-") < 0, ])) def _parse_sheet(self, sheet): """Extract data frame for this sheet.""" @@ -311,40 +319,36 @@ def select_fn(h): ] for sig in SIGNALS: + ## Check if field is known to be missing # Hospital admissions not available at the county or CBSA level prior to Jan 8, 2021. - if (sheet.level == "msa" or sheet.level == "county") \ + is_hosp_adm_before_jan8 = (sheet.level == "msa" or sheet.level == "county") \ and self.publish_date < datetime.date(2021, 1, 8) \ - and sig == "confirmed covid-19 admissions": - self.dfs[(sheet.level, sig, NOT_PROP)] = pd.DataFrame( - columns = ["geo_id", "timestamp", "val", \ - "se", "sample_size", "publish_date"] - ) - continue - - - # Vaccine data not available before May 2021. - if self.publish_date < datetime.date(2021, 5, 1) \ - and (sig in ["fully vaccinated", "doses administered"]) : - self.dfs[(sheet.level, sig, NOT_PROP)] = pd.DataFrame( - columns = ["geo_id", "timestamp", "val", \ - "se", "sample_size", "publish_date"] - ) - continue - - + and sig == "confirmed covid-19 admissions" # Booster data not available before November 2021. - if self.publish_date < datetime.date(2021, 11, 1) \ - and (sig in ["booster dose since", "booster doses administered"]) : - self.dfs[(sheet.level, sig, NOT_PROP)] = pd.DataFrame( - columns = ["geo_id", "timestamp", "val", \ - "se", "sample_size", "publish_date"] - ) - continue - + is_booster_before_nov1 = self.publish_date < datetime.date(2021, 11, 1) \ + and (sig in ["booster dose since", "booster doses administered"]) # Booster and weekly doses administered not available below the state level. - if ((sheet.level != "hhs" and sheet.level != "state") \ + is_booster_below_state = ((sheet.level != "hhs" and sheet.level != "state") \ and (sig in ["doses administered", \ - "booster doses administered", "booster dose since"])): + "booster doses administered", "booster dose since"])) + # Weekly doses administered not available before Apr 29, 2021. + is_dose_admin_apr29 = self.publish_date <= datetime.date(2021, 4, 29) \ + and sig == "doses administered" + # People fully vaccinated not available before Apr 11, 2021 at the CBSA level. + is_fully_vax_msa_before_apr11 = (sheet.level == "msa" or sheet.level == "county") \ + and self.publish_date <= datetime.date(2021, 4, 11) \ + and sig == "fully vaccinated" + # People fully vaccinated not available before March 08, 2021 at any geo level. + is_fully_vax_before_mar8 = self.publish_date <= datetime.date(2021, 3, 8) \ + and sig == "fully vaccinated" + + if any([is_hosp_adm_before_jan8, + is_booster_before_nov1, + is_booster_below_state, + is_dose_admin_apr29, + is_fully_vax_msa_before_apr11, + is_fully_vax_before_mar8 + ]): self.dfs[(sheet.level, sig, NOT_PROP)] = pd.DataFrame( columns = ["geo_id", "timestamp", "val", \ "se", "sample_size", "publish_date"]