diff --git a/.editorconfig b/.editorconfig new file mode 100644 index 000000000..8a80734f0 --- /dev/null +++ b/.editorconfig @@ -0,0 +1,21 @@ +# EditorConfig helps developers define and maintain consistent +# coding styles between different editors and IDEs +# editorconfig.org + +root = true + + +[*] + +# Change these settings to your own preference +indent_style = space +indent_size = 4 + +# We recommend you to keep these unchanged +end_of_line = lf +charset = utf-8 +trim_trailing_whitespace = true +insert_final_newline = true + +[*.md] +trim_trailing_whitespace = false diff --git a/.git-blame-ignore-revs b/.git-blame-ignore-revs new file mode 100644 index 000000000..445436af3 --- /dev/null +++ b/.git-blame-ignore-revs @@ -0,0 +1,15 @@ +1278d2716c9f14018d4c7160ecaba24be955d92e +d07d9f4138ff7868f4f3d4ee33a61fa5a647aae0 +9bf4b91ccfd74ce1ae384daf8d54802b3977306b +355b8f31a5279f04413696b2b8b7639810e5a988 +7c2331d52aada34f383300d4cf76adc7dcade346 +22855140f3c478370ca5261eb38c0597ed895fcc +8f33ff506c457f3f1f657bf421b3119a43ddc708 +a4da4d20541c09405d983e4bc281f8c1e5406c1f +87f1facd8b170c56c97b7f092ad86dc47cdab8fe +61427c5540b37123ad2f1db7d3558cd14500163a +2dad3e8f27d4129b2438e751c474e197321b1993 +39146ec9c526cd64c590e24bddaf3ec178358084 +46b28c8d0c2a04a116947430b8b632d5eac10734 +546742beae8000463c8bc94e28ceaa63b5501568 +3840136ab3386a2237c7a69d297d47c86fe0e860 \ No newline at end of file diff --git a/src/acquisition/afhsb/afhsb_csv.py b/src/acquisition/afhsb/afhsb_csv.py index b839c4053..f4d620803 100644 --- a/src/acquisition/afhsb/afhsb_csv.py +++ b/src/acquisition/afhsb/afhsb_csv.py @@ -1,4 +1,4 @@ -''' +""" afhsb_csv.py creates CSV files filled_00to13.csv, filled_13to17.csv and simple_DMISID_FY2018.csv which will be later used to create MYSQL data tables. 
@@ -9,7 +9,7 @@ ili_1_2000_5_2013_new.sas7bdat and ili_1_2013_11_2017_new.sas7bdat under SOURCE_DIR country_codes.csv and DMISID_FY2018.csv under TARGET_DIR All intermediate files and final csv files will be stored in TARGET_DIR -''' +""" import csv import os @@ -19,174 +19,181 @@ import epiweeks as epi -DATAPATH = '/home/automation/afhsb_data' +DATAPATH = "/home/automation/afhsb_data" SOURCE_DIR = DATAPATH TARGET_DIR = DATAPATH INVALID_DMISIDS = set() + def get_flu_cat(dx): - # flu1 (influenza) - if len(dx) == 0: - return None - dx = dx.capitalize() - if dx.isnumeric(): - for prefix in ["487", "488"]: - if dx.startswith(prefix): - return 1 - for i in range(0, 7): - prefix = str(480 + i) - if dx.startswith(prefix): - return 2 - for i in range(0, 7): - prefix = str(460 + i) - if dx.startswith(prefix): - return 3 - for prefix in ["07999", "3829", "7806", "7862"]: - if dx.startswith(prefix): - return 3 - elif (dx[0].isalpha() and dx[1:].isnumeric()): - for prefix in ["J09", "J10", "J11"]: - if dx.startswith(prefix): - return 1 - for i in range(12, 19): - prefix = "J{}".format(i) - if dx.startswith(prefix): - return 2 - for i in range(0, 7): - prefix = "J0{}".format(i) - if dx.startswith(prefix): - return 3 - for i in range(20, 23): - prefix = "J{}".format(i) - if dx.startswith(prefix): - return 3 - for prefix in ["J40", "R05", "H669", "R509", "B9789"]: - if dx.startswith(prefix): - return 3 - else: - return None + # flu1 (influenza) + if len(dx) == 0: + return None + dx = dx.capitalize() + if dx.isnumeric(): + for prefix in ["487", "488"]: + if dx.startswith(prefix): + return 1 + for i in range(0, 7): + prefix = str(480 + i) + if dx.startswith(prefix): + return 2 + for i in range(0, 7): + prefix = str(460 + i) + if dx.startswith(prefix): + return 3 + for prefix in ["07999", "3829", "7806", "7862"]: + if dx.startswith(prefix): + return 3 + elif dx[0].isalpha() and dx[1:].isnumeric(): + for prefix in ["J09", "J10", "J11"]: + if dx.startswith(prefix): + return 1 + for i in range(12, 19): + prefix = "J{}".format(i) + if dx.startswith(prefix): + return 2 + for i in range(0, 7): + prefix = "J0{}".format(i) + if dx.startswith(prefix): + return 3 + for i in range(20, 23): + prefix = "J{}".format(i) + if dx.startswith(prefix): + return 3 + for prefix in ["J40", "R05", "H669", "R509", "B9789"]: + if dx.startswith(prefix): + return 3 + else: + return None + def aggregate_data(sourcefile, targetfile): - reader = sas7bdat.SAS7BDAT(os.path.join(SOURCE_DIR, sourcefile), skip_header=True) - # map column names to column indices - col_2_idx = {column.name.decode('utf-8'): column.col_id for column in reader.columns} - - def get_field(row, column): - return row[col_2_idx[column]] - - def row2flu(row): - for i in range(1, 9): - dx = get_field(row, "dx{}".format(i)) - flu_cat = get_flu_cat(dx) - if flu_cat is not None: - return flu_cat - return 0 - - def row2epiweek(row): - date = get_field(row, 'd_event') - year, month, day = date.year, date.month, date.day - week_tuple = epi.Week.fromdate(year, month, day).weektuple() - year, week_num = week_tuple[0], week_tuple[1] - return year, week_num - - results_dict = {} - for _, row in enumerate(reader): - # if (r >= 1000000): break - if get_field(row, 'type') != "Outpt": - continue - year, week_num = row2epiweek(row) - dmisid = get_field(row, 'DMISID') - flu_cat = row2flu(row) - - key_list = [year, week_num, dmisid, flu_cat] - curr_dict = results_dict - for i, key in enumerate(key_list): - if i == len(key_list) - 1: - if key not in curr_dict: - curr_dict[key] = 0 - 
curr_dict[key] += 1 - else: - if key not in curr_dict: - curr_dict[key] = {} - curr_dict = curr_dict[key] - - results_path = os.path.join(TARGET_DIR, targetfile) - with open(results_path, 'wb') as f: - pickle.dump(results_dict, f, pickle.HIGHEST_PROTOCOL) + reader = sas7bdat.SAS7BDAT(os.path.join(SOURCE_DIR, sourcefile), skip_header=True) + # map column names to column indices + col_2_idx = {column.name.decode("utf-8"): column.col_id for column in reader.columns} + + def get_field(row, column): + return row[col_2_idx[column]] + + def row2flu(row): + for i in range(1, 9): + dx = get_field(row, "dx{}".format(i)) + flu_cat = get_flu_cat(dx) + if flu_cat is not None: + return flu_cat + return 0 + + def row2epiweek(row): + date = get_field(row, "d_event") + year, month, day = date.year, date.month, date.day + week_tuple = epi.Week.fromdate(year, month, day).weektuple() + year, week_num = week_tuple[0], week_tuple[1] + return year, week_num + + results_dict = {} + for _, row in enumerate(reader): + # if (r >= 1000000): break + if get_field(row, "type") != "Outpt": + continue + year, week_num = row2epiweek(row) + dmisid = get_field(row, "DMISID") + flu_cat = row2flu(row) + + key_list = [year, week_num, dmisid, flu_cat] + curr_dict = results_dict + for i, key in enumerate(key_list): + if i == len(key_list) - 1: + if key not in curr_dict: + curr_dict[key] = 0 + curr_dict[key] += 1 + else: + if key not in curr_dict: + curr_dict[key] = {} + curr_dict = curr_dict[key] + + results_path = os.path.join(TARGET_DIR, targetfile) + with open(results_path, "wb") as f: + pickle.dump(results_dict, f, pickle.HIGHEST_PROTOCOL) ################# Functions for geographical information #################### + def get_country_mapping(): - filename = "country_codes.csv" - mapping = dict() - with open(os.path.join(TARGET_DIR, filename), "r") as csvfile: - reader = csv.DictReader(csvfile) - for row in reader: - print(row.keys()) - alpha2 = row['alpha-2'] - alpha3 = row['alpha-3'] - mapping[alpha2] = alpha3 - - return mapping + filename = "country_codes.csv" + mapping = dict() + with open(os.path.join(TARGET_DIR, filename), "r") as csvfile: + reader = csv.DictReader(csvfile) + for row in reader: + print(row.keys()) + alpha2 = row["alpha-2"] + alpha3 = row["alpha-3"] + mapping[alpha2] = alpha3 + + return mapping + def format_dmisid_csv(filename, target_name): - src_path = os.path.join(TARGET_DIR, "{}.csv".format(filename)) - dst_path = os.path.join(TARGET_DIR, target_name) - - src_csv = open(src_path, "r", encoding='utf-8-sig') - reader = csv.DictReader(src_csv) - - dst_csv = open(dst_path, "w") - fieldnames = ['dmisid', 'country', 'state', 'zip5'] - writer = csv.DictWriter(dst_csv, fieldnames=fieldnames) - writer.writeheader() - - country_mapping = get_country_mapping() - - for row in reader: - country2 = row['Facility ISO Country Code'] - if country2 == "": - country3 = "" - elif country2 not in country_mapping: - for key in row.keys(): - print(key, row[key]) - continue - else: - country3 = country_mapping[country2] - new_row = {'dmisid': row['DMIS ID'], - 'country': country3, - 'state': row['Facility State Code'], - 'zip5': row['Facility 5-Digit ZIP Code']} - writer.writerow(new_row) + src_path = os.path.join(TARGET_DIR, "{}.csv".format(filename)) + dst_path = os.path.join(TARGET_DIR, target_name) + + src_csv = open(src_path, "r", encoding="utf-8-sig") + reader = csv.DictReader(src_csv) + + dst_csv = open(dst_path, "w") + fieldnames = ["dmisid", "country", "state", "zip5"] + writer = csv.DictWriter(dst_csv, 
fieldnames=fieldnames) + writer.writeheader() + + country_mapping = get_country_mapping() + + for row in reader: + country2 = row["Facility ISO Country Code"] + if country2 == "": + country3 = "" + elif country2 not in country_mapping: + for key in row.keys(): + print(key, row[key]) + continue + else: + country3 = country_mapping[country2] + new_row = {"dmisid": row["DMIS ID"], "country": country3, "state": row["Facility State Code"], "zip5": row["Facility 5-Digit ZIP Code"]} + writer.writerow(new_row) + def dmisid(): - filename = 'DMISID_FY2018' - target_name = "simple_DMISID_FY2018.csv" - format_dmisid_csv(filename, target_name) - - -cen2states = {'cen1': {'CT', 'ME', 'MA', 'NH', 'RI', 'VT'}, - 'cen2': {'NJ', 'NY', 'PA'}, - 'cen3': {'IL', 'IN', 'MI', 'OH', 'WI'}, - 'cen4': {'IA', 'KS', 'MN', 'MO', 'NE', 'ND', 'SD'}, - 'cen5': {'DE', 'DC', 'FL', 'GA', 'MD', 'NC', 'SC', 'VA', 'WV'}, - 'cen6': {'AL', 'KY', 'MS', 'TN'}, - 'cen7': {'AR', 'LA', 'OK', 'TX'}, - 'cen8': {'AZ', 'CO', 'ID', 'MT', 'NV', 'NM', 'UT', 'WY'}, - 'cen9': {'AK', 'CA', 'HI', 'OR', 'WA'}} - -hhs2states = {'hhs1': {'VT', 'CT', 'ME', 'MA', 'NH', 'RI'}, - 'hhs2': {'NJ', 'NY'}, - 'hhs3': {'DE', 'DC', 'MD', 'PA', 'VA', 'WV'}, - 'hhs4': {'AL', 'FL', 'GA', 'KY', 'MS', 'NC', 'TN', 'SC'}, - 'hhs5': {'IL', 'IN', 'MI', 'MN', 'OH', 'WI'}, - 'hhs6': {'AR', 'LA', 'NM', 'OK', 'TX'}, - 'hhs7': {'IA', 'KS', 'MO', 'NE'}, - 'hhs8': {'CO', 'MT', 'ND', 'SD', 'UT', 'WY'}, - 'hhs9': {'AZ', 'CA', 'HI', 'NV'}, - 'hhs10': {'AK', 'ID', 'OR', 'WA'}} + filename = "DMISID_FY2018" + target_name = "simple_DMISID_FY2018.csv" + format_dmisid_csv(filename, target_name) + + +cen2states = { + "cen1": {"CT", "ME", "MA", "NH", "RI", "VT"}, + "cen2": {"NJ", "NY", "PA"}, + "cen3": {"IL", "IN", "MI", "OH", "WI"}, + "cen4": {"IA", "KS", "MN", "MO", "NE", "ND", "SD"}, + "cen5": {"DE", "DC", "FL", "GA", "MD", "NC", "SC", "VA", "WV"}, + "cen6": {"AL", "KY", "MS", "TN"}, + "cen7": {"AR", "LA", "OK", "TX"}, + "cen8": {"AZ", "CO", "ID", "MT", "NV", "NM", "UT", "WY"}, + "cen9": {"AK", "CA", "HI", "OR", "WA"}, +} + +hhs2states = { + "hhs1": {"VT", "CT", "ME", "MA", "NH", "RI"}, + "hhs2": {"NJ", "NY"}, + "hhs3": {"DE", "DC", "MD", "PA", "VA", "WV"}, + "hhs4": {"AL", "FL", "GA", "KY", "MS", "NC", "TN", "SC"}, + "hhs5": {"IL", "IN", "MI", "MN", "OH", "WI"}, + "hhs6": {"AR", "LA", "NM", "OK", "TX"}, + "hhs7": {"IA", "KS", "MO", "NE"}, + "hhs8": {"CO", "MT", "ND", "SD", "UT", "WY"}, + "hhs9": {"AZ", "CA", "HI", "NV"}, + "hhs10": {"AK", "ID", "OR", "WA"}, +} + def state2region(D): results = dict() @@ -197,155 +204,161 @@ def state2region(D): results[state] = region return results + def state2region_csv(): - to_hhs = state2region(hhs2states) - to_cen = state2region(cen2states) - states = to_hhs.keys() - target_name = "state2region.csv" - fieldnames = ['state', 'hhs', 'cen'] - with open(target_name, "w") as csvfile: - writer = csv.DictWriter(csvfile, fieldnames=fieldnames) - writer.writeheader() - for state in states: - content = {"state": state, "hhs": to_hhs[state], "cen": to_cen[state]} - writer.writerow(content) + to_hhs = state2region(hhs2states) + to_cen = state2region(cen2states) + states = to_hhs.keys() + target_name = "state2region.csv" + fieldnames = ["state", "hhs", "cen"] + with open(target_name, "w") as csvfile: + writer = csv.DictWriter(csvfile, fieldnames=fieldnames) + writer.writeheader() + for state in states: + content = {"state": state, "hhs": to_hhs[state], "cen": to_cen[state]} + writer.writerow(content) + ################# Functions for geographical information 
#################### ######################### Functions for AFHSB data ########################## + def write_afhsb_csv(period): - flu_mapping = {0: "ili-flu3", 1: "flu1", 2:"flu2-flu1", 3: "flu3-flu2"} - results_dict = pickle.load(open(os.path.join(TARGET_DIR, "{}.pickle".format(period)), 'rb')) - - fieldnames = ["id", "epiweek", "dmisid", "flu_type", "visit_sum"] - with open(os.path.join(TARGET_DIR, "{}.csv".format(period)), 'w') as csvfile: - writer = csv.DictWriter(csvfile, fieldnames=fieldnames) - writer.writeheader() - - i = 0 - for year in sorted(results_dict.keys()): - year_dict = results_dict[year] - for week in sorted(year_dict.keys()): - week_dict = year_dict[week] - for dmisid in sorted(week_dict.keys()): - dmisid_dict = week_dict[dmisid] - for flu in sorted(dmisid_dict.keys()): - visit_sum = dmisid_dict[flu] - i += 1 - epiweek = int("{}{:02d}".format(year, week)) - flu_type = flu_mapping[flu] - - row = {"epiweek": epiweek, "dmisid": None if (not dmisid.isnumeric()) else dmisid, - "flu_type": flu_type, "visit_sum": visit_sum, "id": i} - writer.writerow(row) - if i % 100000 == 0: - print(row) + flu_mapping = {0: "ili-flu3", 1: "flu1", 2: "flu2-flu1", 3: "flu3-flu2"} + results_dict = pickle.load(open(os.path.join(TARGET_DIR, "{}.pickle".format(period)), "rb")) + + fieldnames = ["id", "epiweek", "dmisid", "flu_type", "visit_sum"] + with open(os.path.join(TARGET_DIR, "{}.csv".format(period)), "w") as csvfile: + writer = csv.DictWriter(csvfile, fieldnames=fieldnames) + writer.writeheader() + + i = 0 + for year in sorted(results_dict.keys()): + year_dict = results_dict[year] + for week in sorted(year_dict.keys()): + week_dict = year_dict[week] + for dmisid in sorted(week_dict.keys()): + dmisid_dict = week_dict[dmisid] + for flu in sorted(dmisid_dict.keys()): + visit_sum = dmisid_dict[flu] + i += 1 + epiweek = int("{}{:02d}".format(year, week)) + flu_type = flu_mapping[flu] + + row = {"epiweek": epiweek, "dmisid": None if (not dmisid.isnumeric()) else dmisid, "flu_type": flu_type, "visit_sum": visit_sum, "id": i} + writer.writerow(row) + if i % 100000 == 0: + print(row) + def dmisid_start_time_from_file(filename): - starttime_record = dict() - with open(filename, 'r') as csvfile: - reader = csv.DictReader(csvfile) - for row in reader: - dmisid = row['dmisid'] - epiweek = int(row['epiweek']) - if dmisid not in starttime_record: - starttime_record[dmisid] = epiweek - else: - starttime_record[dmisid] = min(epiweek, starttime_record[dmisid]) - return starttime_record + starttime_record = dict() + with open(filename, "r") as csvfile: + reader = csv.DictReader(csvfile) + for row in reader: + dmisid = row["dmisid"] + epiweek = int(row["epiweek"]) + if dmisid not in starttime_record: + starttime_record[dmisid] = epiweek + else: + starttime_record[dmisid] = min(epiweek, starttime_record[dmisid]) + return starttime_record + def dmisid_start_time(): - record1 = dmisid_start_time_from_file(os.path.join(TARGET_DIR, "00to13.csv")) - record2 = dmisid_start_time_from_file(os.path.join(TARGET_DIR, "13to17.csv")) - record = record1 - for dmisid, epiweek in record2.items(): - if dmisid in record: - record[dmisid] = min(record[dmisid], epiweek) - else: - record[dmisid] = epiweek - return record + record1 = dmisid_start_time_from_file(os.path.join(TARGET_DIR, "00to13.csv")) + record2 = dmisid_start_time_from_file(os.path.join(TARGET_DIR, "13to17.csv")) + record = record1 + for dmisid, epiweek in record2.items(): + if dmisid in record: + record[dmisid] = min(record[dmisid], epiweek) + else: + 
record[dmisid] = epiweek + return record + def fillin_zero_to_csv(period, dmisid_start_record): - src_path = os.path.join(TARGET_DIR, "{}.csv".format(period)) - dst_path = os.path.join(TARGET_DIR, "filled_{}.csv".format(period)) - - # Load data into a dictionary - src_csv = open(src_path, "r") - reader = csv.DictReader(src_csv) - - results_dict = dict() # epiweek -> dmisid -> flu_type: visit_sum - for i, row in enumerate(reader): - epiweek = int(row['epiweek']) - dmisid = row['dmisid'] - flu_type = row['flu_type'] - visit_sum = row['visit_sum'] - if epiweek not in results_dict: - results_dict[epiweek] = dict() - week_dict = results_dict[epiweek] - if dmisid not in week_dict: - week_dict[dmisid] = dict() - dmisid_dict = week_dict[dmisid] - dmisid_dict[flu_type] = visit_sum - - # Fill in zero count records - dmisid_group = dmisid_start_record.keys() - flutype_group = ["ili-flu3", "flu1", "flu2-flu1", "flu3-flu2"] - - for epiweek in results_dict.keys(): - week_dict = results_dict[epiweek] - for dmisid in dmisid_group: - start_week = dmisid_start_record[dmisid] - if start_week > epiweek: - continue - - if dmisid not in week_dict: - week_dict[dmisid] = dict() - - dmisid_dict = week_dict[dmisid] - for flutype in flutype_group: - if flutype not in dmisid_dict: - dmisid_dict[flutype] = 0 - - # Write to csv files - dst_csv = open(dst_path, "w") - fieldnames = ["id", "epiweek", "dmisid", "flu_type", "visit_sum"] - writer = csv.DictWriter(dst_csv, fieldnames=fieldnames) - writer.writeheader() - - i = 1 - for epiweek in results_dict: - for dmisid in results_dict[epiweek]: - for flutype in results_dict[epiweek][dmisid]: - visit_sum = results_dict[epiweek][dmisid][flutype] - row = {"id": i, "epiweek": epiweek, "dmisid": dmisid, - "flu_type": flutype, "visit_sum": visit_sum} - writer.writerow(row) - if i % 100000 == 0: - print(row) - i += 1 - print("Wrote {} rows".format(i)) + src_path = os.path.join(TARGET_DIR, "{}.csv".format(period)) + dst_path = os.path.join(TARGET_DIR, "filled_{}.csv".format(period)) + + # Load data into a dictionary + src_csv = open(src_path, "r") + reader = csv.DictReader(src_csv) + + results_dict = dict() # epiweek -> dmisid -> flu_type: visit_sum + for i, row in enumerate(reader): + epiweek = int(row["epiweek"]) + dmisid = row["dmisid"] + flu_type = row["flu_type"] + visit_sum = row["visit_sum"] + if epiweek not in results_dict: + results_dict[epiweek] = dict() + week_dict = results_dict[epiweek] + if dmisid not in week_dict: + week_dict[dmisid] = dict() + dmisid_dict = week_dict[dmisid] + dmisid_dict[flu_type] = visit_sum + + # Fill in zero count records + dmisid_group = dmisid_start_record.keys() + flutype_group = ["ili-flu3", "flu1", "flu2-flu1", "flu3-flu2"] + + for epiweek in results_dict.keys(): + week_dict = results_dict[epiweek] + for dmisid in dmisid_group: + start_week = dmisid_start_record[dmisid] + if start_week > epiweek: + continue + + if dmisid not in week_dict: + week_dict[dmisid] = dict() + + dmisid_dict = week_dict[dmisid] + for flutype in flutype_group: + if flutype not in dmisid_dict: + dmisid_dict[flutype] = 0 + + # Write to csv files + dst_csv = open(dst_path, "w") + fieldnames = ["id", "epiweek", "dmisid", "flu_type", "visit_sum"] + writer = csv.DictWriter(dst_csv, fieldnames=fieldnames) + writer.writeheader() + + i = 1 + for epiweek in results_dict: + for dmisid in results_dict[epiweek]: + for flutype in results_dict[epiweek][dmisid]: + visit_sum = results_dict[epiweek][dmisid][flutype] + row = {"id": i, "epiweek": epiweek, "dmisid": dmisid, "flu_type": 
flutype, "visit_sum": visit_sum} + writer.writerow(row) + if i % 100000 == 0: + print(row) + i += 1 + print("Wrote {} rows".format(i)) + ######################### Functions for AFHSB data ########################## + def main(): - # Build tables containing geographical information - state2region_csv() - dmisid() + # Build tables containing geographical information + state2region_csv() + dmisid() - # Aggregate raw data into pickle files - aggregate_data("ili_1_2000_5_2013_new.sas7bdat", "00to13.pickle") - aggregate_data("ili_1_2013_11_2017_new.sas7bdat", "13to17.pickle") + # Aggregate raw data into pickle files + aggregate_data("ili_1_2000_5_2013_new.sas7bdat", "00to13.pickle") + aggregate_data("ili_1_2013_11_2017_new.sas7bdat", "13to17.pickle") # write pickle content to csv files - write_afhsb_csv("00to13") - write_afhsb_csv("13to17") + write_afhsb_csv("00to13") + write_afhsb_csv("13to17") # Fill in zero count records - dmisid_start_record = dmisid_start_time() - fillin_zero_to_csv("00to13", dmisid_start_record) - fillin_zero_to_csv("13to17", dmisid_start_record) + dmisid_start_record = dmisid_start_time() + fillin_zero_to_csv("00to13", dmisid_start_record) + fillin_zero_to_csv("13to17", dmisid_start_record) -if __name__ == '__main__': - main() +if __name__ == "__main__": + main() diff --git a/src/acquisition/afhsb/afhsb_sql.py b/src/acquisition/afhsb/afhsb_sql.py index 278f3fc38..3ffd7c0fb 100644 --- a/src/acquisition/afhsb/afhsb_sql.py +++ b/src/acquisition/afhsb/afhsb_sql.py @@ -11,17 +11,17 @@ def init_dmisid_table(sourcefile): (u, p) = secrets.db.epi cnx = connector.connect(user=u, passwd=p, database="epidata") - table_name = 'dmisid_table' - create_table_cmd = ''' - CREATE TABLE `{}` ( + table_name = "dmisid_table" + create_table_cmd = f""" + CREATE TABLE `{table_name}` ( `dmisid` INT(4) NOT NULL PRIMARY KEY, `country` CHAR(3) NULL, `state` CHAR(2) NULL ); - '''.format(table_name) - populate_table_cmd = ''' - LOAD DATA INFILE '{}' - INTO TABLE {} + """ + populate_table_cmd = f""" + LOAD DATA INFILE '{sourcefile}' + INTO TABLE {table_name} FIELDS TERMINATED BY ',' ENCLOSED BY '"' LINES TERMINATED BY '\r\n' @@ -32,7 +32,7 @@ def init_dmisid_table(sourcefile): country = nullif(@country, ''), state = nullif(@state, '') ; - '''.format(sourcefile, table_name) + """ try: cursor = cnx.cursor() cursor.execute(create_table_cmd) @@ -41,27 +41,28 @@ def init_dmisid_table(sourcefile): finally: cnx.close() + def init_region_table(sourcefile): (u, p) = secrets.db.epi cnx = connector.connect(user=u, passwd=p, database="epidata") - table_name = 'state2region_table' - create_table_cmd = ''' - CREATE TABLE `{}` ( + table_name = "state2region_table" + create_table_cmd = f""" + CREATE TABLE `{table_name}` ( `state` CHAR(2) NOT NULL PRIMARY KEY, `hhs` CHAR(5) NOT NULL, `cen` CHAR(4) NOT NULL ); - '''.format(table_name) - populate_table_cmd = ''' - LOAD DATA INFILE '{}' - INTO TABLE {} + """ + populate_table_cmd = f""" + LOAD DATA INFILE '{sourcefile}' + INTO TABLE {table_name} FIELDS TERMINATED BY ',' ENCLOSED BY '"' LINES TERMINATED BY '\r\n' IGNORE 1 ROWS (@state, @hhs, @cen) SET state=@state, hhs=@hhs, cen=@cen; - '''.format(sourcefile, table_name) + """ try: cursor = cnx.cursor() cursor.execute(create_table_cmd) @@ -75,8 +76,8 @@ def init_raw_data(table_name, sourcefile): print("Initialize {}".format(table_name)) (u, p) = secrets.db.epi cnx = connector.connect(user=u, passwd=p, database="epidata") - create_table_cmd = ''' - CREATE TABLE IF NOT EXISTS `{}` ( + create_table_cmd = f""" + CREATE TABLE IF 
NOT EXISTS `{table_name}` ( `id` INT(11) NOT NULL PRIMARY KEY AUTO_INCREMENT, `epiweek` INT(6) NOT NULL, `dmisid` CHAR(4) NULL, @@ -87,10 +88,10 @@ def init_raw_data(table_name, sourcefile): KEY `dmisid` (`dmisid`), KEY `flu_type` (`flu_type`) ); - '''.format(table_name) - populate_table_cmd = ''' - LOAD DATA INFILE '{}' - INTO TABLE {} + """ + populate_table_cmd = f""" + LOAD DATA INFILE '{sourcefile}' + INTO TABLE {table_name} FIELDS TERMINATED BY ',' ENCLOSED BY '"' LINES TERMINATED BY '\r\n' @@ -103,7 +104,7 @@ def init_raw_data(table_name, sourcefile): flu_type = @flu, visit_sum = @visits ; - '''.format(sourcefile, table_name) + """ try: cursor = cnx.cursor() cursor.execute(create_table_cmd) @@ -112,18 +113,19 @@ def init_raw_data(table_name, sourcefile): finally: cnx.close() + def agg_by_state(src_table, dest_table): print("Aggregating records by states...") (u, p) = secrets.db.epi cnx = connector.connect(user=u, passwd=p, database="epidata") - cmd = ''' - CREATE TABLE {} + cmd = f""" + CREATE TABLE {dest_table} SELECT a.epiweek, a.flu_type, d.state, d.country, sum(a.visit_sum) visit_sum - FROM {} a + FROM {src_table} a LEFT JOIN dmisid_table d ON a.dmisid = d.dmisid GROUP BY a.epiweek, a.flu_type, d.state, d.country; - '''.format(dest_table, src_table) + """ try: cursor = cnx.cursor() cursor.execute(cmd) @@ -131,18 +133,19 @@ def agg_by_state(src_table, dest_table): finally: cnx.close() + def agg_by_region(src_table, dest_table): print("Aggregating records by regions...") (u, p) = secrets.db.epi cnx = connector.connect(user=u, passwd=p, database="epidata") - cmd = ''' - CREATE TABLE {} + cmd = f""" + CREATE TABLE {dest_table} SELECT s.epiweek, s.flu_type, r.hhs, r.cen, sum(s.visit_sum) visit_sum - FROM {} s + FROM {src_table} s LEFT JOIN state2region_table r ON s.state = r.state GROUP BY s.epiweek, s.flu_type, r.hhs, r.cen; - '''.format(dest_table, src_table) + """ try: cursor = cnx.cursor() cursor.execute(cmd) @@ -150,26 +153,29 @@ def agg_by_region(src_table, dest_table): finally: cnx.close() + def init_all_tables(datapath): init_dmisid_table(os.path.join(datapath, "simple_DMISID_FY2018.csv")) init_region_table(os.path.join(datapath, "state2region.csv")) periods = ["00to13", "13to17"] for period in periods: - raw_table_name = 'afhsb_{}_raw'.format(period) - state_table_name = 'afhsb_{}_state'.format(period) - region_table_name = 'afhsb_{}_region'.format(period) + raw_table_name = f"afhsb_{period}_raw" + state_table_name = f"afhsb_{period}_state" + region_table_name = f"afhsb_{period}_region" - init_raw_data(raw_table_name, os.path.join(datapath, "filled_{}.csv".format(period))) + init_raw_data(raw_table_name, os.path.join(datapath, f"filled_{period}.csv")) agg_by_state(raw_table_name, state_table_name) agg_by_region(state_table_name, region_table_name) + def dangerously_drop_all_afhsb_tables(): (u, p) = secrets.db.epi cnx = connector.connect(user=u, passwd=p, database="epidata") try: cursor = cnx.cursor() - cursor.execute(''' + cursor.execute( + """ DROP TABLE IF EXISTS `afhsb_00to13_raw`, `afhsb_00to13_region`, `afhsb_00to13_state`, @@ -178,11 +184,13 @@ def dangerously_drop_all_afhsb_tables(): `afhsb_13to17_state`, `state2region_table`, `dmisid_table`; - ''') - cnx.commit() # (might do nothing; each DROP commits itself anyway) + """ + ) + cnx.commit() # (might do nothing; each DROP commits itself anyway) finally: cnx.close() + def run_cmd(cmd): (u, p) = secrets.db.epi cnx = connector.connect(user=u, passwd=p, database="epidata") diff --git a/src/acquisition/afhsb/afhsb_update.py 
b/src/acquisition/afhsb/afhsb_update.py index c5a8635c8..0eea23e60 100644 --- a/src/acquisition/afhsb/afhsb_update.py +++ b/src/acquisition/afhsb/afhsb_update.py @@ -8,11 +8,12 @@ # first party from . import afhsb_sql -DEFAULT_DATAPATH = '/home/automation/afhsb_data' +DEFAULT_DATAPATH = "/home/automation/afhsb_data" + def main(): parser = argparse.ArgumentParser() - parser.add_argument('--datapath', action='store', type=str, default=DEFAULT_DATAPATH, help='filepath to directory containing csv files to input into database') + parser.add_argument("--datapath", action="store", type=str, default=DEFAULT_DATAPATH, help="filepath to directory containing csv files to input into database") args = parser.parse_args() # MariaDB appears to refuse to LOAD DATA INFILE except on files under # /var/lib/mysql (which seems dedicated to its own files) or /tmp; create a @@ -35,5 +36,5 @@ def main(): # (Temporary parent directory should be deleted automatically.) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/src/acquisition/cdcp/cdc_dropbox_receiver.py b/src/acquisition/cdcp/cdc_dropbox_receiver.py index eb0d97f2a..65626101b 100644 --- a/src/acquisition/cdcp/cdc_dropbox_receiver.py +++ b/src/acquisition/cdcp/cdc_dropbox_receiver.py @@ -29,128 +29,128 @@ # location constants -DROPBOX_BASE_DIR = '/cdc_page_stats' -DELPHI_BASE_DIR = '/common/cdc_stage' +DROPBOX_BASE_DIR = "/cdc_page_stats" +DELPHI_BASE_DIR = "/common/cdc_stage" def get_timestamp_string(): - """ - Return the current local date and time as a string. + """ + Return the current local date and time as a string. - The format is "%Y%m%d_%H%M%S". - """ - return datetime.datetime.now().strftime('%Y%m%d_%H%M%S') + The format is "%Y%m%d_%H%M%S". + """ + return datetime.datetime.now().strftime("%Y%m%d_%H%M%S") def trigger_further_processing(): - """Add CDCP processing scripts to the Automation run queue.""" + """Add CDCP processing scripts to the Automation run queue.""" - # connect - u, p = secrets.db.auto - cnx = mysql.connector.connect(user=u, password=p, database='automation') - cur = cnx.cursor() + # connect + u, p = secrets.db.auto + cnx = mysql.connector.connect(user=u, password=p, database="automation") + cur = cnx.cursor() - # add step "Process CDCP Data" to queue - cur.execute('CALL automation.RunStep(46)') + # add step "Process CDCP Data" to queue + cur.execute("CALL automation.RunStep(46)") - # disconnect - cur.close() - cnx.commit() - cnx.close() + # disconnect + cur.close() + cnx.commit() + cnx.close() def fetch_data(): - """ - Check for new files on dropbox, download them, zip them, cleanup dropbox, and - trigger further processing of new data. 
- """ - - # initialize dropbox api - dbx = dropbox.Dropbox(secrets.cdcp.dropbox_token) - - # look for new CDC data files - print('checking dropbox:%s' % DROPBOX_BASE_DIR) - save_list = [] - for entry in dbx.files_list_folder(DROPBOX_BASE_DIR).entries: - name = entry.name - if name.endswith('.csv') or name.endswith('.zip'): - print(' download "%s"' % name) - save_list.append(name) - else: - print(' skip "%s"' % name) - - # determine if there's anything to be done - if len(save_list) == 0: - print('did not find any new data files') - return - - # download new files, saving them inside of a new zip file - timestamp = get_timestamp_string() - zip_path = '%s/dropbox_%s.zip' % (DELPHI_BASE_DIR, timestamp) - print('downloading into delphi:%s' % zip_path) - with ZipFile(zip_path, 'w', ZIP_DEFLATED) as zf: + """ + Check for new files on dropbox, download them, zip them, cleanup dropbox, and + trigger further processing of new data. + """ + + # initialize dropbox api + dbx = dropbox.Dropbox(secrets.cdcp.dropbox_token) + + # look for new CDC data files + print(f"checking dropbox: {DROPBOX_BASE_DIR}") + save_list = [] + for entry in dbx.files_list_folder(DROPBOX_BASE_DIR).entries: + name = entry.name + if name.endswith(".csv") or name.endswith(".zip"): + print(f" download: {name}") + save_list.append(name) + else: + print(f" skip: {name}") + + # determine if there's anything to be done + if len(save_list) == 0: + print("did not find any new data files") + return + + # download new files, saving them inside of a new zip file + timestamp = get_timestamp_string() + zip_path = f"{DELPHI_BASE_DIR}/dropbox_{timestamp}.zip" + print(f"downloading into delphi:{zip_path}") + with ZipFile(zip_path, "w", ZIP_DEFLATED) as zf: + for name in save_list: + # location of the file on dropbox + dropbox_path = f"{DROPBOX_BASE_DIR}/{name}" + print(f" {dropbox_path}") + + # start the download + meta, resp = dbx.files_download(dropbox_path) + + # check status and length + if resp.status_code != 200: + raise Exception(["resp.status_code", resp.status_code]) + dropbox_len = meta.size + print(" need %d bytes..." % dropbox_len) + content_len = int(resp.headers.get("Content-Length", -1)) + if dropbox_len != content_len: + info = ["dropbox_len", dropbox_len, "content_len", content_len] + raise Exception(info) + + # finish the download, holding the data in this variable + filedata = resp.content + + # check the length again + payload_len = len(filedata) + print(" downloaded") + if dropbox_len != payload_len: + info = ["dropbox_len", dropbox_len, "payload_len", payload_len] + raise Exception(info) + + # add the downloaded file to the zip file + zf.writestr(name, filedata) + print(" added") + + # At this point, all the data is stored and awaiting further processing on + # the delphi server. + print(f"saved all new data in {zip_path}") + + # on dropbox, archive downloaded files so they won't be downloaded again + archive_dir = f"archived_reports/processed_{timestamp}" + print("archiving files...") for name in save_list: - # location of the file on dropbox - dropbox_path = '%s/%s' % (DROPBOX_BASE_DIR, name) - print(' %s' % dropbox_path) - - # start the download - meta, resp = dbx.files_download(dropbox_path) - - # check status and length - if resp.status_code != 200: - raise Exception(['resp.status_code', resp.status_code]) - dropbox_len = meta.size - print(' need %d bytes...' 
% dropbox_len) - content_len = int(resp.headers.get('Content-Length', -1)) - if dropbox_len != content_len: - info = ['dropbox_len', dropbox_len, 'content_len', content_len] - raise Exception(info) - - # finish the download, holding the data in this variable - filedata = resp.content - - # check the length again - payload_len = len(filedata) - print(' downloaded') - if dropbox_len != payload_len: - info = ['dropbox_len', dropbox_len, 'payload_len', payload_len] - raise Exception(info) - - # add the downloaded file to the zip file - zf.writestr(name, filedata) - print(' added') - - # At this point, all the data is stored and awaiting further processing on - # the delphi server. - print('saved all new data in %s' % zip_path) - - # on dropbox, archive downloaded files so they won't be downloaded again - archive_dir = 'archived_reports/processed_%s' % timestamp - print('archiving files...') - for name in save_list: - # source and destination - dropbox_src = '%s/%s' % (DROPBOX_BASE_DIR, name) - dropbox_dst = '%s/%s/%s' % (DROPBOX_BASE_DIR, archive_dir, name) - print(' "%s" -> "%s"' % (dropbox_src, dropbox_dst)) - - # move the file - meta = dbx.files_move(dropbox_src, dropbox_dst) - - # sanity check - if archive_dir not in meta.path_lower: - raise Exception('failed to move "%s"' % name) - - # finally, trigger the usual processing flow - print('triggering processing flow') - trigger_further_processing() - print('done') + # source and destination + dropbox_src = f"{DROPBOX_BASE_DIR}/{name}" + dropbox_dst = f"{DROPBOX_BASE_DIR}/{archive_dir}/{name}" + print(f" {dropbox_src} -> {dropbox_dst}") + + # move the file + meta = dbx.files_move(dropbox_src, dropbox_dst) + + # sanity check + if archive_dir not in meta.path_lower: + raise Exception(f"failed to move {name}") + + # finally, trigger the usual processing flow + print("triggering processing flow") + trigger_further_processing() + print("done") def main(): - # fetch new data - fetch_data() + # fetch new data + fetch_data() -if __name__ == '__main__': - main() +if __name__ == "__main__": + main() diff --git a/src/acquisition/cdcp/cdc_extract.py b/src/acquisition/cdcp/cdc_extract.py index 83ed08d5b..45519307e 100644 --- a/src/acquisition/cdcp/cdc_extract.py +++ b/src/acquisition/cdcp/cdc_extract.py @@ -75,7 +75,7 @@ def get_num_hits(cur, epiweek, state, page): - sql = ''' + sql = """ SELECT sum(c.`num`) `num` FROM @@ -86,36 +86,36 @@ def get_num_hits(cur, epiweek, state, page): m.`date` = c.`date` AND m.`state` = c.`state` WHERE m.`epiweek` = %s AND c.`state` = %s AND c.`page` LIKE %s - ''' - num = None - cur.execute(sql, (epiweek, state, page)) - for (num,) in cur: - pass - if num is None: - return 0 - return num + """ + num = None + cur.execute(sql, (epiweek, state, page)) + for (num,) in cur: + pass + if num is None: + return 0 + return num def get_total_hits(cur, epiweek, state): - sql = ''' + sql = """ SELECT sum(m.`total`) `total` FROM `cdc_meta` m WHERE m.`epiweek` = %s AND m.`state` = %s - ''' - total = None - cur.execute(sql, (epiweek, state)) - for (total,) in cur: - pass - if total is None: - raise Exception('missing data for %d-%s' % (epiweek, state)) - return total + """ + total = None + cur.execute(sql, (epiweek, state)) + for (total,) in cur: + pass + if total is None: + raise Exception("missing data for %d-%s" % (epiweek, state)) + return total def store_result(cur, epiweek, state, num1, num2, num3, num4, num5, num6, num7, num8, total): - sql = ''' + sql = """ INSERT INTO `cdc_extract` (`epiweek`, `state`, `num1`, `num2`, `num3`, 
`num4`, `num5`, `num6`, `num7`, `num8`, `total`) VALUES @@ -130,94 +130,94 @@ def store_result(cur, epiweek, state, num1, num2, num3, num4, num5, num6, num7, `num7` = %s, `num8` = %s, `total` = %s - ''' - values = [num1, num2, num3, num4, num5, num6, num7, num8, total] - args = tuple([epiweek, state] + values + values) - cur.execute(sql, args) + """ + values = [num1, num2, num3, num4, num5, num6, num7, num8, total] + args = tuple([epiweek, state] + values + values) + cur.execute(sql, args) def extract(first_week=None, last_week=None, test_mode=False): - # page title templates - pages = [ - '%What You Should Know for the % Influenza Season%', - '%What To Do If You Get Sick%', - '%Flu Symptoms & Severity%', - '%How Flu Spreads%', - '%What You Should Know About Flu Antiviral Drugs%', - '%Weekly US Map%', - '%Basics%', - '%Flu Activity & Surveillance%', - ] - - # location information - states = sorted(cdc_upload.STATES.values()) - - # connect - u, p = secrets.db.epi - cnx = mysql.connector.connect(user=u, password=p, database='epidata') - cur = cnx.cursor() - - # weeks to update - if first_week is None: - cur.execute('SELECT max(`epiweek`) FROM `cdc_extract`') - for (first_week,) in cur: - pass - if last_week is None: - cur.execute('SELECT max(`epiweek`) FROM `cdc_meta`') - for (last_week,) in cur: - pass - print('extracting %d--%d' % (first_week, last_week)) - - # update each epiweek - for epiweek in flu.range_epiweeks(first_week, last_week, inclusive=True): - # update each state - for state in states: - try: - num1 = get_num_hits(cur, epiweek, state, pages[0]) - num2 = get_num_hits(cur, epiweek, state, pages[1]) - num3 = get_num_hits(cur, epiweek, state, pages[2]) - num4 = get_num_hits(cur, epiweek, state, pages[3]) - num5 = get_num_hits(cur, epiweek, state, pages[4]) - num6 = get_num_hits(cur, epiweek, state, pages[5]) - num7 = get_num_hits(cur, epiweek, state, pages[6]) - num8 = get_num_hits(cur, epiweek, state, pages[7]) - total = get_total_hits(cur, epiweek, state) - store_result(cur, epiweek, state, num1, num2, num3, num4, num5, num6, num7, num8, total) - print(' %d-%s: %d %d %d %d %d %d %d %d (%d)' % (epiweek, state, num1, num2, num3, num4, num5, num6, num7, num8, total)) - except Exception as ex: - print(' %d-%s: failed' % (epiweek, state), ex) - #raise ex - sys.stdout.flush() - - # disconnect - cur.close() - if not test_mode: - cnx.commit() - cnx.close() + # page title templates + pages = [ + "%What You Should Know for the % Influenza Season%", + "%What To Do If You Get Sick%", + "%Flu Symptoms & Severity%", + "%How Flu Spreads%", + "%What You Should Know About Flu Antiviral Drugs%", + "%Weekly US Map%", + "%Basics%", + "%Flu Activity & Surveillance%", + ] + + # location information + states = sorted(cdc_upload.STATES.values()) + + # connect + u, p = secrets.db.epi + cnx = mysql.connector.connect(user=u, password=p, database="epidata") + cur = cnx.cursor() + + # weeks to update + if first_week is None: + cur.execute("SELECT max(`epiweek`) FROM `cdc_extract`") + for (first_week,) in cur: + pass + if last_week is None: + cur.execute("SELECT max(`epiweek`) FROM `cdc_meta`") + for (last_week,) in cur: + pass + print("extracting %d--%d" % (first_week, last_week)) + + # update each epiweek + for epiweek in flu.range_epiweeks(first_week, last_week, inclusive=True): + # update each state + for state in states: + try: + num1 = get_num_hits(cur, epiweek, state, pages[0]) + num2 = get_num_hits(cur, epiweek, state, pages[1]) + num3 = get_num_hits(cur, epiweek, state, pages[2]) + num4 = 
get_num_hits(cur, epiweek, state, pages[3]) + num5 = get_num_hits(cur, epiweek, state, pages[4]) + num6 = get_num_hits(cur, epiweek, state, pages[5]) + num7 = get_num_hits(cur, epiweek, state, pages[6]) + num8 = get_num_hits(cur, epiweek, state, pages[7]) + total = get_total_hits(cur, epiweek, state) + store_result(cur, epiweek, state, num1, num2, num3, num4, num5, num6, num7, num8, total) + print(" %d-%s: %d %d %d %d %d %d %d %d (%d)" % (epiweek, state, num1, num2, num3, num4, num5, num6, num7, num8, total)) + except Exception as ex: + print(" %d-%s: failed" % (epiweek, state), ex) + # raise ex + sys.stdout.flush() + + # disconnect + cur.close() + if not test_mode: + cnx.commit() + cnx.close() def main(): - # args and usage - parser = argparse.ArgumentParser() - parser.add_argument('--first', '-f', default=None, type=int, help='first epiweek override') - parser.add_argument('--last', '-l', default=None, type=int, help='last epiweek override') - parser.add_argument('--epiweek', '-w', default=None, type=int, help='epiweek override') - parser.add_argument('--test', '-t', default=False, action='store_true', help='dry run only') - args = parser.parse_args() - - # sanity check - first, last, week = args.first, args.last, args.epiweek - for ew in [first, last, week]: - if ew is not None: - flu.check_epiweek(ew) - if first is not None and last is not None and first > last: - raise Exception('epiweeks in the wrong order') - if week is not None: - first = last = week - - # extract the page hits for all states on the specified weeks - extract(first, last, args.test) - - -if __name__ == '__main__': - main() + # args and usage + parser = argparse.ArgumentParser() + parser.add_argument("--first", "-f", default=None, type=int, help="first epiweek override") + parser.add_argument("--last", "-l", default=None, type=int, help="last epiweek override") + parser.add_argument("--epiweek", "-w", default=None, type=int, help="epiweek override") + parser.add_argument("--test", "-t", default=False, action="store_true", help="dry run only") + args = parser.parse_args() + + # sanity check + first, last, week = args.first, args.last, args.epiweek + for ew in [first, last, week]: + if ew is not None: + flu.check_epiweek(ew) + if first is not None and last is not None and first > last: + raise Exception("epiweeks in the wrong order") + if week is not None: + first = last = week + + # extract the page hits for all states on the specified weeks + extract(first, last, args.test) + + +if __name__ == "__main__": + main() diff --git a/src/acquisition/cdcp/cdc_upload.py b/src/acquisition/cdcp/cdc_upload.py index c9c206dfa..fef0821b7 100644 --- a/src/acquisition/cdcp/cdc_upload.py +++ b/src/acquisition/cdcp/cdc_upload.py @@ -87,191 +87,192 @@ STATES = { - 'Alabama': 'AL', - 'Alaska': 'AK', - 'Arizona': 'AZ', - 'Arkansas': 'AR', - 'California': 'CA', - 'Colorado': 'CO', - 'Connecticut': 'CT', - 'Delaware': 'DE', - 'District of Columbia': 'DC', - 'Florida': 'FL', - 'Georgia': 'GA', - 'Hawaii': 'HI', - 'Idaho': 'ID', - 'Illinois': 'IL', - 'Indiana': 'IN', - 'Iowa': 'IA', - 'Kansas': 'KS', - 'Kentucky': 'KY', - 'Louisiana': 'LA', - 'Maine': 'ME', - 'Maryland': 'MD', - 'Massachusetts': 'MA', - 'Michigan': 'MI', - 'Minnesota': 'MN', - 'Mississippi': 'MS', - 'Missouri': 'MO', - 'Montana': 'MT', - 'Nebraska': 'NE', - 'Nevada': 'NV', - 'New Hampshire': 'NH', - 'New Jersey': 'NJ', - 'New Mexico': 'NM', - 'New York': 'NY', - 'North Carolina': 'NC', - 'North Dakota': 'ND', - 'Ohio': 'OH', - 'Oklahoma': 'OK', - 'Oregon': 'OR', - 
'Pennsylvania': 'PA', - 'Rhode Island': 'RI', - 'South Carolina': 'SC', - 'South Dakota': 'SD', - 'Tennessee': 'TN', - 'Texas': 'TX', - 'Utah': 'UT', - 'Vermont': 'VT', - 'Virginia': 'VA', - 'Washington': 'WA', - 'West Virginia': 'WV', - 'Wisconsin': 'WI', - 'Wyoming': 'WY', - #'Puerto Rico': 'PR', - #'Virgin Islands': 'VI', - #'Guam': 'GU', + "Alabama": "AL", + "Alaska": "AK", + "Arizona": "AZ", + "Arkansas": "AR", + "California": "CA", + "Colorado": "CO", + "Connecticut": "CT", + "Delaware": "DE", + "District of Columbia": "DC", + "Florida": "FL", + "Georgia": "GA", + "Hawaii": "HI", + "Idaho": "ID", + "Illinois": "IL", + "Indiana": "IN", + "Iowa": "IA", + "Kansas": "KS", + "Kentucky": "KY", + "Louisiana": "LA", + "Maine": "ME", + "Maryland": "MD", + "Massachusetts": "MA", + "Michigan": "MI", + "Minnesota": "MN", + "Mississippi": "MS", + "Missouri": "MO", + "Montana": "MT", + "Nebraska": "NE", + "Nevada": "NV", + "New Hampshire": "NH", + "New Jersey": "NJ", + "New Mexico": "NM", + "New York": "NY", + "North Carolina": "NC", + "North Dakota": "ND", + "Ohio": "OH", + "Oklahoma": "OK", + "Oregon": "OR", + "Pennsylvania": "PA", + "Rhode Island": "RI", + "South Carolina": "SC", + "South Dakota": "SD", + "Tennessee": "TN", + "Texas": "TX", + "Utah": "UT", + "Vermont": "VT", + "Virginia": "VA", + "Washington": "WA", + "West Virginia": "WV", + "Wisconsin": "WI", + "Wyoming": "WY", + #'Puerto Rico': 'PR', + #'Virgin Islands': 'VI', + #'Guam': 'GU', } -sql_cdc = ''' +sql_cdc = """ INSERT INTO `cdc` (`date`, `page`, `state`, `num`) VALUES (%s, %s, %s, %s) ON DUPLICATE KEY UPDATE `num` = %s -''' +""" -sql_cdc_meta = ''' +sql_cdc_meta = """ INSERT INTO `cdc_meta` (`date`, `epiweek`, `state`, `total`) VALUES (%s, yearweek(%s, 6), %s, %s) ON DUPLICATE KEY UPDATE `total` = %s -''' +""" def upload(test_mode): - # connect - u, p = secrets.db.epi - cnx = mysql.connector.connect(user=u, password=p, database='epidata') - cur = cnx.cursor() - - # insert (or update) table `cdc` - def insert_cdc(date, page, state, num): - cur.execute(sql_cdc, (date, page, state, num, num)) - - # insert (or update) table `cdc_meta` - def insert_cdc_meta(date, state, total): - cur.execute(sql_cdc_meta, (date, date, state, total, total)) - - # loop over rows until the header row is found - def find_header(reader): - for row in reader: - if len(row) > 0 and row[0] == 'Date': - return True - return False - - # parse csv files for `cdc` and `cdc_meta` - def parse_csv(meta): - def handler(reader): - if not find_header(reader): - raise Exception('header not found') - count = 0 - cols = 3 if meta else 4 - for row in reader: - if len(row) != cols: - continue - if meta: - (a, c, d) = row - else: - (a, b, c, d) = row - c = c[:-16] - if c not in STATES: - continue - a = datetime.strptime(a, '%b %d, %Y').strftime('%Y-%m-%d') - c = STATES[c] - d = int(d) - if meta: - insert_cdc_meta(a, c, d) + # connect + u, p = secrets.db.epi + cnx = mysql.connector.connect(user=u, password=p, database="epidata") + cur = cnx.cursor() + + # insert (or update) table `cdc` + def insert_cdc(date, page, state, num): + cur.execute(sql_cdc, (date, page, state, num, num)) + + # insert (or update) table `cdc_meta` + def insert_cdc_meta(date, state, total): + cur.execute(sql_cdc_meta, (date, date, state, total, total)) + + # loop over rows until the header row is found + def find_header(reader): + for row in reader: + if len(row) > 0 and row[0] == "Date": + return True + return False + + # parse csv files for `cdc` and `cdc_meta` + def parse_csv(meta): + def 
handler(reader): + if not find_header(reader): + raise Exception("header not found") + count = 0 + cols = 3 if meta else 4 + for row in reader: + if len(row) != cols: + continue + if meta: + (a, c, d) = row + else: + (a, b, c, d) = row + c = c[:-16] + if c not in STATES: + continue + a = datetime.strptime(a, "%b %d, %Y").strftime("%Y-%m-%d") + c = STATES[c] + d = int(d) + if meta: + insert_cdc_meta(a, c, d) + else: + insert_cdc(a, b, c, d) + count += 1 + return count + + return handler + + # recursively open zip files + def parse_zip(zf, level=1): + for name in zf.namelist(): + prefix = " " * level + print(prefix, name) + if name[-4:] == ".zip": + with zf.open(name) as temp: + with ZipFile(io.BytesIO(temp.read())) as zf2: + parse_zip(zf2, level + 1) + elif name[-4:] == ".csv": + handler = None + if "Flu Pages by Region" in name: + handler = parse_csv(False) + elif "Regions for all CDC" in name: + handler = parse_csv(True) + else: + print(prefix, " (skipped)") + if handler is not None: + with zf.open(name) as temp: + count = handler(csv.reader(io.StringIO(str(temp.read(), "utf-8")))) + print(prefix, " %d rows" % count) + else: + print(prefix, " (ignored)") + + # find, parse, and move zip files + zip_files = glob.glob("/common/cdc_stage/*.zip") + print("searching...") + for f in zip_files: + print(" ", f) + print("parsing...") + for f in zip_files: + with ZipFile(f) as zf: + parse_zip(zf) + print("moving...") + for f in zip_files: + src = f + dst = os.path.join("/home/automation/cdc_page_stats/", os.path.basename(src)) + print(" ", src, "->", dst) + if test_mode: + print(" (test mode enabled - not moved)") else: - insert_cdc(a, b, c, d) - count += 1 - return count - return handler - - # recursively open zip files - def parse_zip(zf, level=1): - for name in zf.namelist(): - prefix = ' ' * level - print(prefix, name) - if name[-4:] == '.zip': - with zf.open(name) as temp: - with ZipFile(io.BytesIO(temp.read())) as zf2: - parse_zip(zf2, level + 1) - elif name[-4:] == '.csv': - handler = None - if 'Flu Pages by Region' in name: - handler = parse_csv(False) - elif 'Regions for all CDC' in name: - handler = parse_csv(True) - else: - print(prefix, ' (skipped)') - if handler is not None: - with zf.open(name) as temp: - count = handler(csv.reader(io.StringIO(str(temp.read(), 'utf-8')))) - print(prefix, ' %d rows' % count) - else: - print(prefix, ' (ignored)') - - # find, parse, and move zip files - zip_files = glob.glob('/common/cdc_stage/*.zip') - print('searching...') - for f in zip_files: - print(' ', f) - print('parsing...') - for f in zip_files: - with ZipFile(f) as zf: - parse_zip(zf) - print('moving...') - for f in zip_files: - src = f - dst = os.path.join('/home/automation/cdc_page_stats/', os.path.basename(src)) - print(' ', src, '->', dst) - if test_mode: - print(' (test mode enabled - not moved)') - else: - shutil.move(src, dst) - if not os.path.isfile(dst): - raise Exception('unable to move file') - - # disconnect - cur.close() - if not test_mode: - cnx.commit() - cnx.close() + shutil.move(src, dst) + if not os.path.isfile(dst): + raise Exception("unable to move file") + + # disconnect + cur.close() + if not test_mode: + cnx.commit() + cnx.close() def main(): - # args and usage - parser = argparse.ArgumentParser() - parser.add_argument('--test', '-t', default=False, action='store_true', help='dry run only') - args = parser.parse_args() + # args and usage + parser = argparse.ArgumentParser() + parser.add_argument("--test", "-t", default=False, action="store_true", help="dry run only") + 
args = parser.parse_args() - # make it happen - upload(args.test) + # make it happen + upload(args.test) -if __name__ == '__main__': - main() +if __name__ == "__main__": + main() diff --git a/src/acquisition/covid_hosp/common/database.py b/src/acquisition/covid_hosp/common/database.py index 4fd0981a1..57071bc8d 100644 --- a/src/acquisition/covid_hosp/common/database.py +++ b/src/acquisition/covid_hosp/common/database.py @@ -15,263 +15,247 @@ Columndef = namedtuple("Columndef", "csv_name sql_name dtype") -class Database: - def __init__(self, - connection, - table_name=None, - hhs_dataset_id=None, - columns_and_types=None, - key_columns=None, - additional_fields=None): - """Create a new Database object. - - Parameters - ---------- - connection - An open connection to a database. - table_name : str - The name of the table which holds the dataset. - hhs_dataset_id : str - The 9-character healthdata.gov identifier for this dataset. - columns_and_types : tuple[str, str, Callable] - List of 3-tuples of (CSV header name, SQL column name, data type) for - all the columns in the CSV file. - additional_fields : tuple[str] - List of 2-tuples of (value, SQL column name) fordditional fields to include - at the end of the row which are not present in the CSV data. - """ - - self.connection = connection - self.table_name = table_name - self.hhs_dataset_id = hhs_dataset_id - self.publication_col_name = "issue" if table_name == 'covid_hosp_state_timeseries' else \ - 'publication_date' - self.columns_and_types = { - c.csv_name: c - for c in (columns_and_types if columns_and_types is not None else []) - } - self.key_columns = key_columns if key_columns is not None else [] - self.additional_fields = additional_fields if additional_fields is not None else [] - - @classmethod - def logger(database_class): - return get_structured_logger(f"{database_class.__module__}") - - @classmethod - @contextmanager - def connect(database_class, mysql_connector_impl=mysql.connector): - """Connect to a database and provide the connection as a context manager. - - As long as the context manager exits normally, the connection's transaction - will be committed. Otherwise, if the context is exited by an Exception, the - transaction will be rolled back. - - In any case, the connection will be gracefully closed upon exiting the - context manager. - """ - - # connect to the database - user, password = secrets.db.epi - connection = mysql_connector_impl.connect( - host=secrets.db.host, - user=user, - password=password, - database='epidata') - - try: - # provide the connection to the context manager - yield database_class(connection) - - # rollback by default; the following commit will only take place if no - # exception was raised in calling code - connection.commit() - finally: - # close the connection in any case - connection.close() - - @contextmanager - def new_cursor(self): - """Create and provide a database cursor as a context manager. - - The cursor will be gracefully closed upon exiting the context manager. - """ - - cursor = self.connection.cursor() - try: - yield cursor - finally: - cursor.close() - - def contains_revision(self, revision): - """Return whether the given revision already exists in the database. - - Parameters - ---------- - revision : str - Unique revision string. - - Returns - ------- - bool - True iff the revision already exists. 
- """ - - with self.new_cursor() as cursor: - cursor.execute(''' - SELECT - count(1) > 0 - FROM - `covid_hosp_meta` - WHERE - `hhs_dataset_id` = %s AND `revision_timestamp` = %s - ''', (self.hhs_dataset_id, revision)) - for (result,) in cursor: - return bool(result) - - def insert_metadata(self, publication_date, revision, meta_json, logger=False): - """Add revision metadata to the database. - - Parameters - ---------- - publication_date : int - Date when the dataset was published in YYYYMMDD format. - revision : str - Unique revision string. - meta_json : str - Metadata serialized as a JSON string. - logger structlog.Logger [optional; default False] - Logger to receive messages - """ - - with self.new_cursor() as cursor: - cursor.execute(''' - INSERT INTO - `covid_hosp_meta` ( - `dataset_name`, - `hhs_dataset_id`, - `publication_date`, - `revision_timestamp`, - `metadata_json`, - `acquisition_datetime` - ) - VALUES - (%s, %s, %s, %s, %s, NOW()) - ''', (self.table_name, self.hhs_dataset_id, publication_date, revision, meta_json)) - - def insert_dataset(self, publication_date, dataframe, logger=False): - """Add a dataset to the database. - - Parameters - ---------- - publication_date : int - Date when the dataset was published in YYYYMMDD format. - dataframe : pandas.DataFrame - The dataset. - logger structlog.Logger [optional; default False] - Logger to receive messages. - """ - dataframe_columns_and_types = [ - x for x in self.columns_and_types.values() if x.csv_name in dataframe.columns - ] - - def nan_safe_dtype(dtype, value): - if isinstance(value, float) and math.isnan(value): - return None - return dtype(value) - - # first convert keys and save the results; we'll need them later - for csv_name in self.key_columns: - dataframe.loc[:, csv_name] = dataframe[csv_name].map(self.columns_and_types[csv_name].dtype) - - num_columns = 2 + len(dataframe_columns_and_types) + len(self.additional_fields) - value_placeholders = ', '.join(['%s'] * num_columns) - columns = ', '.join(f'`{i.sql_name}`' for i in dataframe_columns_and_types + self.additional_fields) - sql = f'INSERT INTO `{self.table_name}` (`id`, `{self.publication_col_name}`, {columns}) ' \ - f'VALUES ({value_placeholders})' - id_and_publication_date = (0, publication_date) - if logger: - logger.info('updating values', count=len(dataframe.index)) - n = 0 - many_values = [] - with self.new_cursor() as cursor: - for index, row in dataframe.iterrows(): - values = [] - for c in dataframe_columns_and_types: - values.append(nan_safe_dtype(c.dtype, row[c.csv_name])) - many_values.append(id_and_publication_date + - tuple(values) + - tuple(i.csv_name for i in self.additional_fields)) - n += 1 - # insert in batches because one at a time is slow and all at once makes - # the connection drop :( - if n % 5_000 == 0: - try: - cursor.executemany(sql, many_values) - many_values = [] - except Exception as e: +class Database: + def __init__(self, connection, table_name=None, hhs_dataset_id=None, columns_and_types=None, key_columns=None, additional_fields=None): + """Create a new Database object. + + Parameters + ---------- + connection + An open connection to a database. + table_name : str + The name of the table which holds the dataset. + hhs_dataset_id : str + The 9-character healthdata.gov identifier for this dataset. + columns_and_types : tuple[str, str, Callable] + List of 3-tuples of (CSV header name, SQL column name, data type) for + all the columns in the CSV file. 
+ additional_fields : tuple[str] + List of 2-tuples of (value, SQL column name) fordditional fields to include + at the end of the row which are not present in the CSV data. + """ + + self.connection = connection + self.table_name = table_name + self.hhs_dataset_id = hhs_dataset_id + self.publication_col_name = "issue" if table_name == "covid_hosp_state_timeseries" else "publication_date" + self.columns_and_types = {c.csv_name: c for c in (columns_and_types if columns_and_types is not None else [])} + self.key_columns = key_columns if key_columns is not None else [] + self.additional_fields = additional_fields if additional_fields is not None else [] + + @classmethod + def logger(database_class): + return get_structured_logger(f"{database_class.__module__}") + + @classmethod + @contextmanager + def connect(database_class, mysql_connector_impl=mysql.connector): + """Connect to a database and provide the connection as a context manager. + + As long as the context manager exits normally, the connection's transaction + will be committed. Otherwise, if the context is exited by an Exception, the + transaction will be rolled back. + + In any case, the connection will be gracefully closed upon exiting the + context manager. + """ + + # connect to the database + user, password = secrets.db.epi + connection = mysql_connector_impl.connect(host=secrets.db.host, user=user, password=password, database="epidata") + + try: + # provide the connection to the context manager + yield database_class(connection) + + # rollback by default; the following commit will only take place if no + # exception was raised in calling code + connection.commit() + finally: + # close the connection in any case + connection.close() + + @contextmanager + def new_cursor(self): + """Create and provide a database cursor as a context manager. + + The cursor will be gracefully closed upon exiting the context manager. + """ + + cursor = self.connection.cursor() + try: + yield cursor + finally: + cursor.close() + + def contains_revision(self, revision): + """Return whether the given revision already exists in the database. + + Parameters + ---------- + revision : str + Unique revision string. + + Returns + ------- + bool + True iff the revision already exists. + """ + + with self.new_cursor() as cursor: + cursor.execute( + """ + SELECT + count(1) > 0 + FROM + `covid_hosp_meta` + WHERE + `hhs_dataset_id` = %s AND `revision_timestamp` = %s + """, + (self.hhs_dataset_id, revision), + ) + for (result,) in cursor: + return bool(result) + + def insert_metadata(self, publication_date, revision, meta_json, logger=False): + """Add revision metadata to the database. + + Parameters + ---------- + publication_date : int + Date when the dataset was published in YYYYMMDD format. + revision : str + Unique revision string. + meta_json : str + Metadata serialized as a JSON string. + logger structlog.Logger [optional; default False] + Logger to receive messages + """ + + with self.new_cursor() as cursor: + cursor.execute( + """ + INSERT INTO + `covid_hosp_meta` ( + `dataset_name`, + `hhs_dataset_id`, + `publication_date`, + `revision_timestamp`, + `metadata_json`, + `acquisition_datetime` + ) + VALUES + (%s, %s, %s, %s, %s, NOW()) + """, + (self.table_name, self.hhs_dataset_id, publication_date, revision, meta_json), + ) + + def insert_dataset(self, publication_date, dataframe, logger=False): + """Add a dataset to the database. + + Parameters + ---------- + publication_date : int + Date when the dataset was published in YYYYMMDD format. 
+ dataframe : pandas.DataFrame + The dataset. + logger structlog.Logger [optional; default False] + Logger to receive messages. + """ + dataframe_columns_and_types = [x for x in self.columns_and_types.values() if x.csv_name in dataframe.columns] + + def nan_safe_dtype(dtype, value): + if isinstance(value, float) and math.isnan(value): + return None + return dtype(value) + + # first convert keys and save the results; we'll need them later + for csv_name in self.key_columns: + dataframe.loc[:, csv_name] = dataframe[csv_name].map(self.columns_and_types[csv_name].dtype) + + num_columns = 2 + len(dataframe_columns_and_types) + len(self.additional_fields) + value_placeholders = ", ".join(["%s"] * num_columns) + columns = ", ".join(f"`{i.sql_name}`" for i in dataframe_columns_and_types + self.additional_fields) + sql = f"INSERT INTO `{self.table_name}` (`id`, `{self.publication_col_name}`, {columns}) " f"VALUES ({value_placeholders})" + id_and_publication_date = (0, publication_date) + if logger: + logger.info("updating values", count=len(dataframe.index)) + n = 0 + many_values = [] + with self.new_cursor() as cursor: + for index, row in dataframe.iterrows(): + values = [] + for c in dataframe_columns_and_types: + values.append(nan_safe_dtype(c.dtype, row[c.csv_name])) + many_values.append(id_and_publication_date + tuple(values) + tuple(i.csv_name for i in self.additional_fields)) + n += 1 + # insert in batches because one at a time is slow and all at once makes + # the connection drop :( + if n % 5_000 == 0: + try: + cursor.executemany(sql, many_values) + many_values = [] + except Exception as e: + if logger: + logger.error("error on insert", publ_date=publication_date, in_lines=(n - 5_000, n), index=index, values=values, exception=e) + raise e + # insert final batch + if many_values: + cursor.executemany(sql, many_values) + + # deal with non/seldomly updated columns used like a fk table (if this database needs it) + if hasattr(self, "AGGREGATE_KEY_COLS"): + if logger: + logger.info("updating keys") + ak_cols = self.AGGREGATE_KEY_COLS + + # restrict data to just the key columns and remove duplicate rows + # sort by key columns to ensure that the last ON DUPLICATE KEY overwrite + # uses the most-recent aggregate key information + ak_data = dataframe[set(ak_cols + self.key_columns)].sort_values(self.key_columns)[ak_cols].drop_duplicates() + # cast types + for col in ak_cols: + ak_data[col] = ak_data[col].map(lambda value: nan_safe_dtype(self.columns_and_types[col].dtype, value)) + # fix NULLs + ak_data = ak_data.to_numpy(na_value=None).tolist() + + # create string of tick-quoted and comma-seperated column list + ak_cols_str = ",".join(f"`{col}`" for col in ak_cols) + # ...and ticked and comma-sep'd "column=column" list for ON UPDATE (to keep only the most recent values for each pk) + ak_updates_str = ",".join(f"`{col}`=v.{col}" for col in ak_cols) + # ...and string of VALUES placeholders + values_str = ",".join(["%s"] * len(ak_cols)) + # use aggregate key table alias + ak_table = self.table_name + "_key" + # assemble full SQL statement + ak_insert_sql = f"INSERT INTO `{ak_table}` ({ak_cols_str}) VALUES ({values_str}) AS v ON DUPLICATE KEY UPDATE {ak_updates_str}" + if logger: + logger.info("database query", sql=ak_insert_sql) + + # commit the data + with self.new_cursor() as cur: + cur.executemany(ak_insert_sql, ak_data) + + def get_max_issue(self, logger=False): + """Fetch the most recent issue. + + This is used to bookend what updates we pull in from the HHS metadata. 
+ """ + with self.new_cursor() as cursor: + cursor.execute( + f""" + SELECT + max(publication_date) + from + `covid_hosp_meta` + WHERE + hhs_dataset_id = "{self.hhs_dataset_id}" + """ + ) + for (result,) in cursor: + if result is not None: + return pd.Timestamp(str(result)) if logger: - logger.error('error on insert', publ_date=publication_date, in_lines=(n-5_000, n), index=index, values=values, exception=e) - raise e - # insert final batch - if many_values: - cursor.executemany(sql, many_values) - - # deal with non/seldomly updated columns used like a fk table (if this database needs it) - if hasattr(self, 'AGGREGATE_KEY_COLS'): - if logger: - logger.info('updating keys') - ak_cols = self.AGGREGATE_KEY_COLS - - # restrict data to just the key columns and remove duplicate rows - # sort by key columns to ensure that the last ON DUPLICATE KEY overwrite - # uses the most-recent aggregate key information - ak_data = (dataframe[set(ak_cols + self.key_columns)] - .sort_values(self.key_columns)[ak_cols] - .drop_duplicates()) - # cast types - for col in ak_cols: - ak_data[col] = ak_data[col].map( - lambda value: nan_safe_dtype(self.columns_and_types[col].dtype, value) - ) - # fix NULLs - ak_data = ak_data.to_numpy(na_value=None).tolist() - - # create string of tick-quoted and comma-seperated column list - ak_cols_str = ','.join(f'`{col}`' for col in ak_cols) - # ...and ticked and comma-sep'd "column=column" list for ON UPDATE (to keep only the most recent values for each pk) - ak_updates_str = ','.join(f'`{col}`=v.{col}' for col in ak_cols) - # ...and string of VALUES placeholders - values_str = ','.join( ['%s'] * len(ak_cols) ) - # use aggregate key table alias - ak_table = self.table_name + '_key' - # assemble full SQL statement - ak_insert_sql = f'INSERT INTO `{ak_table}` ({ak_cols_str}) VALUES ({values_str}) AS v ON DUPLICATE KEY UPDATE {ak_updates_str}' - if logger: - logger.info("database query", sql=ak_insert_sql) - - # commit the data - with self.new_cursor() as cur: - cur.executemany(ak_insert_sql, ak_data) - - - def get_max_issue(self, logger=False): - """Fetch the most recent issue. - - This is used to bookend what updates we pull in from the HHS metadata. - """ - with self.new_cursor() as cursor: - cursor.execute(f''' - SELECT - max(publication_date) - from - `covid_hosp_meta` - WHERE - hhs_dataset_id = "{self.hhs_dataset_id}" - ''') - for (result,) in cursor: - if result is not None: - return pd.Timestamp(str(result)) - if logger: - logger.warn("get_max_issue", msg="no matching results in meta table; returning 1900/1/1 epoch") - return pd.Timestamp("1900/1/1") + logger.warn("get_max_issue", msg="no matching results in meta table; returning 1900/1/1 epoch") + return pd.Timestamp("1900/1/1") diff --git a/src/acquisition/covid_hosp/common/network.py b/src/acquisition/covid_hosp/common/network.py index 7b6228f16..ff80c4c37 100644 --- a/src/acquisition/covid_hosp/common/network.py +++ b/src/acquisition/covid_hosp/common/network.py @@ -3,52 +3,51 @@ class Network: - METADATA_URL_TEMPLATE = \ - 'https://healthdata.gov/api/views/%s/rows.csv' - - def fetch_metadata_for_dataset(dataset_id, logger=False): - """Download and return metadata. - - Parameters - ---------- - dataset_id : str - healthdata.gov dataset identifier of the dataset. - logger : structlog.Logger [optional; default False] - Logger to receive messages. - - Returns - ------- - object - The metadata object. 
- """ - url = Network.METADATA_URL_TEMPLATE % dataset_id - if logger: - logger.info('fetching metadata', url=url) - df = Network.fetch_dataset(url) - df["Update Date"] = pandas.to_datetime(df["Update Date"]) - df.sort_values("Update Date", inplace=True) - df.set_index("Update Date", inplace=True) - return df - - def fetch_dataset(url, pandas_impl=pandas, logger=False): - """Download and return a dataset. - - Type inference is disabled in favor of explicit type casting at the - database abstraction layer. Pandas behavior is to represent non-missing - values as strings and missing values as `math.nan`. - - Parameters - ---------- - url : str - URL to the dataset in CSV format. - logger : structlog.Logger [optional; default False] - Logger to receive messages. - - Returns - ------- - pandas.DataFrame - The dataset. - """ - if logger: - logger.info('fetching dataset', url=url) - return pandas_impl.read_csv(url, dtype=str) + METADATA_URL_TEMPLATE = "https://healthdata.gov/api/views/%s/rows.csv" + + def fetch_metadata_for_dataset(dataset_id, logger=False): + """Download and return metadata. + + Parameters + ---------- + dataset_id : str + healthdata.gov dataset identifier of the dataset. + logger : structlog.Logger [optional; default False] + Logger to receive messages. + + Returns + ------- + object + The metadata object. + """ + url = Network.METADATA_URL_TEMPLATE % dataset_id + if logger: + logger.info("fetching metadata", url=url) + df = Network.fetch_dataset(url) + df["Update Date"] = pandas.to_datetime(df["Update Date"]) + df.sort_values("Update Date", inplace=True) + df.set_index("Update Date", inplace=True) + return df + + def fetch_dataset(url, pandas_impl=pandas, logger=False): + """Download and return a dataset. + + Type inference is disabled in favor of explicit type casting at the + database abstraction layer. Pandas behavior is to represent non-missing + values as strings and missing values as `math.nan`. + + Parameters + ---------- + url : str + URL to the dataset in CSV format. + logger : structlog.Logger [optional; default False] + Logger to receive messages. + + Returns + ------- + pandas.DataFrame + The dataset. 
+ """ + if logger: + logger.info("fetching dataset", url=url) + return pandas_impl.read_csv(url, dtype=str) diff --git a/src/acquisition/covid_hosp/common/test_utils.py b/src/acquisition/covid_hosp/common/test_utils.py index 2a737b383..b5fb9384a 100644 --- a/src/acquisition/covid_hosp/common/test_utils.py +++ b/src/acquisition/covid_hosp/common/test_utils.py @@ -17,43 +17,41 @@ class UnitTestUtils: - # path to `covid_hosp` test data, relative to the top of the repo - PATH_TO_TESTDATA = 'testdata/acquisition/covid_hosp' - - def __init__(self, abs_path_to_caller): - # navigate to the root of the delphi-epidata repo - dataset_name = None - current_path = Path(abs_path_to_caller) - while not (current_path / 'testdata').exists(): - - # bail if we made it all the way to root - if not current_path.name: - raise Exception('unable to determine path to delphi-epidata repo') - - # looking for a path like .../acquisition/covid_hosp/ - if current_path.parent.name == 'covid_hosp': - dataset_name = current_path.name - - # move up one level - current_path = current_path.parent - - # the loop above stops at the top of the repo - path_to_repo = current_path - - if not dataset_name: - raise Exception('unable to determine name of dataset under test') - - # path dataset-specific test data, relative to the root of the repo - self.data_dir = ( - path_to_repo / UnitTestUtils.PATH_TO_TESTDATA / dataset_name - ).resolve() - - def load_sample_metadata(self, metadata_name='metadata.csv'): - df = pandas.read_csv(self.data_dir / metadata_name, dtype=str) - df["Update Date"] = pandas.to_datetime(df["Update Date"]) - df.sort_values("Update Date", inplace=True) - df.set_index("Update Date", inplace=True) - return df - - def load_sample_dataset(self, dataset_name='dataset.csv'): - return pandas.read_csv(self.data_dir / dataset_name, dtype=str) + # path to `covid_hosp` test data, relative to the top of the repo + PATH_TO_TESTDATA = "testdata/acquisition/covid_hosp" + + def __init__(self, abs_path_to_caller): + # navigate to the root of the delphi-epidata repo + dataset_name = None + current_path = Path(abs_path_to_caller) + while not (current_path / "testdata").exists(): + + # bail if we made it all the way to root + if not current_path.name: + raise Exception("unable to determine path to delphi-epidata repo") + + # looking for a path like .../acquisition/covid_hosp/ + if current_path.parent.name == "covid_hosp": + dataset_name = current_path.name + + # move up one level + current_path = current_path.parent + + # the loop above stops at the top of the repo + path_to_repo = current_path + + if not dataset_name: + raise Exception("unable to determine name of dataset under test") + + # path dataset-specific test data, relative to the root of the repo + self.data_dir = (path_to_repo / UnitTestUtils.PATH_TO_TESTDATA / dataset_name).resolve() + + def load_sample_metadata(self, metadata_name="metadata.csv"): + df = pandas.read_csv(self.data_dir / metadata_name, dtype=str) + df["Update Date"] = pandas.to_datetime(df["Update Date"]) + df.sort_values("Update Date", inplace=True) + df.set_index("Update Date", inplace=True) + return df + + def load_sample_dataset(self, dataset_name="dataset.csv"): + return pandas.read_csv(self.data_dir / dataset_name, dtype=str) diff --git a/src/acquisition/covid_hosp/common/utils.py b/src/acquisition/covid_hosp/common/utils.py index 5f718ad69..04363755e 100644 --- a/src/acquisition/covid_hosp/common/utils.py +++ b/src/acquisition/covid_hosp/common/utils.py @@ -8,219 +8,211 @@ class 
CovidHospException(Exception): - """Exception raised exclusively by `covid_hosp` utilities.""" + """Exception raised exclusively by `covid_hosp` utilities.""" class Utils: - # regex to extract issue date from revision field - # example revision: "Mon, 11/16/2020 - 00:55" - REVISION_PATTERN = re.compile(r'^.*\s(\d+)/(\d+)/(\d+)\s.*$') - - def launch_if_main(entrypoint, runtime_name): - """Call the given function in the main entry point, otherwise no-op.""" - - if runtime_name == '__main__': - entrypoint() - - def int_from_date(date): - """Convert a YYYY/MM/DD date from a string to a YYYYMMDD int. - - Parameters - ---------- - date : str - Date in "YYYY/MM/DD.*" format. - - Returns - ------- - int - Date in YYYYMMDD format. - """ - if isinstance(date, str): - return int(date[:10].replace('/', '').replace('-', '')) - return date - - def parse_bool(value): - """Convert a string to a boolean. - - Parameters - ---------- - value : str - Boolean-like value, like "true" or "false". - - Returns - ------- - bool - If the string contains some version of "true" or "false". - None - If the string is None or empty. - - Raises - ------ - CovidHospException - If the string constains something other than a version of "true" or - "false". - """ - - if not value: - return None - if value.lower() == 'true': - return True - if value.lower() == 'false': - return False - raise CovidHospException(f'cannot convert "{value}" to bool') - - def limited_string_fn(length): - def limited_string(value): - value = str(value) - if len(value) > length: - raise CovidHospException(f"Value '{value}':{len(value)} longer than max {length}") - return value - return limited_string - - GEOCODE_LENGTH = 32 - GEOCODE_PATTERN = re.compile(r'POINT \((-?[0-9.]+) (-?[0-9.]+)\)') - def limited_geocode(value): - if len(value) < Utils.GEOCODE_LENGTH: - return value - # otherwise parse and set precision to 6 decimal places - m = Utils.GEOCODE_PATTERN.match(value) - if not m: - raise CovidHospException(f"Couldn't parse geocode '{value}'") - return f'POINT ({" ".join(f"{float(x):.6f}" for x in m.groups())})' - - def issues_to_fetch(metadata, newer_than, older_than, logger=False): - """ - Construct all issue dates and URLs to be ingested based on metadata. - - Parameters - ---------- - metadata pd.DataFrame - HHS metadata indexed by issue date and with column "Archive Link" - newer_than Date - Lower bound (exclusive) of days to get issues for. 
- older_than Date - Upper bound (exclusive) of days to get issues for - logger structlog.Logger [optional; default False] - Logger to receive messages - Returns - ------- - Dictionary of {issue day: list of (download urls, index)} - for issues after newer_than and before older_than - """ - daily_issues = {} - n_beyond = 0 - n_selected = 0 - for index in sorted(set(metadata.index)): - day = index.date() - if day > newer_than and day < older_than: - urls = metadata.loc[index, "Archive Link"] - urls_list = [(urls, index)] if isinstance(urls, str) else [(url, index) for url in urls] - if day not in daily_issues: - daily_issues[day] = urls_list - else: - daily_issues[day] += urls_list - n_selected += len(urls_list) - elif day >= older_than: - n_beyond += 1 - if logger: - if n_beyond > 0: - logger.info("issues available beyond selection", on_or_newer=older_than, count=n_beyond) - logger.info("issues selected", newer_than=str(newer_than), older_than=str(older_than), count=n_selected) - return daily_issues - - @staticmethod - def merge_by_key_cols(dfs, key_cols, logger=False): - """Merge a list of data frames as a series of updates. - - Parameters: - ----------- - dfs : list(pd.DataFrame) - Data frames to merge, ordered from earliest to latest. - key_cols: list(str) - Columns to use as the index. - logger structlog.Logger [optional; default False] - Logger to receive messages - - Returns a single data frame containing the most recent data for each state+date. - """ - - dfs = [df.set_index(key_cols) for df in dfs - if not all(k in df.index.names for k in key_cols)] - result = dfs[0] - if logger and len(dfs) > 7: - logger.warning( - "expensive operation", - msg="concatenating more than 7 files may result in long running times", - count=len(dfs)) - for df in dfs[1:]: - # update values for existing keys - result.update(df) - # add any new keys. - ## repeated concatenation in pandas is expensive, but (1) we don't expect - ## batch sizes to be terribly large (7 files max) and (2) this way we can - ## more easily capture the next iteration's updates to any new keys - result_index_set = set(result.index.to_list()) - new_rows = df.loc[[i for i in df.index.to_list() if i not in result_index_set]] - result = pd.concat([result, new_rows]) - - # convert the index rows back to columns - return result.reset_index(level=key_cols) - - @staticmethod - def update_dataset(database, network, newer_than=None, older_than=None): - """Acquire the most recent dataset, unless it was previously acquired. - - Parameters - ---------- - database : delphi.epidata.acquisition.covid_hosp.common.database.Database - A `Database` subclass for a particular dataset. - network : delphi.epidata.acquisition.covid_hosp.common.network.Network - A `Network` subclass for a particular dataset. - newer_than : date - Lower bound (exclusive) of days to get issues for. - older_than : date - Upper bound (exclusive) of days to get issues for - - Returns - ------- - bool - Whether a new dataset was acquired. 
- """ - logger = database.logger() - - metadata = network.fetch_metadata(logger=logger) - datasets = [] - with database.connect() as db: - max_issue = db.get_max_issue(logger=logger) - - older_than = datetime.datetime.today().date() if newer_than is None else older_than - newer_than = max_issue if newer_than is None else newer_than - daily_issues = Utils.issues_to_fetch(metadata, newer_than, older_than, logger=logger) - if not daily_issues: - logger.info("no new issues; nothing to do") - return False - for issue, revisions in daily_issues.items(): - issue_int = int(issue.strftime("%Y%m%d")) - # download the dataset and add it to the database - dataset = Utils.merge_by_key_cols([network.fetch_dataset(url, logger=logger) for url, _ in revisions], - db.KEY_COLS, - logger=logger) - # add metadata to the database - all_metadata = [] - for url, index in revisions: - all_metadata.append((url, metadata.loc[index].reset_index().to_json())) - datasets.append(( - issue_int, - dataset, - all_metadata - )) - with database.connect() as db: - for issue_int, dataset, all_metadata in datasets: - db.insert_dataset(issue_int, dataset, logger=logger) - for url, metadata_json in all_metadata: - db.insert_metadata(issue_int, url, metadata_json, logger=logger) - logger.info("acquired rows", count=len(dataset)) - - # note that the transaction is committed by exiting the `with` block - return True + # regex to extract issue date from revision field + # example revision: "Mon, 11/16/2020 - 00:55" + REVISION_PATTERN = re.compile(r"^.*\s(\d+)/(\d+)/(\d+)\s.*$") + + def launch_if_main(entrypoint, runtime_name): + """Call the given function in the main entry point, otherwise no-op.""" + + if runtime_name == "__main__": + entrypoint() + + def int_from_date(date): + """Convert a YYYY/MM/DD date from a string to a YYYYMMDD int. + + Parameters + ---------- + date : str + Date in "YYYY/MM/DD.*" format. + + Returns + ------- + int + Date in YYYYMMDD format. + """ + if isinstance(date, str): + return int(date[:10].replace("/", "").replace("-", "")) + return date + + def parse_bool(value): + """Convert a string to a boolean. + + Parameters + ---------- + value : str + Boolean-like value, like "true" or "false". + + Returns + ------- + bool + If the string contains some version of "true" or "false". + None + If the string is None or empty. + + Raises + ------ + CovidHospException + If the string constains something other than a version of "true" or + "false". + """ + + if not value: + return None + if value.lower() == "true": + return True + if value.lower() == "false": + return False + raise CovidHospException(f'cannot convert "{value}" to bool') + + def limited_string_fn(length): + def limited_string(value): + value = str(value) + if len(value) > length: + raise CovidHospException(f"Value '{value}':{len(value)} longer than max {length}") + return value + + return limited_string + + GEOCODE_LENGTH = 32 + GEOCODE_PATTERN = re.compile(r"POINT \((-?[0-9.]+) (-?[0-9.]+)\)") + + def limited_geocode(value): + if len(value) < Utils.GEOCODE_LENGTH: + return value + # otherwise parse and set precision to 6 decimal places + m = Utils.GEOCODE_PATTERN.match(value) + if not m: + raise CovidHospException(f"Couldn't parse geocode '{value}'") + return f'POINT ({" ".join(f"{float(x):.6f}" for x in m.groups())})' + + def issues_to_fetch(metadata, newer_than, older_than, logger=False): + """ + Construct all issue dates and URLs to be ingested based on metadata. 
+ + Parameters + ---------- + metadata pd.DataFrame + HHS metadata indexed by issue date and with column "Archive Link" + newer_than Date + Lower bound (exclusive) of days to get issues for. + older_than Date + Upper bound (exclusive) of days to get issues for + logger structlog.Logger [optional; default False] + Logger to receive messages + Returns + ------- + Dictionary of {issue day: list of (download urls, index)} + for issues after newer_than and before older_than + """ + daily_issues = {} + n_beyond = 0 + n_selected = 0 + for index in sorted(set(metadata.index)): + day = index.date() + if day > newer_than and day < older_than: + urls = metadata.loc[index, "Archive Link"] + urls_list = [(urls, index)] if isinstance(urls, str) else [(url, index) for url in urls] + if day not in daily_issues: + daily_issues[day] = urls_list + else: + daily_issues[day] += urls_list + n_selected += len(urls_list) + elif day >= older_than: + n_beyond += 1 + if logger: + if n_beyond > 0: + logger.info("issues available beyond selection", on_or_newer=older_than, count=n_beyond) + logger.info("issues selected", newer_than=str(newer_than), older_than=str(older_than), count=n_selected) + return daily_issues + + @staticmethod + def merge_by_key_cols(dfs, key_cols, logger=False): + """Merge a list of data frames as a series of updates. + + Parameters: + ----------- + dfs : list(pd.DataFrame) + Data frames to merge, ordered from earliest to latest. + key_cols: list(str) + Columns to use as the index. + logger structlog.Logger [optional; default False] + Logger to receive messages + + Returns a single data frame containing the most recent data for each state+date. + """ + + dfs = [df.set_index(key_cols) for df in dfs if not all(k in df.index.names for k in key_cols)] + result = dfs[0] + if logger and len(dfs) > 7: + logger.warning("expensive operation", msg="concatenating more than 7 files may result in long running times", count=len(dfs)) + for df in dfs[1:]: + # update values for existing keys + result.update(df) + # add any new keys. + ## repeated concatenation in pandas is expensive, but (1) we don't expect + ## batch sizes to be terribly large (7 files max) and (2) this way we can + ## more easily capture the next iteration's updates to any new keys + result_index_set = set(result.index.to_list()) + new_rows = df.loc[[i for i in df.index.to_list() if i not in result_index_set]] + result = pd.concat([result, new_rows]) + + # convert the index rows back to columns + return result.reset_index(level=key_cols) + + @staticmethod + def update_dataset(database, network, newer_than=None, older_than=None): + """Acquire the most recent dataset, unless it was previously acquired. + + Parameters + ---------- + database : delphi.epidata.acquisition.covid_hosp.common.database.Database + A `Database` subclass for a particular dataset. + network : delphi.epidata.acquisition.covid_hosp.common.network.Network + A `Network` subclass for a particular dataset. + newer_than : date + Lower bound (exclusive) of days to get issues for. + older_than : date + Upper bound (exclusive) of days to get issues for + + Returns + ------- + bool + Whether a new dataset was acquired. 
+ """ + logger = database.logger() + + metadata = network.fetch_metadata(logger=logger) + datasets = [] + with database.connect() as db: + max_issue = db.get_max_issue(logger=logger) + + older_than = datetime.datetime.today().date() if newer_than is None else older_than + newer_than = max_issue if newer_than is None else newer_than + daily_issues = Utils.issues_to_fetch(metadata, newer_than, older_than, logger=logger) + if not daily_issues: + logger.info("no new issues; nothing to do") + return False + for issue, revisions in daily_issues.items(): + issue_int = int(issue.strftime("%Y%m%d")) + # download the dataset and add it to the database + dataset = Utils.merge_by_key_cols([network.fetch_dataset(url, logger=logger) for url, _ in revisions], db.KEY_COLS, logger=logger) + # add metadata to the database + all_metadata = [] + for url, index in revisions: + all_metadata.append((url, metadata.loc[index].reset_index().to_json())) + datasets.append((issue_int, dataset, all_metadata)) + with database.connect() as db: + for issue_int, dataset, all_metadata in datasets: + db.insert_dataset(issue_int, dataset, logger=logger) + for url, metadata_json in all_metadata: + db.insert_metadata(issue_int, url, metadata_json, logger=logger) + logger.info("acquired rows", count=len(dataset)) + + # note that the transaction is committed by exiting the `with` block + return True diff --git a/src/acquisition/covid_hosp/facility/database.py b/src/acquisition/covid_hosp/facility/database.py index 172f32dc4..1fa642c72 100644 --- a/src/acquisition/covid_hosp/facility/database.py +++ b/src/acquisition/covid_hosp/facility/database.py @@ -7,213 +7,151 @@ class Database(BaseDatabase): - TABLE_NAME = 'covid_hosp_facility' - KEY_COLS = ['hospital_pk', 'collection_week'] - AGGREGATE_KEY_COLS = ['address', 'ccn', 'city', 'fips_code', 'geocoded_hospital_address', 'hhs_ids', 'hospital_name', 'hospital_pk', 'hospital_subtype', 'is_metro_micro', 'state', 'zip'] - # These are 3-tuples of ( - # CSV header name, - # SQL db column name, - # data type - # ) for all the columns in the CSV file. - # Note that the corresponding database column names may be shorter - # due to constraints on the length of column names. See - # /src/ddl/covid_hosp.sql for more information. 
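# --- Illustrative sketch, not part of this patch ---
# The comment above describes the Columndef scheme used throughout these
# database modules: each entry maps a CSV header to a (possibly shortened)
# SQL column name plus a casting callable. A minimal, self-contained example
# of applying one such 3-tuple to a raw CSV cell, mirroring the nan_safe_dtype
# handling in Database.insert_dataset; the sample column and row values here
# are hypothetical.
import math
from collections import namedtuple

Columndef = namedtuple("Columndef", "csv_name sql_name dtype")

def nan_safe_cast(dtype, value):
    # CSV cells read with dtype=str arrive as strings; missing cells are NaN
    if isinstance(value, float) and math.isnan(value):
        return None
    return dtype(value)

coldef = Columndef("total_beds_7_day_avg", "total_beds_7_day_avg", float)
row = {"total_beds_7_day_avg": "123.4"}  # hypothetical CSV row
sql_value = nan_safe_cast(coldef.dtype, row[coldef.csv_name])
print(coldef.sql_name, sql_value)  # -> total_beds_7_day_avg 123.4
# --- end sketch ---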
- ORDERED_CSV_COLUMNS = [ - Columndef('hospital_pk', 'hospital_pk', str), - Columndef('collection_week', 'collection_week', Utils.int_from_date), - Columndef('address', 'address', str), - Columndef('all_adult_hospital_beds_7_day_avg', 'all_adult_hospital_beds_7_day_avg', float), - Columndef('all_adult_hospital_beds_7_day_coverage', 'all_adult_hospital_beds_7_day_coverage', int), - Columndef('all_adult_hospital_beds_7_day_sum', 'all_adult_hospital_beds_7_day_sum', int), - Columndef('all_adult_hospital_inpatient_bed_occupied_7_day_avg', - 'all_adult_hospital_inpatient_bed_occupied_7_day_avg', float), - Columndef('all_adult_hospital_inpatient_bed_occupied_7_day_coverage', - 'all_adult_hospital_inpatient_bed_occupied_7_day_coverage', int), - Columndef('all_adult_hospital_inpatient_bed_occupied_7_day_sum', - 'all_adult_hospital_inpatient_bed_occupied_7_day_sum', int), - Columndef('all_adult_hospital_inpatient_beds_7_day_avg', 'all_adult_hospital_inpatient_beds_7_day_avg', - float), - Columndef('all_adult_hospital_inpatient_beds_7_day_coverage', - 'all_adult_hospital_inpatient_beds_7_day_coverage', int), - Columndef('all_adult_hospital_inpatient_beds_7_day_sum', 'all_adult_hospital_inpatient_beds_7_day_sum', - int), - Columndef('ccn', 'ccn', str), - Columndef('city', 'city', str), - Columndef('fips_code', 'fips_code', str), - Columndef('geocoded_hospital_address', 'geocoded_hospital_address', Utils.limited_geocode), - Columndef('hhs_ids', 'hhs_ids', str), - Columndef('hospital_name', 'hospital_name', str), - Columndef('hospital_subtype', 'hospital_subtype', str), - Columndef('icu_beds_used_7_day_avg', 'icu_beds_used_7_day_avg', float), - Columndef('icu_beds_used_7_day_coverage', 'icu_beds_used_7_day_coverage', int), - Columndef('icu_beds_used_7_day_sum', 'icu_beds_used_7_day_sum', int), - Columndef('icu_patients_confirmed_influenza_7_day_avg', 'icu_patients_confirmed_influenza_7_day_avg', - float), - Columndef('icu_patients_confirmed_influenza_7_day_coverage', - 'icu_patients_confirmed_influenza_7_day_coverage', int), - Columndef('icu_patients_confirmed_influenza_7_day_sum', 'icu_patients_confirmed_influenza_7_day_sum', - int), - Columndef('inpatient_beds_7_day_avg', 'inpatient_beds_7_day_avg', float), - Columndef('inpatient_beds_7_day_coverage', 'inpatient_beds_7_day_coverage', int), - Columndef('inpatient_beds_7_day_sum', 'inpatient_beds_7_day_sum', int), - Columndef('inpatient_beds_used_7_day_avg', 'inpatient_beds_used_7_day_avg', float), - Columndef('inpatient_beds_used_7_day_coverage', 'inpatient_beds_used_7_day_coverage', int), - Columndef('inpatient_beds_used_7_day_sum', 'inpatient_beds_used_7_day_sum', int), - Columndef('is_corrected', 'is_corrected', Utils.parse_bool), - Columndef('is_metro_micro', 'is_metro_micro', Utils.parse_bool), - Columndef('previous_day_admission_adult_covid_confirmed_18-19_7_day_sum', - 'previous_day_admission_adult_covid_confirmed_18_19_7_day_sum', int), - Columndef('previous_day_admission_adult_covid_confirmed_20-29_7_day_sum', - 'previous_day_admission_adult_covid_confirmed_20_29_7_day_sum', int), - Columndef('previous_day_admission_adult_covid_confirmed_30-39_7_day_sum', - 'previous_day_admission_adult_covid_confirmed_30_39_7_day_sum', int), - Columndef('previous_day_admission_adult_covid_confirmed_40-49_7_day_sum', - 'previous_day_admission_adult_covid_confirmed_40_49_7_day_sum', int), - Columndef('previous_day_admission_adult_covid_confirmed_50-59_7_day_sum', - 'previous_day_admission_adult_covid_confirmed_50_59_7_day_sum', int), - 
Columndef('previous_day_admission_adult_covid_confirmed_60-69_7_day_sum', - 'previous_day_admission_adult_covid_confirmed_60_69_7_day_sum', int), - Columndef('previous_day_admission_adult_covid_confirmed_70-79_7_day_sum', - 'previous_day_admission_adult_covid_confirmed_70_79_7_day_sum', int), - Columndef('previous_day_admission_adult_covid_confirmed_7_day_coverage', - 'previous_day_admission_adult_covid_confirmed_7_day_coverage', int), - Columndef('previous_day_admission_adult_covid_confirmed_7_day_sum', - 'previous_day_admission_adult_covid_confirmed_7_day_sum', int), - Columndef('previous_day_admission_adult_covid_confirmed_80+_7_day_sum', - 'previous_day_admission_adult_covid_confirmed_80plus_7_day_sum', int), - Columndef('previous_day_admission_adult_covid_confirmed_unknown_7_day_sum', - 'previous_day_admission_adult_covid_confirmed_unknown_7_day_sum', int), - Columndef('previous_day_admission_adult_covid_suspected_18-19_7_day_sum', - 'previous_day_admission_adult_covid_suspected_18_19_7_day_sum', int), - Columndef('previous_day_admission_adult_covid_suspected_20-29_7_day_sum', - 'previous_day_admission_adult_covid_suspected_20_29_7_day_sum', int), - Columndef('previous_day_admission_adult_covid_suspected_30-39_7_day_sum', - 'previous_day_admission_adult_covid_suspected_30_39_7_day_sum', int), - Columndef('previous_day_admission_adult_covid_suspected_40-49_7_day_sum', - 'previous_day_admission_adult_covid_suspected_40_49_7_day_sum', int), - Columndef('previous_day_admission_adult_covid_suspected_50-59_7_day_sum', - 'previous_day_admission_adult_covid_suspected_50_59_7_day_sum', int), - Columndef('previous_day_admission_adult_covid_suspected_60-69_7_day_sum', - 'previous_day_admission_adult_covid_suspected_60_69_7_day_sum', int), - Columndef('previous_day_admission_adult_covid_suspected_70-79_7_day_sum', - 'previous_day_admission_adult_covid_suspected_70_79_7_day_sum', int), - Columndef('previous_day_admission_adult_covid_suspected_7_day_coverage', - 'previous_day_admission_adult_covid_suspected_7_day_coverage', int), - Columndef('previous_day_admission_adult_covid_suspected_7_day_sum', - 'previous_day_admission_adult_covid_suspected_7_day_sum', int), - Columndef('previous_day_admission_adult_covid_suspected_80+_7_day_sum', - 'previous_day_admission_adult_covid_suspected_80plus_7_day_sum', int), - Columndef('previous_day_admission_adult_covid_suspected_unknown_7_day_sum', - 'previous_day_admission_adult_covid_suspected_unknown_7_day_sum', int), - Columndef('previous_day_admission_influenza_confirmed_7_day_sum', - 'previous_day_admission_influenza_confirmed_7_day_sum', int), - Columndef('previous_day_admission_pediatric_covid_confirmed_7_day_coverage', - 'previous_day_admission_pediatric_covid_confirmed_7_day_coverage', int), - Columndef('previous_day_admission_pediatric_covid_confirmed_7_day_sum', - 'previous_day_admission_pediatric_covid_confirmed_7_day_sum', int), - Columndef('previous_day_admission_pediatric_covid_suspected_7_day_coverage', - 'previous_day_admission_pediatric_covid_suspected_7_day_coverage', int), - Columndef('previous_day_admission_pediatric_covid_suspected_7_day_sum', - 'previous_day_admission_pediatric_covid_suspected_7_day_sum', int), - Columndef('previous_day_covid_ED_visits_7_day_sum', 'previous_day_covid_ed_visits_7_day_sum', int), - Columndef('previous_day_total_ED_visits_7_day_sum', 'previous_day_total_ed_visits_7_day_sum', int), - Columndef('previous_week_patients_covid_vaccinated_doses_all_7_day', - 
'previous_week_patients_covid_vaccinated_doses_all_7_day', int), - Columndef('previous_week_patients_covid_vaccinated_doses_all_7_day_sum', - 'previous_week_patients_covid_vaccinated_doses_all_7_day_sum', int), - Columndef('previous_week_patients_covid_vaccinated_doses_one_7_day', - 'previous_week_patients_covid_vaccinated_doses_one_7_day', int), - Columndef('previous_week_patients_covid_vaccinated_doses_one_7_day_sum', - 'previous_week_patients_covid_vaccinated_doses_one_7_day_sum', int), - Columndef('previous_week_personnel_covid_vaccinated_doses_administered_7_day', - 'previous_week_personnel_covid_vaccd_doses_administered_7_day', int), - Columndef('previous_week_personnel_covid_vaccinated_doses_administered_7_day_sum', - 'previous_week_personnel_covid_vaccd_doses_administered_7_day_sum', int), - Columndef('staffed_adult_icu_bed_occupancy_7_day_avg', 'staffed_adult_icu_bed_occupancy_7_day_avg', - float), - Columndef('staffed_adult_icu_bed_occupancy_7_day_coverage', - 'staffed_adult_icu_bed_occupancy_7_day_coverage', int), - Columndef('staffed_adult_icu_bed_occupancy_7_day_sum', 'staffed_adult_icu_bed_occupancy_7_day_sum', - int), - Columndef('staffed_icu_adult_patients_confirmed_and_suspected_covid_7_day_avg', - 'staffed_icu_adult_patients_confirmed_suspected_covid_7d_avg', float), - Columndef('staffed_icu_adult_patients_confirmed_and_suspected_covid_7_day_coverage', - 'staffed_icu_adult_patients_confirmed_suspected_covid_7d_cov', int), - Columndef('staffed_icu_adult_patients_confirmed_and_suspected_covid_7_day_sum', - 'staffed_icu_adult_patients_confirmed_suspected_covid_7d_sum', int), - Columndef('staffed_icu_adult_patients_confirmed_covid_7_day_avg', - 'staffed_icu_adult_patients_confirmed_covid_7_day_avg', float), - Columndef('staffed_icu_adult_patients_confirmed_covid_7_day_coverage', - 'staffed_icu_adult_patients_confirmed_covid_7_day_coverage', int), - Columndef('staffed_icu_adult_patients_confirmed_covid_7_day_sum', - 'staffed_icu_adult_patients_confirmed_covid_7_day_sum', int), - Columndef('state', 'state', str), - Columndef('total_adult_patients_hospitalized_confirmed_and_suspected_covid_7_day_avg', - 'total_adult_patients_hosp_confirmed_suspected_covid_7d_avg', float), - Columndef('total_adult_patients_hospitalized_confirmed_and_suspected_covid_7_day_coverage', - 'total_adult_patients_hosp_confirmed_suspected_covid_7d_cov', int), - Columndef('total_adult_patients_hospitalized_confirmed_and_suspected_covid_7_day_sum', - 'total_adult_patients_hosp_confirmed_suspected_covid_7d_sum', int), - Columndef('total_adult_patients_hospitalized_confirmed_covid_7_day_avg', - 'total_adult_patients_hospitalized_confirmed_covid_7_day_avg', float), - Columndef('total_adult_patients_hospitalized_confirmed_covid_7_day_coverage', - 'total_adult_patients_hospitalized_confirmed_covid_7_day_coverage', int), - Columndef('total_adult_patients_hospitalized_confirmed_covid_7_day_sum', - 'total_adult_patients_hospitalized_confirmed_covid_7_day_sum', int), - Columndef('total_beds_7_day_avg', 'total_beds_7_day_avg', float), - Columndef('total_beds_7_day_coverage', 'total_beds_7_day_coverage', int), - Columndef('total_beds_7_day_sum', 'total_beds_7_day_sum', int), - Columndef('total_icu_beds_7_day_avg', 'total_icu_beds_7_day_avg', float), - Columndef('total_icu_beds_7_day_coverage', 'total_icu_beds_7_day_coverage', int), - Columndef('total_icu_beds_7_day_sum', 'total_icu_beds_7_day_sum', int), - Columndef('total_patients_hospitalized_confirmed_influenza_7_day_avg', - 
'total_patients_hospitalized_confirmed_influenza_7_day_avg', float), - Columndef('total_patients_hospitalized_confirmed_influenza_7_day_coverage', - 'total_patients_hospitalized_confirmed_influenza_7_day_coverage', int), - Columndef('total_patients_hospitalized_confirmed_influenza_7_day_sum', - 'total_patients_hospitalized_confirmed_influenza_7_day_sum', int), - Columndef('total_patients_hospitalized_confirmed_influenza_and_covid_7_day_avg', - 'total_patients_hosp_confirmed_influenza_and_covid_7d_avg', float), - Columndef('total_patients_hospitalized_confirmed_influenza_and_covid_7_day_coverage', - 'total_patients_hosp_confirmed_influenza_and_covid_7d_cov', int), - Columndef('total_patients_hospitalized_confirmed_influenza_and_covid_7_day_sum', - 'total_patients_hosp_confirmed_influenza_and_covid_7d_sum', int), - Columndef('total_pediatric_patients_hospitalized_confirmed_and_suspected_covid_7_day_avg', - 'total_pediatric_patients_hosp_confirmed_suspected_covid_7d_avg', float), - Columndef('total_pediatric_patients_hospitalized_confirmed_and_suspected_covid_7_day_coverage', - 'total_pediatric_patients_hosp_confirmed_suspected_covid_7d_cov', int), - Columndef('total_pediatric_patients_hospitalized_confirmed_and_suspected_covid_7_day_sum', - 'total_pediatric_patients_hosp_confirmed_suspected_covid_7d_sum', int), - Columndef('total_pediatric_patients_hospitalized_confirmed_covid_7_day_avg', - 'total_pediatric_patients_hospitalized_confirmed_covid_7_day_avg', float), - Columndef('total_pediatric_patients_hospitalized_confirmed_covid_7_day_coverage', - 'total_pediatric_patients_hosp_confirmed_covid_7d_cov', int), - Columndef('total_pediatric_patients_hospitalized_confirmed_covid_7_day_sum', - 'total_pediatric_patients_hospitalized_confirmed_covid_7_day_sum', int), - Columndef('total_personnel_covid_vaccinated_doses_all_7_day', - 'total_personnel_covid_vaccinated_doses_all_7_day', int), - Columndef('total_personnel_covid_vaccinated_doses_all_7_day_sum', - 'total_personnel_covid_vaccinated_doses_all_7_day_sum', int), - Columndef('total_personnel_covid_vaccinated_doses_none_7_day', - 'total_personnel_covid_vaccinated_doses_none_7_day', int), - Columndef('total_personnel_covid_vaccinated_doses_none_7_day_sum', - 'total_personnel_covid_vaccinated_doses_none_7_day_sum', int), - Columndef('total_personnel_covid_vaccinated_doses_one_7_day', - 'total_personnel_covid_vaccinated_doses_one_7_day', int), - Columndef('total_personnel_covid_vaccinated_doses_one_7_day_sum', - 'total_personnel_covid_vaccinated_doses_one_7_day_sum', int), - Columndef('total_staffed_adult_icu_beds_7_day_avg', 'total_staffed_adult_icu_beds_7_day_avg', float), - Columndef('total_staffed_adult_icu_beds_7_day_coverage', 'total_staffed_adult_icu_beds_7_day_coverage', - int), - Columndef('total_staffed_adult_icu_beds_7_day_sum', 'total_staffed_adult_icu_beds_7_day_sum', int), - Columndef('zip', 'zip', str), - ] + TABLE_NAME = "covid_hosp_facility" + KEY_COLS = ["hospital_pk", "collection_week"] + AGGREGATE_KEY_COLS = [ + "address", + "ccn", + "city", + "fips_code", + "geocoded_hospital_address", + "hhs_ids", + "hospital_name", + "hospital_pk", + "hospital_subtype", + "is_metro_micro", + "state", + "zip", + ] + # These are 3-tuples of ( + # CSV header name, + # SQL db column name, + # data type + # ) for all the columns in the CSV file. + # Note that the corresponding database column names may be shorter + # due to constraints on the length of column names. See + # /src/ddl/covid_hosp.sql for more information. 
+ ORDERED_CSV_COLUMNS = [ + Columndef("hospital_pk", "hospital_pk", str), + Columndef("collection_week", "collection_week", Utils.int_from_date), + Columndef("address", "address", str), + Columndef("all_adult_hospital_beds_7_day_avg", "all_adult_hospital_beds_7_day_avg", float), + Columndef("all_adult_hospital_beds_7_day_coverage", "all_adult_hospital_beds_7_day_coverage", int), + Columndef("all_adult_hospital_beds_7_day_sum", "all_adult_hospital_beds_7_day_sum", int), + Columndef("all_adult_hospital_inpatient_bed_occupied_7_day_avg", "all_adult_hospital_inpatient_bed_occupied_7_day_avg", float), + Columndef("all_adult_hospital_inpatient_bed_occupied_7_day_coverage", "all_adult_hospital_inpatient_bed_occupied_7_day_coverage", int), + Columndef("all_adult_hospital_inpatient_bed_occupied_7_day_sum", "all_adult_hospital_inpatient_bed_occupied_7_day_sum", int), + Columndef("all_adult_hospital_inpatient_beds_7_day_avg", "all_adult_hospital_inpatient_beds_7_day_avg", float), + Columndef("all_adult_hospital_inpatient_beds_7_day_coverage", "all_adult_hospital_inpatient_beds_7_day_coverage", int), + Columndef("all_adult_hospital_inpatient_beds_7_day_sum", "all_adult_hospital_inpatient_beds_7_day_sum", int), + Columndef("ccn", "ccn", str), + Columndef("city", "city", str), + Columndef("fips_code", "fips_code", str), + Columndef("geocoded_hospital_address", "geocoded_hospital_address", Utils.limited_geocode), + Columndef("hhs_ids", "hhs_ids", str), + Columndef("hospital_name", "hospital_name", str), + Columndef("hospital_subtype", "hospital_subtype", str), + Columndef("icu_beds_used_7_day_avg", "icu_beds_used_7_day_avg", float), + Columndef("icu_beds_used_7_day_coverage", "icu_beds_used_7_day_coverage", int), + Columndef("icu_beds_used_7_day_sum", "icu_beds_used_7_day_sum", int), + Columndef("icu_patients_confirmed_influenza_7_day_avg", "icu_patients_confirmed_influenza_7_day_avg", float), + Columndef("icu_patients_confirmed_influenza_7_day_coverage", "icu_patients_confirmed_influenza_7_day_coverage", int), + Columndef("icu_patients_confirmed_influenza_7_day_sum", "icu_patients_confirmed_influenza_7_day_sum", int), + Columndef("inpatient_beds_7_day_avg", "inpatient_beds_7_day_avg", float), + Columndef("inpatient_beds_7_day_coverage", "inpatient_beds_7_day_coverage", int), + Columndef("inpatient_beds_7_day_sum", "inpatient_beds_7_day_sum", int), + Columndef("inpatient_beds_used_7_day_avg", "inpatient_beds_used_7_day_avg", float), + Columndef("inpatient_beds_used_7_day_coverage", "inpatient_beds_used_7_day_coverage", int), + Columndef("inpatient_beds_used_7_day_sum", "inpatient_beds_used_7_day_sum", int), + Columndef("is_corrected", "is_corrected", Utils.parse_bool), + Columndef("is_metro_micro", "is_metro_micro", Utils.parse_bool), + Columndef("previous_day_admission_adult_covid_confirmed_18-19_7_day_sum", "previous_day_admission_adult_covid_confirmed_18_19_7_day_sum", int), + Columndef("previous_day_admission_adult_covid_confirmed_20-29_7_day_sum", "previous_day_admission_adult_covid_confirmed_20_29_7_day_sum", int), + Columndef("previous_day_admission_adult_covid_confirmed_30-39_7_day_sum", "previous_day_admission_adult_covid_confirmed_30_39_7_day_sum", int), + Columndef("previous_day_admission_adult_covid_confirmed_40-49_7_day_sum", "previous_day_admission_adult_covid_confirmed_40_49_7_day_sum", int), + Columndef("previous_day_admission_adult_covid_confirmed_50-59_7_day_sum", "previous_day_admission_adult_covid_confirmed_50_59_7_day_sum", int), + 
Columndef("previous_day_admission_adult_covid_confirmed_60-69_7_day_sum", "previous_day_admission_adult_covid_confirmed_60_69_7_day_sum", int), + Columndef("previous_day_admission_adult_covid_confirmed_70-79_7_day_sum", "previous_day_admission_adult_covid_confirmed_70_79_7_day_sum", int), + Columndef("previous_day_admission_adult_covid_confirmed_7_day_coverage", "previous_day_admission_adult_covid_confirmed_7_day_coverage", int), + Columndef("previous_day_admission_adult_covid_confirmed_7_day_sum", "previous_day_admission_adult_covid_confirmed_7_day_sum", int), + Columndef("previous_day_admission_adult_covid_confirmed_80+_7_day_sum", "previous_day_admission_adult_covid_confirmed_80plus_7_day_sum", int), + Columndef("previous_day_admission_adult_covid_confirmed_unknown_7_day_sum", "previous_day_admission_adult_covid_confirmed_unknown_7_day_sum", int), + Columndef("previous_day_admission_adult_covid_suspected_18-19_7_day_sum", "previous_day_admission_adult_covid_suspected_18_19_7_day_sum", int), + Columndef("previous_day_admission_adult_covid_suspected_20-29_7_day_sum", "previous_day_admission_adult_covid_suspected_20_29_7_day_sum", int), + Columndef("previous_day_admission_adult_covid_suspected_30-39_7_day_sum", "previous_day_admission_adult_covid_suspected_30_39_7_day_sum", int), + Columndef("previous_day_admission_adult_covid_suspected_40-49_7_day_sum", "previous_day_admission_adult_covid_suspected_40_49_7_day_sum", int), + Columndef("previous_day_admission_adult_covid_suspected_50-59_7_day_sum", "previous_day_admission_adult_covid_suspected_50_59_7_day_sum", int), + Columndef("previous_day_admission_adult_covid_suspected_60-69_7_day_sum", "previous_day_admission_adult_covid_suspected_60_69_7_day_sum", int), + Columndef("previous_day_admission_adult_covid_suspected_70-79_7_day_sum", "previous_day_admission_adult_covid_suspected_70_79_7_day_sum", int), + Columndef("previous_day_admission_adult_covid_suspected_7_day_coverage", "previous_day_admission_adult_covid_suspected_7_day_coverage", int), + Columndef("previous_day_admission_adult_covid_suspected_7_day_sum", "previous_day_admission_adult_covid_suspected_7_day_sum", int), + Columndef("previous_day_admission_adult_covid_suspected_80+_7_day_sum", "previous_day_admission_adult_covid_suspected_80plus_7_day_sum", int), + Columndef("previous_day_admission_adult_covid_suspected_unknown_7_day_sum", "previous_day_admission_adult_covid_suspected_unknown_7_day_sum", int), + Columndef("previous_day_admission_influenza_confirmed_7_day_sum", "previous_day_admission_influenza_confirmed_7_day_sum", int), + Columndef("previous_day_admission_pediatric_covid_confirmed_7_day_coverage", "previous_day_admission_pediatric_covid_confirmed_7_day_coverage", int), + Columndef("previous_day_admission_pediatric_covid_confirmed_7_day_sum", "previous_day_admission_pediatric_covid_confirmed_7_day_sum", int), + Columndef("previous_day_admission_pediatric_covid_suspected_7_day_coverage", "previous_day_admission_pediatric_covid_suspected_7_day_coverage", int), + Columndef("previous_day_admission_pediatric_covid_suspected_7_day_sum", "previous_day_admission_pediatric_covid_suspected_7_day_sum", int), + Columndef("previous_day_covid_ED_visits_7_day_sum", "previous_day_covid_ed_visits_7_day_sum", int), + Columndef("previous_day_total_ED_visits_7_day_sum", "previous_day_total_ed_visits_7_day_sum", int), + Columndef("previous_week_patients_covid_vaccinated_doses_all_7_day", "previous_week_patients_covid_vaccinated_doses_all_7_day", int), + 
Columndef("previous_week_patients_covid_vaccinated_doses_all_7_day_sum", "previous_week_patients_covid_vaccinated_doses_all_7_day_sum", int), + Columndef("previous_week_patients_covid_vaccinated_doses_one_7_day", "previous_week_patients_covid_vaccinated_doses_one_7_day", int), + Columndef("previous_week_patients_covid_vaccinated_doses_one_7_day_sum", "previous_week_patients_covid_vaccinated_doses_one_7_day_sum", int), + Columndef("previous_week_personnel_covid_vaccinated_doses_administered_7_day", "previous_week_personnel_covid_vaccd_doses_administered_7_day", int), + Columndef("previous_week_personnel_covid_vaccinated_doses_administered_7_day_sum", "previous_week_personnel_covid_vaccd_doses_administered_7_day_sum", int), + Columndef("staffed_adult_icu_bed_occupancy_7_day_avg", "staffed_adult_icu_bed_occupancy_7_day_avg", float), + Columndef("staffed_adult_icu_bed_occupancy_7_day_coverage", "staffed_adult_icu_bed_occupancy_7_day_coverage", int), + Columndef("staffed_adult_icu_bed_occupancy_7_day_sum", "staffed_adult_icu_bed_occupancy_7_day_sum", int), + Columndef("staffed_icu_adult_patients_confirmed_and_suspected_covid_7_day_avg", "staffed_icu_adult_patients_confirmed_suspected_covid_7d_avg", float), + Columndef("staffed_icu_adult_patients_confirmed_and_suspected_covid_7_day_coverage", "staffed_icu_adult_patients_confirmed_suspected_covid_7d_cov", int), + Columndef("staffed_icu_adult_patients_confirmed_and_suspected_covid_7_day_sum", "staffed_icu_adult_patients_confirmed_suspected_covid_7d_sum", int), + Columndef("staffed_icu_adult_patients_confirmed_covid_7_day_avg", "staffed_icu_adult_patients_confirmed_covid_7_day_avg", float), + Columndef("staffed_icu_adult_patients_confirmed_covid_7_day_coverage", "staffed_icu_adult_patients_confirmed_covid_7_day_coverage", int), + Columndef("staffed_icu_adult_patients_confirmed_covid_7_day_sum", "staffed_icu_adult_patients_confirmed_covid_7_day_sum", int), + Columndef("state", "state", str), + Columndef("total_adult_patients_hospitalized_confirmed_and_suspected_covid_7_day_avg", "total_adult_patients_hosp_confirmed_suspected_covid_7d_avg", float), + Columndef("total_adult_patients_hospitalized_confirmed_and_suspected_covid_7_day_coverage", "total_adult_patients_hosp_confirmed_suspected_covid_7d_cov", int), + Columndef("total_adult_patients_hospitalized_confirmed_and_suspected_covid_7_day_sum", "total_adult_patients_hosp_confirmed_suspected_covid_7d_sum", int), + Columndef("total_adult_patients_hospitalized_confirmed_covid_7_day_avg", "total_adult_patients_hospitalized_confirmed_covid_7_day_avg", float), + Columndef("total_adult_patients_hospitalized_confirmed_covid_7_day_coverage", "total_adult_patients_hospitalized_confirmed_covid_7_day_coverage", int), + Columndef("total_adult_patients_hospitalized_confirmed_covid_7_day_sum", "total_adult_patients_hospitalized_confirmed_covid_7_day_sum", int), + Columndef("total_beds_7_day_avg", "total_beds_7_day_avg", float), + Columndef("total_beds_7_day_coverage", "total_beds_7_day_coverage", int), + Columndef("total_beds_7_day_sum", "total_beds_7_day_sum", int), + Columndef("total_icu_beds_7_day_avg", "total_icu_beds_7_day_avg", float), + Columndef("total_icu_beds_7_day_coverage", "total_icu_beds_7_day_coverage", int), + Columndef("total_icu_beds_7_day_sum", "total_icu_beds_7_day_sum", int), + Columndef("total_patients_hospitalized_confirmed_influenza_7_day_avg", "total_patients_hospitalized_confirmed_influenza_7_day_avg", float), + Columndef("total_patients_hospitalized_confirmed_influenza_7_day_coverage", 
"total_patients_hospitalized_confirmed_influenza_7_day_coverage", int), + Columndef("total_patients_hospitalized_confirmed_influenza_7_day_sum", "total_patients_hospitalized_confirmed_influenza_7_day_sum", int), + Columndef("total_patients_hospitalized_confirmed_influenza_and_covid_7_day_avg", "total_patients_hosp_confirmed_influenza_and_covid_7d_avg", float), + Columndef("total_patients_hospitalized_confirmed_influenza_and_covid_7_day_coverage", "total_patients_hosp_confirmed_influenza_and_covid_7d_cov", int), + Columndef("total_patients_hospitalized_confirmed_influenza_and_covid_7_day_sum", "total_patients_hosp_confirmed_influenza_and_covid_7d_sum", int), + Columndef("total_pediatric_patients_hospitalized_confirmed_and_suspected_covid_7_day_avg", "total_pediatric_patients_hosp_confirmed_suspected_covid_7d_avg", float), + Columndef("total_pediatric_patients_hospitalized_confirmed_and_suspected_covid_7_day_coverage", "total_pediatric_patients_hosp_confirmed_suspected_covid_7d_cov", int), + Columndef("total_pediatric_patients_hospitalized_confirmed_and_suspected_covid_7_day_sum", "total_pediatric_patients_hosp_confirmed_suspected_covid_7d_sum", int), + Columndef("total_pediatric_patients_hospitalized_confirmed_covid_7_day_avg", "total_pediatric_patients_hospitalized_confirmed_covid_7_day_avg", float), + Columndef("total_pediatric_patients_hospitalized_confirmed_covid_7_day_coverage", "total_pediatric_patients_hosp_confirmed_covid_7d_cov", int), + Columndef("total_pediatric_patients_hospitalized_confirmed_covid_7_day_sum", "total_pediatric_patients_hospitalized_confirmed_covid_7_day_sum", int), + Columndef("total_personnel_covid_vaccinated_doses_all_7_day", "total_personnel_covid_vaccinated_doses_all_7_day", int), + Columndef("total_personnel_covid_vaccinated_doses_all_7_day_sum", "total_personnel_covid_vaccinated_doses_all_7_day_sum", int), + Columndef("total_personnel_covid_vaccinated_doses_none_7_day", "total_personnel_covid_vaccinated_doses_none_7_day", int), + Columndef("total_personnel_covid_vaccinated_doses_none_7_day_sum", "total_personnel_covid_vaccinated_doses_none_7_day_sum", int), + Columndef("total_personnel_covid_vaccinated_doses_one_7_day", "total_personnel_covid_vaccinated_doses_one_7_day", int), + Columndef("total_personnel_covid_vaccinated_doses_one_7_day_sum", "total_personnel_covid_vaccinated_doses_one_7_day_sum", int), + Columndef("total_staffed_adult_icu_beds_7_day_avg", "total_staffed_adult_icu_beds_7_day_avg", float), + Columndef("total_staffed_adult_icu_beds_7_day_coverage", "total_staffed_adult_icu_beds_7_day_coverage", int), + Columndef("total_staffed_adult_icu_beds_7_day_sum", "total_staffed_adult_icu_beds_7_day_sum", int), + Columndef("zip", "zip", str), + ] - def __init__(self, *args, **kwargs): - super().__init__( - *args, - **kwargs, - table_name=Database.TABLE_NAME, - hhs_dataset_id=Network.DATASET_ID, - key_columns=Database.KEY_COLS, - columns_and_types=Database.ORDERED_CSV_COLUMNS) + def __init__(self, *args, **kwargs): + super().__init__( + *args, + **kwargs, + table_name=Database.TABLE_NAME, + hhs_dataset_id=Network.DATASET_ID, + key_columns=Database.KEY_COLS, + columns_and_types=Database.ORDERED_CSV_COLUMNS, + ) diff --git a/src/acquisition/covid_hosp/facility/network.py b/src/acquisition/covid_hosp/facility/network.py index 6a0092c7f..9ed1bf6ca 100644 --- a/src/acquisition/covid_hosp/facility/network.py +++ b/src/acquisition/covid_hosp/facility/network.py @@ -4,14 +4,13 @@ class Network(BaseNetwork): - DATASET_ID = 'anag-cw7u' - METADATA_ID = 
'j4ip-wfsv' + DATASET_ID = "anag-cw7u" + METADATA_ID = "j4ip-wfsv" - def fetch_metadata(*args, **kwags): - """Download and return metadata. + def fetch_metadata(*args, **kwags): + """Download and return metadata. - See `fetch_metadata_for_dataset`. - """ + See `fetch_metadata_for_dataset`. + """ - return Network.fetch_metadata_for_dataset( - *args, **kwags, dataset_id=Network.METADATA_ID) + return Network.fetch_metadata_for_dataset(*args, **kwags, dataset_id=Network.METADATA_ID) diff --git a/src/acquisition/covid_hosp/facility/update.py b/src/acquisition/covid_hosp/facility/update.py index b2b96c2e3..d269c353a 100644 --- a/src/acquisition/covid_hosp/facility/update.py +++ b/src/acquisition/covid_hosp/facility/update.py @@ -11,17 +11,16 @@ class Update: + def run(network=Network): + """Acquire the most recent dataset, unless it was previously acquired. - def run(network=Network): - """Acquire the most recent dataset, unless it was previously acquired. + Returns + ------- + bool + Whether a new dataset was acquired. + """ - Returns - ------- - bool - Whether a new dataset was acquired. - """ - - return Utils.update_dataset(Database, network) + return Utils.update_dataset(Database, network) # main entry point diff --git a/src/acquisition/covid_hosp/state_daily/database.py b/src/acquisition/covid_hosp/state_daily/database.py index 6a8228994..7dc0a0dbc 100644 --- a/src/acquisition/covid_hosp/state_daily/database.py +++ b/src/acquisition/covid_hosp/state_daily/database.py @@ -7,224 +7,148 @@ class Database(BaseDatabase): - # note we share a database with state_timeseries - TABLE_NAME = 'covid_hosp_state_timeseries' - KEY_COLS = ['state', 'reporting_cutoff_start'] - # These are 3-tuples of (CSV header name, SQL db column name, data type) for - # all the columns in the CSV file. - # Note that the corresponding database column names may be shorter - # due to constraints on the length of column names. See - # /src/ddl/covid_hosp.sql for more information. - # Additionally, all column names below are shared with state_timeseries, - # except for reporting_cutoff_start (here) and date (there). If you need - # to update a column name, do it in both places. 
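The comment above describes each entry as a 3-tuple of (CSV header name, SQL column name, data type); the ORDERED_CSV_COLUMNS list that follows supplies them. Below is a minimal sketch of how such triples could be consumed when loading a CSV row, assuming Columndef is a namedtuple with fields csv_name, sql_name and dtype; the parse_row helper and the sample values are illustrative only and are not part of this diff.

# Illustrative sketch only, not part of this diff.
from collections import namedtuple

Columndef = namedtuple("Columndef", ["csv_name", "sql_name", "dtype"])  # assumed field names

EXAMPLE_COLUMNS = [
    Columndef("state", "state", str),
    Columndef("deaths_covid", "deaths_covid", int),
    Columndef("adult_icu_bed_utilization", "adult_icu_bed_utilization", float),
]

def parse_row(csv_row, columns=EXAMPLE_COLUMNS):
    """Map one CSV row (as a dict) to {sql_column: typed value}, treating '' as NULL."""
    typed = {}
    for col in columns:
        raw = csv_row.get(col.csv_name, "")
        typed[col.sql_name] = col.dtype(raw) if raw != "" else None
    return typed

# parse_row({"state": "PA", "deaths_covid": "12", "adult_icu_bed_utilization": "0.83"})
# returns {"state": "PA", "deaths_covid": 12, "adult_icu_bed_utilization": 0.83}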
- ORDERED_CSV_COLUMNS = [ - Columndef('state', 'state', str), - Columndef('reporting_cutoff_start', 'date', Utils.int_from_date), - Columndef('adult_icu_bed_covid_utilization', 'adult_icu_bed_covid_utilization', float), - Columndef('adult_icu_bed_covid_utilization_coverage', 'adult_icu_bed_covid_utilization_coverage', int), - Columndef('adult_icu_bed_covid_utilization_denominator', 'adult_icu_bed_covid_utilization_denominator', - int), - Columndef('adult_icu_bed_covid_utilization_numerator', 'adult_icu_bed_covid_utilization_numerator', - int), - Columndef('adult_icu_bed_utilization', 'adult_icu_bed_utilization', float), - Columndef('adult_icu_bed_utilization_coverage', 'adult_icu_bed_utilization_coverage', int), - Columndef('adult_icu_bed_utilization_denominator', 'adult_icu_bed_utilization_denominator', int), - Columndef('adult_icu_bed_utilization_numerator', 'adult_icu_bed_utilization_numerator', int), - Columndef('critical_staffing_shortage_anticipated_within_week_no', - 'critical_staffing_shortage_anticipated_within_week_no', int), - Columndef('critical_staffing_shortage_anticipated_within_week_not_reported', - 'critical_staffing_shortage_anticipated_within_week_not_reported', int), - Columndef('critical_staffing_shortage_anticipated_within_week_yes', - 'critical_staffing_shortage_anticipated_within_week_yes', int), - Columndef('critical_staffing_shortage_today_no', 'critical_staffing_shortage_today_no', int), - Columndef('critical_staffing_shortage_today_not_reported', - 'critical_staffing_shortage_today_not_reported', int), - Columndef('critical_staffing_shortage_today_yes', 'critical_staffing_shortage_today_yes', int), - Columndef('deaths_covid', 'deaths_covid', int), - Columndef('deaths_covid_coverage', 'deaths_covid_coverage', int), - Columndef('geocoded_state', 'geocoded_state', str), - Columndef('hospital_onset_covid', 'hospital_onset_covid', int), - Columndef('hospital_onset_covid_coverage', 'hospital_onset_covid_coverage', int), - Columndef('icu_patients_confirmed_influenza', 'icu_patients_confirmed_influenza', int), - Columndef('icu_patients_confirmed_influenza_coverage', 'icu_patients_confirmed_influenza_coverage', - int), - Columndef('inpatient_bed_covid_utilization', 'inpatient_bed_covid_utilization', float), - Columndef('inpatient_bed_covid_utilization_coverage', 'inpatient_bed_covid_utilization_coverage', int), - Columndef('inpatient_bed_covid_utilization_denominator', 'inpatient_bed_covid_utilization_denominator', - int), - Columndef('inpatient_bed_covid_utilization_numerator', 'inpatient_bed_covid_utilization_numerator', - int), - Columndef('inpatient_beds', 'inpatient_beds', int), - Columndef('inpatient_beds_coverage', 'inpatient_beds_coverage', int), - Columndef('inpatient_beds_used', 'inpatient_beds_used', int), - Columndef('inpatient_beds_used_coverage', 'inpatient_beds_used_coverage', int), - Columndef('inpatient_beds_used_covid', 'inpatient_beds_used_covid', int), - Columndef('inpatient_beds_used_covid_coverage', 'inpatient_beds_used_covid_coverage', int), - Columndef('inpatient_beds_utilization', 'inpatient_beds_utilization', float), - Columndef('inpatient_beds_utilization_coverage', 'inpatient_beds_utilization_coverage', int), - Columndef('inpatient_beds_utilization_denominator', 'inpatient_beds_utilization_denominator', int), - Columndef('inpatient_beds_utilization_numerator', 'inpatient_beds_utilization_numerator', int), - Columndef('on_hand_supply_therapeutic_a_casirivimab_imdevimab_courses', - 'on_hand_supply_therapeutic_a_casirivimab_imdevimab_courses', 
int), - Columndef('on_hand_supply_therapeutic_b_bamlanivimab_courses', - 'on_hand_supply_therapeutic_b_bamlanivimab_courses', int), - Columndef('on_hand_supply_therapeutic_c_bamlanivimab_etesevimab_courses', - 'on_hand_supply_therapeutic_c_bamlanivimab_etesevimab_courses', int), - Columndef('percent_of_inpatients_with_covid', 'percent_of_inpatients_with_covid', float), - Columndef('percent_of_inpatients_with_covid_coverage', 'percent_of_inpatients_with_covid_coverage', - int), - Columndef('percent_of_inpatients_with_covid_denominator', - 'percent_of_inpatients_with_covid_denominator', int), - Columndef('percent_of_inpatients_with_covid_numerator', 'percent_of_inpatients_with_covid_numerator', - int), - Columndef('previous_day_admission_adult_covid_confirmed', - 'previous_day_admission_adult_covid_confirmed', int), - Columndef('previous_day_admission_adult_covid_confirmed_18-19', - 'previous_day_admission_adult_covid_confirmed_18_19', int), - Columndef('previous_day_admission_adult_covid_confirmed_18-19_coverage', - 'previous_day_admission_adult_covid_confirmed_18_19_coverage', int), - Columndef('previous_day_admission_adult_covid_confirmed_20-29', - 'previous_day_admission_adult_covid_confirmed_20_29', int), - Columndef('previous_day_admission_adult_covid_confirmed_20-29_coverage', - 'previous_day_admission_adult_covid_confirmed_20_29_coverage', int), - Columndef('previous_day_admission_adult_covid_confirmed_30-39', - 'previous_day_admission_adult_covid_confirmed_30_39', int), - Columndef('previous_day_admission_adult_covid_confirmed_30-39_coverage', - 'previous_day_admission_adult_covid_confirmed_30_39_coverage', int), - Columndef('previous_day_admission_adult_covid_confirmed_40-49', - 'previous_day_admission_adult_covid_confirmed_40_49', int), - Columndef('previous_day_admission_adult_covid_confirmed_40-49_coverage', - 'previous_day_admission_adult_covid_confirmed_40_49_coverage', int), - Columndef('previous_day_admission_adult_covid_confirmed_50-59', - 'previous_day_admission_adult_covid_confirmed_50_59', int), - Columndef('previous_day_admission_adult_covid_confirmed_50-59_coverage', - 'previous_day_admission_adult_covid_confirmed_50_59_coverage', int), - Columndef('previous_day_admission_adult_covid_confirmed_60-69', - 'previous_day_admission_adult_covid_confirmed_60_69', int), - Columndef('previous_day_admission_adult_covid_confirmed_60-69_coverage', - 'previous_day_admission_adult_covid_confirmed_60_69_coverage', int), - Columndef('previous_day_admission_adult_covid_confirmed_70-79', - 'previous_day_admission_adult_covid_confirmed_70_79', int), - Columndef('previous_day_admission_adult_covid_confirmed_70-79_coverage', - 'previous_day_admission_adult_covid_confirmed_70_79_coverage', int), - Columndef('previous_day_admission_adult_covid_confirmed_80+', - 'previous_day_admission_adult_covid_confirmed_80plus', int), - Columndef('previous_day_admission_adult_covid_confirmed_80+_coverage', - 'previous_day_admission_adult_covid_confirmed_80plus_coverage', int), - Columndef('previous_day_admission_adult_covid_confirmed_coverage', - 'previous_day_admission_adult_covid_confirmed_coverage', int), - Columndef('previous_day_admission_adult_covid_confirmed_unknown', - 'previous_day_admission_adult_covid_confirmed_unknown', int), - Columndef('previous_day_admission_adult_covid_confirmed_unknown_coverage', - 'previous_day_admission_adult_covid_confirmed_unknown_coverage', int), - Columndef('previous_day_admission_adult_covid_suspected', - 'previous_day_admission_adult_covid_suspected', int), - 
Columndef('previous_day_admission_adult_covid_suspected_18-19', - 'previous_day_admission_adult_covid_suspected_18_19', int), - Columndef('previous_day_admission_adult_covid_suspected_18-19_coverage', - 'previous_day_admission_adult_covid_suspected_18_19_coverage', int), - Columndef('previous_day_admission_adult_covid_suspected_20-29', - 'previous_day_admission_adult_covid_suspected_20_29', int), - Columndef('previous_day_admission_adult_covid_suspected_20-29_coverage', - 'previous_day_admission_adult_covid_suspected_20_29_coverage', int), - Columndef('previous_day_admission_adult_covid_suspected_30-39', - 'previous_day_admission_adult_covid_suspected_30_39', int), - Columndef('previous_day_admission_adult_covid_suspected_30-39_coverage', - 'previous_day_admission_adult_covid_suspected_30_39_coverage', int), - Columndef('previous_day_admission_adult_covid_suspected_40-49', - 'previous_day_admission_adult_covid_suspected_40_49', int), - Columndef('previous_day_admission_adult_covid_suspected_40-49_coverage', - 'previous_day_admission_adult_covid_suspected_40_49_coverage', int), - Columndef('previous_day_admission_adult_covid_suspected_50-59', - 'previous_day_admission_adult_covid_suspected_50_59', int), - Columndef('previous_day_admission_adult_covid_suspected_50-59_coverage', - 'previous_day_admission_adult_covid_suspected_50_59_coverage', int), - Columndef('previous_day_admission_adult_covid_suspected_60_69', #this is correct; csv header is irregular - 'previous_day_admission_adult_covid_suspected_60_69', int), - Columndef('previous_day_admission_adult_covid_suspected_60-69_coverage', - 'previous_day_admission_adult_covid_suspected_60_69_coverage', int), - Columndef('previous_day_admission_adult_covid_suspected_70-79', - 'previous_day_admission_adult_covid_suspected_70_79', int), - Columndef('previous_day_admission_adult_covid_suspected_70-79_coverage', - 'previous_day_admission_adult_covid_suspected_70_79_coverage', int), - Columndef('previous_day_admission_adult_covid_suspected_80', - 'previous_day_admission_adult_covid_suspected_80plus', int), - Columndef('previous_day_admission_adult_covid_suspected_80+_coverage', - 'previous_day_admission_adult_covid_suspected_80plus_coverage', int), - Columndef('previous_day_admission_adult_covid_suspected_coverage', - 'previous_day_admission_adult_covid_suspected_coverage', int), - Columndef('previous_day_admission_adult_covid_suspected_unknown', - 'previous_day_admission_adult_covid_suspected_unknown', int), - Columndef('previous_day_admission_adult_covid_suspected_unknown_coverage', - 'previous_day_admission_adult_covid_suspected_unknown_coverage', int), - Columndef('previous_day_admission_influenza_confirmed', 'previous_day_admission_influenza_confirmed', - int), - Columndef('previous_day_admission_influenza_confirmed_coverage', - 'previous_day_admission_influenza_confirmed_coverage', int), - Columndef('previous_day_admission_pediatric_covid_confirmed', - 'previous_day_admission_pediatric_covid_confirmed', int), - Columndef('previous_day_admission_pediatric_covid_confirmed_coverage', - 'previous_day_admission_pediatric_covid_confirmed_coverage', int), - Columndef('previous_day_admission_pediatric_covid_suspected', - 'previous_day_admission_pediatric_covid_suspected', int), - Columndef('previous_day_admission_pediatric_covid_suspected_coverage', - 'previous_day_admission_pediatric_covid_suspected_coverage', int), - Columndef('previous_day_deaths_covid_and_influenza', 'previous_day_deaths_covid_and_influenza', int), - 
Columndef('previous_day_deaths_covid_and_influenza_coverage', - 'previous_day_deaths_covid_and_influenza_coverage', int), - Columndef('previous_day_deaths_influenza', 'previous_day_deaths_influenza', int), - Columndef('previous_day_deaths_influenza_coverage', 'previous_day_deaths_influenza_coverage', int), - Columndef('previous_week_therapeutic_a_casirivimab_imdevimab_courses_used', - 'previous_week_therapeutic_a_casirivimab_imdevimab_courses_used', int), - Columndef('previous_week_therapeutic_b_bamlanivimab_courses_used', - 'previous_week_therapeutic_b_bamlanivimab_courses_used', int), - Columndef('previous_week_therapeutic_c_bamlanivimab_etesevimab_courses_used', - 'previous_week_therapeutic_c_bamlanivimab_etesevimab_courses_used', int), - Columndef('staffed_adult_icu_bed_occupancy', 'staffed_adult_icu_bed_occupancy', int), - Columndef('staffed_adult_icu_bed_occupancy_coverage', 'staffed_adult_icu_bed_occupancy_coverage', int), - Columndef('staffed_icu_adult_patients_confirmed_and_suspected_covid', - 'staffed_icu_adult_patients_confirmed_suspected_covid', int), - Columndef('staffed_icu_adult_patients_confirmed_and_suspected_covid_coverage', - 'staffed_icu_adult_patients_confirmed_suspected_covid_coverage', int), - Columndef('staffed_icu_adult_patients_confirmed_covid', 'staffed_icu_adult_patients_confirmed_covid', - int), - Columndef('staffed_icu_adult_patients_confirmed_covid_coverage', - 'staffed_icu_adult_patients_confirmed_covid_coverage', int), - Columndef('total_adult_patients_hospitalized_confirmed_and_suspected_covid', - 'total_adult_patients_hosp_confirmed_suspected_covid', int), - Columndef('total_adult_patients_hospitalized_confirmed_and_suspected_covid_coverage', - 'total_adult_patients_hosp_confirmed_suspected_covid_coverage', int), - Columndef('total_adult_patients_hospitalized_confirmed_covid', - 'total_adult_patients_hosp_confirmed_covid', int), - Columndef('total_adult_patients_hospitalized_confirmed_covid_coverage', - 'total_adult_patients_hosp_confirmed_covid_coverage', int), - Columndef('total_patients_hospitalized_confirmed_influenza', - 'total_patients_hospitalized_confirmed_influenza', int), - Columndef('total_patients_hospitalized_confirmed_influenza_coverage', - 'total_patients_hospitalized_confirmed_influenza_coverage', int), - Columndef('total_patients_hospitalized_confirmed_influenza_covid', - 'total_patients_hospitalized_confirmed_influenza_covid', int), - Columndef('total_patients_hospitalized_confirmed_influenza_covid_coverage', - 'total_patients_hospitalized_confirmed_influenza_covid_coverage', int), - Columndef('total_pediatric_patients_hospitalized_confirmed_and_suspected_covid', - 'total_pediatric_patients_hosp_confirmed_suspected_covid', int), - Columndef('total_pediatric_patients_hospitalized_confirmed_and_suspected_covid_coverage', - 'total_pediatric_patients_hosp_confirmed_suspected_covid_coverage', int), - Columndef('total_pediatric_patients_hospitalized_confirmed_covid', - 'total_pediatric_patients_hosp_confirmed_covid', int), - Columndef('total_pediatric_patients_hospitalized_confirmed_covid_coverage', - 'total_pediatric_patients_hosp_confirmed_covid_coverage', int), - Columndef('total_staffed_adult_icu_beds', 'total_staffed_adult_icu_beds', int), - Columndef('total_staffed_adult_icu_beds_coverage', 'total_staffed_adult_icu_beds_coverage', int), - ] + # note we share a database with state_timeseries + TABLE_NAME = "covid_hosp_state_timeseries" + KEY_COLS = ["state", "reporting_cutoff_start"] + # These are 3-tuples of (CSV header name, SQL db 
column name, data type) for + # all the columns in the CSV file. + # Note that the corresponding database column names may be shorter + # due to constraints on the length of column names. See + # /src/ddl/covid_hosp.sql for more information. + # Additionally, all column names below are shared with state_timeseries, + # except for reporting_cutoff_start (here) and date (there). If you need + # to update a column name, do it in both places. + ORDERED_CSV_COLUMNS = [ + Columndef("state", "state", str), + Columndef("reporting_cutoff_start", "date", Utils.int_from_date), + Columndef("adult_icu_bed_covid_utilization", "adult_icu_bed_covid_utilization", float), + Columndef("adult_icu_bed_covid_utilization_coverage", "adult_icu_bed_covid_utilization_coverage", int), + Columndef("adult_icu_bed_covid_utilization_denominator", "adult_icu_bed_covid_utilization_denominator", int), + Columndef("adult_icu_bed_covid_utilization_numerator", "adult_icu_bed_covid_utilization_numerator", int), + Columndef("adult_icu_bed_utilization", "adult_icu_bed_utilization", float), + Columndef("adult_icu_bed_utilization_coverage", "adult_icu_bed_utilization_coverage", int), + Columndef("adult_icu_bed_utilization_denominator", "adult_icu_bed_utilization_denominator", int), + Columndef("adult_icu_bed_utilization_numerator", "adult_icu_bed_utilization_numerator", int), + Columndef("critical_staffing_shortage_anticipated_within_week_no", "critical_staffing_shortage_anticipated_within_week_no", int), + Columndef("critical_staffing_shortage_anticipated_within_week_not_reported", "critical_staffing_shortage_anticipated_within_week_not_reported", int), + Columndef("critical_staffing_shortage_anticipated_within_week_yes", "critical_staffing_shortage_anticipated_within_week_yes", int), + Columndef("critical_staffing_shortage_today_no", "critical_staffing_shortage_today_no", int), + Columndef("critical_staffing_shortage_today_not_reported", "critical_staffing_shortage_today_not_reported", int), + Columndef("critical_staffing_shortage_today_yes", "critical_staffing_shortage_today_yes", int), + Columndef("deaths_covid", "deaths_covid", int), + Columndef("deaths_covid_coverage", "deaths_covid_coverage", int), + Columndef("geocoded_state", "geocoded_state", str), + Columndef("hospital_onset_covid", "hospital_onset_covid", int), + Columndef("hospital_onset_covid_coverage", "hospital_onset_covid_coverage", int), + Columndef("icu_patients_confirmed_influenza", "icu_patients_confirmed_influenza", int), + Columndef("icu_patients_confirmed_influenza_coverage", "icu_patients_confirmed_influenza_coverage", int), + Columndef("inpatient_bed_covid_utilization", "inpatient_bed_covid_utilization", float), + Columndef("inpatient_bed_covid_utilization_coverage", "inpatient_bed_covid_utilization_coverage", int), + Columndef("inpatient_bed_covid_utilization_denominator", "inpatient_bed_covid_utilization_denominator", int), + Columndef("inpatient_bed_covid_utilization_numerator", "inpatient_bed_covid_utilization_numerator", int), + Columndef("inpatient_beds", "inpatient_beds", int), + Columndef("inpatient_beds_coverage", "inpatient_beds_coverage", int), + Columndef("inpatient_beds_used", "inpatient_beds_used", int), + Columndef("inpatient_beds_used_coverage", "inpatient_beds_used_coverage", int), + Columndef("inpatient_beds_used_covid", "inpatient_beds_used_covid", int), + Columndef("inpatient_beds_used_covid_coverage", "inpatient_beds_used_covid_coverage", int), + Columndef("inpatient_beds_utilization", "inpatient_beds_utilization", float), + 
Columndef("inpatient_beds_utilization_coverage", "inpatient_beds_utilization_coverage", int), + Columndef("inpatient_beds_utilization_denominator", "inpatient_beds_utilization_denominator", int), + Columndef("inpatient_beds_utilization_numerator", "inpatient_beds_utilization_numerator", int), + Columndef("on_hand_supply_therapeutic_a_casirivimab_imdevimab_courses", "on_hand_supply_therapeutic_a_casirivimab_imdevimab_courses", int), + Columndef("on_hand_supply_therapeutic_b_bamlanivimab_courses", "on_hand_supply_therapeutic_b_bamlanivimab_courses", int), + Columndef("on_hand_supply_therapeutic_c_bamlanivimab_etesevimab_courses", "on_hand_supply_therapeutic_c_bamlanivimab_etesevimab_courses", int), + Columndef("percent_of_inpatients_with_covid", "percent_of_inpatients_with_covid", float), + Columndef("percent_of_inpatients_with_covid_coverage", "percent_of_inpatients_with_covid_coverage", int), + Columndef("percent_of_inpatients_with_covid_denominator", "percent_of_inpatients_with_covid_denominator", int), + Columndef("percent_of_inpatients_with_covid_numerator", "percent_of_inpatients_with_covid_numerator", int), + Columndef("previous_day_admission_adult_covid_confirmed", "previous_day_admission_adult_covid_confirmed", int), + Columndef("previous_day_admission_adult_covid_confirmed_18-19", "previous_day_admission_adult_covid_confirmed_18_19", int), + Columndef("previous_day_admission_adult_covid_confirmed_18-19_coverage", "previous_day_admission_adult_covid_confirmed_18_19_coverage", int), + Columndef("previous_day_admission_adult_covid_confirmed_20-29", "previous_day_admission_adult_covid_confirmed_20_29", int), + Columndef("previous_day_admission_adult_covid_confirmed_20-29_coverage", "previous_day_admission_adult_covid_confirmed_20_29_coverage", int), + Columndef("previous_day_admission_adult_covid_confirmed_30-39", "previous_day_admission_adult_covid_confirmed_30_39", int), + Columndef("previous_day_admission_adult_covid_confirmed_30-39_coverage", "previous_day_admission_adult_covid_confirmed_30_39_coverage", int), + Columndef("previous_day_admission_adult_covid_confirmed_40-49", "previous_day_admission_adult_covid_confirmed_40_49", int), + Columndef("previous_day_admission_adult_covid_confirmed_40-49_coverage", "previous_day_admission_adult_covid_confirmed_40_49_coverage", int), + Columndef("previous_day_admission_adult_covid_confirmed_50-59", "previous_day_admission_adult_covid_confirmed_50_59", int), + Columndef("previous_day_admission_adult_covid_confirmed_50-59_coverage", "previous_day_admission_adult_covid_confirmed_50_59_coverage", int), + Columndef("previous_day_admission_adult_covid_confirmed_60-69", "previous_day_admission_adult_covid_confirmed_60_69", int), + Columndef("previous_day_admission_adult_covid_confirmed_60-69_coverage", "previous_day_admission_adult_covid_confirmed_60_69_coverage", int), + Columndef("previous_day_admission_adult_covid_confirmed_70-79", "previous_day_admission_adult_covid_confirmed_70_79", int), + Columndef("previous_day_admission_adult_covid_confirmed_70-79_coverage", "previous_day_admission_adult_covid_confirmed_70_79_coverage", int), + Columndef("previous_day_admission_adult_covid_confirmed_80+", "previous_day_admission_adult_covid_confirmed_80plus", int), + Columndef("previous_day_admission_adult_covid_confirmed_80+_coverage", "previous_day_admission_adult_covid_confirmed_80plus_coverage", int), + Columndef("previous_day_admission_adult_covid_confirmed_coverage", "previous_day_admission_adult_covid_confirmed_coverage", int), + 
Columndef("previous_day_admission_adult_covid_confirmed_unknown", "previous_day_admission_adult_covid_confirmed_unknown", int), + Columndef("previous_day_admission_adult_covid_confirmed_unknown_coverage", "previous_day_admission_adult_covid_confirmed_unknown_coverage", int), + Columndef("previous_day_admission_adult_covid_suspected", "previous_day_admission_adult_covid_suspected", int), + Columndef("previous_day_admission_adult_covid_suspected_18-19", "previous_day_admission_adult_covid_suspected_18_19", int), + Columndef("previous_day_admission_adult_covid_suspected_18-19_coverage", "previous_day_admission_adult_covid_suspected_18_19_coverage", int), + Columndef("previous_day_admission_adult_covid_suspected_20-29", "previous_day_admission_adult_covid_suspected_20_29", int), + Columndef("previous_day_admission_adult_covid_suspected_20-29_coverage", "previous_day_admission_adult_covid_suspected_20_29_coverage", int), + Columndef("previous_day_admission_adult_covid_suspected_30-39", "previous_day_admission_adult_covid_suspected_30_39", int), + Columndef("previous_day_admission_adult_covid_suspected_30-39_coverage", "previous_day_admission_adult_covid_suspected_30_39_coverage", int), + Columndef("previous_day_admission_adult_covid_suspected_40-49", "previous_day_admission_adult_covid_suspected_40_49", int), + Columndef("previous_day_admission_adult_covid_suspected_40-49_coverage", "previous_day_admission_adult_covid_suspected_40_49_coverage", int), + Columndef("previous_day_admission_adult_covid_suspected_50-59", "previous_day_admission_adult_covid_suspected_50_59", int), + Columndef("previous_day_admission_adult_covid_suspected_50-59_coverage", "previous_day_admission_adult_covid_suspected_50_59_coverage", int), + Columndef("previous_day_admission_adult_covid_suspected_60_69", "previous_day_admission_adult_covid_suspected_60_69", int), # this is correct; csv header is irregular + Columndef("previous_day_admission_adult_covid_suspected_60-69_coverage", "previous_day_admission_adult_covid_suspected_60_69_coverage", int), + Columndef("previous_day_admission_adult_covid_suspected_70-79", "previous_day_admission_adult_covid_suspected_70_79", int), + Columndef("previous_day_admission_adult_covid_suspected_70-79_coverage", "previous_day_admission_adult_covid_suspected_70_79_coverage", int), + Columndef("previous_day_admission_adult_covid_suspected_80", "previous_day_admission_adult_covid_suspected_80plus", int), + Columndef("previous_day_admission_adult_covid_suspected_80+_coverage", "previous_day_admission_adult_covid_suspected_80plus_coverage", int), + Columndef("previous_day_admission_adult_covid_suspected_coverage", "previous_day_admission_adult_covid_suspected_coverage", int), + Columndef("previous_day_admission_adult_covid_suspected_unknown", "previous_day_admission_adult_covid_suspected_unknown", int), + Columndef("previous_day_admission_adult_covid_suspected_unknown_coverage", "previous_day_admission_adult_covid_suspected_unknown_coverage", int), + Columndef("previous_day_admission_influenza_confirmed", "previous_day_admission_influenza_confirmed", int), + Columndef("previous_day_admission_influenza_confirmed_coverage", "previous_day_admission_influenza_confirmed_coverage", int), + Columndef("previous_day_admission_pediatric_covid_confirmed", "previous_day_admission_pediatric_covid_confirmed", int), + Columndef("previous_day_admission_pediatric_covid_confirmed_coverage", "previous_day_admission_pediatric_covid_confirmed_coverage", int), + 
Columndef("previous_day_admission_pediatric_covid_suspected", "previous_day_admission_pediatric_covid_suspected", int), + Columndef("previous_day_admission_pediatric_covid_suspected_coverage", "previous_day_admission_pediatric_covid_suspected_coverage", int), + Columndef("previous_day_deaths_covid_and_influenza", "previous_day_deaths_covid_and_influenza", int), + Columndef("previous_day_deaths_covid_and_influenza_coverage", "previous_day_deaths_covid_and_influenza_coverage", int), + Columndef("previous_day_deaths_influenza", "previous_day_deaths_influenza", int), + Columndef("previous_day_deaths_influenza_coverage", "previous_day_deaths_influenza_coverage", int), + Columndef("previous_week_therapeutic_a_casirivimab_imdevimab_courses_used", "previous_week_therapeutic_a_casirivimab_imdevimab_courses_used", int), + Columndef("previous_week_therapeutic_b_bamlanivimab_courses_used", "previous_week_therapeutic_b_bamlanivimab_courses_used", int), + Columndef("previous_week_therapeutic_c_bamlanivimab_etesevimab_courses_used", "previous_week_therapeutic_c_bamlanivimab_etesevimab_courses_used", int), + Columndef("staffed_adult_icu_bed_occupancy", "staffed_adult_icu_bed_occupancy", int), + Columndef("staffed_adult_icu_bed_occupancy_coverage", "staffed_adult_icu_bed_occupancy_coverage", int), + Columndef("staffed_icu_adult_patients_confirmed_and_suspected_covid", "staffed_icu_adult_patients_confirmed_suspected_covid", int), + Columndef("staffed_icu_adult_patients_confirmed_and_suspected_covid_coverage", "staffed_icu_adult_patients_confirmed_suspected_covid_coverage", int), + Columndef("staffed_icu_adult_patients_confirmed_covid", "staffed_icu_adult_patients_confirmed_covid", int), + Columndef("staffed_icu_adult_patients_confirmed_covid_coverage", "staffed_icu_adult_patients_confirmed_covid_coverage", int), + Columndef("total_adult_patients_hospitalized_confirmed_and_suspected_covid", "total_adult_patients_hosp_confirmed_suspected_covid", int), + Columndef("total_adult_patients_hospitalized_confirmed_and_suspected_covid_coverage", "total_adult_patients_hosp_confirmed_suspected_covid_coverage", int), + Columndef("total_adult_patients_hospitalized_confirmed_covid", "total_adult_patients_hosp_confirmed_covid", int), + Columndef("total_adult_patients_hospitalized_confirmed_covid_coverage", "total_adult_patients_hosp_confirmed_covid_coverage", int), + Columndef("total_patients_hospitalized_confirmed_influenza", "total_patients_hospitalized_confirmed_influenza", int), + Columndef("total_patients_hospitalized_confirmed_influenza_coverage", "total_patients_hospitalized_confirmed_influenza_coverage", int), + Columndef("total_patients_hospitalized_confirmed_influenza_covid", "total_patients_hospitalized_confirmed_influenza_covid", int), + Columndef("total_patients_hospitalized_confirmed_influenza_covid_coverage", "total_patients_hospitalized_confirmed_influenza_covid_coverage", int), + Columndef("total_pediatric_patients_hospitalized_confirmed_and_suspected_covid", "total_pediatric_patients_hosp_confirmed_suspected_covid", int), + Columndef( + "total_pediatric_patients_hospitalized_confirmed_and_suspected_covid_coverage", + "total_pediatric_patients_hosp_confirmed_suspected_covid_coverage", + int, + ), + Columndef("total_pediatric_patients_hospitalized_confirmed_covid", "total_pediatric_patients_hosp_confirmed_covid", int), + Columndef("total_pediatric_patients_hospitalized_confirmed_covid_coverage", "total_pediatric_patients_hosp_confirmed_covid_coverage", int), + Columndef("total_staffed_adult_icu_beds", 
"total_staffed_adult_icu_beds", int), + Columndef("total_staffed_adult_icu_beds_coverage", "total_staffed_adult_icu_beds_coverage", int), + ] - def __init__(self, *args, **kwargs): - super().__init__( - *args, - **kwargs, - table_name=Database.TABLE_NAME, - hhs_dataset_id=Network.DATASET_ID, - columns_and_types=Database.ORDERED_CSV_COLUMNS, - key_columns=Database.KEY_COLS, - additional_fields=[Columndef('D', 'record_type', None)]) + def __init__(self, *args, **kwargs): + super().__init__( + *args, + **kwargs, + table_name=Database.TABLE_NAME, + hhs_dataset_id=Network.DATASET_ID, + columns_and_types=Database.ORDERED_CSV_COLUMNS, + key_columns=Database.KEY_COLS, + additional_fields=[Columndef("D", "record_type", None)], + ) diff --git a/src/acquisition/covid_hosp/state_daily/network.py b/src/acquisition/covid_hosp/state_daily/network.py index f4678cc9b..5e4e9e4fb 100644 --- a/src/acquisition/covid_hosp/state_daily/network.py +++ b/src/acquisition/covid_hosp/state_daily/network.py @@ -1,36 +1,36 @@ # first party from delphi.epidata.acquisition.covid_hosp.common.network import Network as BaseNetwork + class Network(BaseNetwork): - DATASET_ID = '6xf2-c3ie' - METADATA_ID = '4cnb-m4rz' + DATASET_ID = "6xf2-c3ie" + METADATA_ID = "4cnb-m4rz" - @staticmethod - def fetch_metadata(*args, **kwags): - """Download and return metadata. + @staticmethod + def fetch_metadata(*args, **kwags): + """Download and return metadata. - See `fetch_metadata_for_dataset`. - """ + See `fetch_metadata_for_dataset`. + """ - return Network.fetch_metadata_for_dataset( - *args, **kwags, dataset_id=Network.METADATA_ID) + return Network.fetch_metadata_for_dataset(*args, **kwags, dataset_id=Network.METADATA_ID) - @staticmethod - def fetch_revisions(metadata, newer_than): - """ - Extract all dataset URLs from metadata for issues after newer_than. + @staticmethod + def fetch_revisions(metadata, newer_than): + """ + Extract all dataset URLs from metadata for issues after newer_than. - Parameters - ---------- - metadata DataFrame - Metadata DF containing all rows of metadata from data source page. + Parameters + ---------- + metadata DataFrame + Metadata DF containing all rows of metadata from data source page. - newer_than Timestamp or datetime - Date and time of issue to use as lower bound for new URLs. + newer_than Timestamp or datetime + Date and time of issue to use as lower bound for new URLs. - Returns - ------- - List of URLs of issues after newer_than - """ - return list(metadata.loc[metadata.index > newer_than, "Archive Link"]) + Returns + ------- + List of URLs of issues after newer_than + """ + return list(metadata.loc[metadata.index > newer_than, "Archive Link"]) diff --git a/src/acquisition/covid_hosp/state_daily/update.py b/src/acquisition/covid_hosp/state_daily/update.py index 12a51e6c3..d44efa369 100644 --- a/src/acquisition/covid_hosp/state_daily/update.py +++ b/src/acquisition/covid_hosp/state_daily/update.py @@ -10,18 +10,17 @@ class Update: + @staticmethod + def run(network=Network): + """Acquire the most recent dataset, unless it was previously acquired. - @staticmethod - def run(network=Network): - """Acquire the most recent dataset, unless it was previously acquired. + Returns + ------- + bool + Whether a new dataset was acquired. + """ - Returns - ------- - bool - Whether a new dataset was acquired. 
- """ - - return Utils.update_dataset(Database, network) + return Utils.update_dataset(Database, network) # main entry point diff --git a/src/acquisition/covid_hosp/state_timeseries/database.py b/src/acquisition/covid_hosp/state_timeseries/database.py index 348d9fc0b..b1d1c98af 100644 --- a/src/acquisition/covid_hosp/state_timeseries/database.py +++ b/src/acquisition/covid_hosp/state_timeseries/database.py @@ -7,223 +7,143 @@ class Database(BaseDatabase): - TABLE_NAME = 'covid_hosp_state_timeseries' - KEY_COLS = ['state', 'date'] - # These are 3-tuples of (CSV header name, SQL db column name, data type) for - # all the columns in the CSV file. - # Note that the corresponding database column names may be shorter - # due to constraints on the length of column names. See - # /src/ddl/covid_hosp.sql for more information. - # Additionally, all column names below are shared with state_daily, - # except for reporting_cutoff_start (there) and date (here). If you need - # to update a column name, do it in both places. - ORDERED_CSV_COLUMNS = [ - Columndef('state', 'state', str), - Columndef('date', 'date', Utils.int_from_date), - Columndef('adult_icu_bed_covid_utilization', 'adult_icu_bed_covid_utilization', float), - Columndef('adult_icu_bed_covid_utilization_coverage', 'adult_icu_bed_covid_utilization_coverage', int), - Columndef('adult_icu_bed_covid_utilization_denominator', 'adult_icu_bed_covid_utilization_denominator', - int), - Columndef('adult_icu_bed_covid_utilization_numerator', 'adult_icu_bed_covid_utilization_numerator', - int), - Columndef('adult_icu_bed_utilization', 'adult_icu_bed_utilization', float), - Columndef('adult_icu_bed_utilization_coverage', 'adult_icu_bed_utilization_coverage', int), - Columndef('adult_icu_bed_utilization_denominator', 'adult_icu_bed_utilization_denominator', int), - Columndef('adult_icu_bed_utilization_numerator', 'adult_icu_bed_utilization_numerator', int), - Columndef('critical_staffing_shortage_anticipated_within_week_no', - 'critical_staffing_shortage_anticipated_within_week_no', int), - Columndef('critical_staffing_shortage_anticipated_within_week_not_reported', - 'critical_staffing_shortage_anticipated_within_week_not_reported', int), - Columndef('critical_staffing_shortage_anticipated_within_week_yes', - 'critical_staffing_shortage_anticipated_within_week_yes', int), - Columndef('critical_staffing_shortage_today_no', 'critical_staffing_shortage_today_no', int), - Columndef('critical_staffing_shortage_today_not_reported', - 'critical_staffing_shortage_today_not_reported', int), - Columndef('critical_staffing_shortage_today_yes', 'critical_staffing_shortage_today_yes', int), - Columndef('deaths_covid', 'deaths_covid', int), - Columndef('deaths_covid_coverage', 'deaths_covid_coverage', int), - Columndef('geocoded_state', 'geocoded_state', str), - Columndef('hospital_onset_covid', 'hospital_onset_covid', int), - Columndef('hospital_onset_covid_coverage', 'hospital_onset_covid_coverage', int), - Columndef('icu_patients_confirmed_influenza', 'icu_patients_confirmed_influenza', int), - Columndef('icu_patients_confirmed_influenza_coverage', 'icu_patients_confirmed_influenza_coverage', - int), - Columndef('inpatient_bed_covid_utilization', 'inpatient_bed_covid_utilization', float), - Columndef('inpatient_bed_covid_utilization_coverage', 'inpatient_bed_covid_utilization_coverage', int), - Columndef('inpatient_bed_covid_utilization_denominator', 'inpatient_bed_covid_utilization_denominator', - int), - Columndef('inpatient_bed_covid_utilization_numerator', 
'inpatient_bed_covid_utilization_numerator', - int), - Columndef('inpatient_beds', 'inpatient_beds', int), - Columndef('inpatient_beds_coverage', 'inpatient_beds_coverage', int), - Columndef('inpatient_beds_used', 'inpatient_beds_used', int), - Columndef('inpatient_beds_used_coverage', 'inpatient_beds_used_coverage', int), - Columndef('inpatient_beds_used_covid', 'inpatient_beds_used_covid', int), - Columndef('inpatient_beds_used_covid_coverage', 'inpatient_beds_used_covid_coverage', int), - Columndef('inpatient_beds_utilization', 'inpatient_beds_utilization', float), - Columndef('inpatient_beds_utilization_coverage', 'inpatient_beds_utilization_coverage', int), - Columndef('inpatient_beds_utilization_denominator', 'inpatient_beds_utilization_denominator', int), - Columndef('inpatient_beds_utilization_numerator', 'inpatient_beds_utilization_numerator', int), - Columndef('on_hand_supply_therapeutic_a_casirivimab_imdevimab_courses', - 'on_hand_supply_therapeutic_a_casirivimab_imdevimab_courses', int), - Columndef('on_hand_supply_therapeutic_b_bamlanivimab_courses', - 'on_hand_supply_therapeutic_b_bamlanivimab_courses', int), - Columndef('on_hand_supply_therapeutic_c_bamlanivimab_etesevimab_courses', - 'on_hand_supply_therapeutic_c_bamlanivimab_etesevimab_courses', int), - Columndef('percent_of_inpatients_with_covid', 'percent_of_inpatients_with_covid', float), - Columndef('percent_of_inpatients_with_covid_coverage', 'percent_of_inpatients_with_covid_coverage', - int), - Columndef('percent_of_inpatients_with_covid_denominator', - 'percent_of_inpatients_with_covid_denominator', int), - Columndef('percent_of_inpatients_with_covid_numerator', 'percent_of_inpatients_with_covid_numerator', - int), - Columndef('previous_day_admission_adult_covid_confirmed', - 'previous_day_admission_adult_covid_confirmed', int), - Columndef('previous_day_admission_adult_covid_confirmed_18-19', - 'previous_day_admission_adult_covid_confirmed_18_19', int), - Columndef('previous_day_admission_adult_covid_confirmed_18-19_coverage', - 'previous_day_admission_adult_covid_confirmed_18_19_coverage', int), - Columndef('previous_day_admission_adult_covid_confirmed_20-29', - 'previous_day_admission_adult_covid_confirmed_20_29', int), - Columndef('previous_day_admission_adult_covid_confirmed_20-29_coverage', - 'previous_day_admission_adult_covid_confirmed_20_29_coverage', int), - Columndef('previous_day_admission_adult_covid_confirmed_30-39', - 'previous_day_admission_adult_covid_confirmed_30_39', int), - Columndef('previous_day_admission_adult_covid_confirmed_30-39_coverage', - 'previous_day_admission_adult_covid_confirmed_30_39_coverage', int), - Columndef('previous_day_admission_adult_covid_confirmed_40-49', - 'previous_day_admission_adult_covid_confirmed_40_49', int), - Columndef('previous_day_admission_adult_covid_confirmed_40-49_coverage', - 'previous_day_admission_adult_covid_confirmed_40_49_coverage', int), - Columndef('previous_day_admission_adult_covid_confirmed_50-59', - 'previous_day_admission_adult_covid_confirmed_50_59', int), - Columndef('previous_day_admission_adult_covid_confirmed_50-59_coverage', - 'previous_day_admission_adult_covid_confirmed_50_59_coverage', int), - Columndef('previous_day_admission_adult_covid_confirmed_60-69', - 'previous_day_admission_adult_covid_confirmed_60_69', int), - Columndef('previous_day_admission_adult_covid_confirmed_60-69_coverage', - 'previous_day_admission_adult_covid_confirmed_60_69_coverage', int), - Columndef('previous_day_admission_adult_covid_confirmed_70-79', - 
'previous_day_admission_adult_covid_confirmed_70_79', int), - Columndef('previous_day_admission_adult_covid_confirmed_70-79_coverage', - 'previous_day_admission_adult_covid_confirmed_70_79_coverage', int), - Columndef('previous_day_admission_adult_covid_confirmed_80+', - 'previous_day_admission_adult_covid_confirmed_80plus', int), - Columndef('previous_day_admission_adult_covid_confirmed_80+_coverage', - 'previous_day_admission_adult_covid_confirmed_80plus_coverage', int), - Columndef('previous_day_admission_adult_covid_confirmed_coverage', - 'previous_day_admission_adult_covid_confirmed_coverage', int), - Columndef('previous_day_admission_adult_covid_confirmed_unknown', - 'previous_day_admission_adult_covid_confirmed_unknown', int), - Columndef('previous_day_admission_adult_covid_confirmed_unknown_coverage', - 'previous_day_admission_adult_covid_confirmed_unknown_coverage', int), - Columndef('previous_day_admission_adult_covid_suspected', - 'previous_day_admission_adult_covid_suspected', int), - Columndef('previous_day_admission_adult_covid_suspected_18-19', - 'previous_day_admission_adult_covid_suspected_18_19', int), - Columndef('previous_day_admission_adult_covid_suspected_18-19_coverage', - 'previous_day_admission_adult_covid_suspected_18_19_coverage', int), - Columndef('previous_day_admission_adult_covid_suspected_20-29', - 'previous_day_admission_adult_covid_suspected_20_29', int), - Columndef('previous_day_admission_adult_covid_suspected_20-29_coverage', - 'previous_day_admission_adult_covid_suspected_20_29_coverage', int), - Columndef('previous_day_admission_adult_covid_suspected_30-39', - 'previous_day_admission_adult_covid_suspected_30_39', int), - Columndef('previous_day_admission_adult_covid_suspected_30-39_coverage', - 'previous_day_admission_adult_covid_suspected_30_39_coverage', int), - Columndef('previous_day_admission_adult_covid_suspected_40-49', - 'previous_day_admission_adult_covid_suspected_40_49', int), - Columndef('previous_day_admission_adult_covid_suspected_40-49_coverage', - 'previous_day_admission_adult_covid_suspected_40_49_coverage', int), - Columndef('previous_day_admission_adult_covid_suspected_50-59', - 'previous_day_admission_adult_covid_suspected_50_59', int), - Columndef('previous_day_admission_adult_covid_suspected_50-59_coverage', - 'previous_day_admission_adult_covid_suspected_50_59_coverage', int), - Columndef('previous_day_admission_adult_covid_suspected_60-69', - 'previous_day_admission_adult_covid_suspected_60_69', int), - Columndef('previous_day_admission_adult_covid_suspected_60-69_coverage', - 'previous_day_admission_adult_covid_suspected_60_69_coverage', int), - Columndef('previous_day_admission_adult_covid_suspected_70-79', - 'previous_day_admission_adult_covid_suspected_70_79', int), - Columndef('previous_day_admission_adult_covid_suspected_70-79_coverage', - 'previous_day_admission_adult_covid_suspected_70_79_coverage', int), - Columndef('previous_day_admission_adult_covid_suspected_80+', - 'previous_day_admission_adult_covid_suspected_80plus', int), - Columndef('previous_day_admission_adult_covid_suspected_80+_coverage', - 'previous_day_admission_adult_covid_suspected_80plus_coverage', int), - Columndef('previous_day_admission_adult_covid_suspected_coverage', - 'previous_day_admission_adult_covid_suspected_coverage', int), - Columndef('previous_day_admission_adult_covid_suspected_unknown', - 'previous_day_admission_adult_covid_suspected_unknown', int), - Columndef('previous_day_admission_adult_covid_suspected_unknown_coverage', - 
'previous_day_admission_adult_covid_suspected_unknown_coverage', int), - Columndef('previous_day_admission_influenza_confirmed', 'previous_day_admission_influenza_confirmed', - int), - Columndef('previous_day_admission_influenza_confirmed_coverage', - 'previous_day_admission_influenza_confirmed_coverage', int), - Columndef('previous_day_admission_pediatric_covid_confirmed', - 'previous_day_admission_pediatric_covid_confirmed', int), - Columndef('previous_day_admission_pediatric_covid_confirmed_coverage', - 'previous_day_admission_pediatric_covid_confirmed_coverage', int), - Columndef('previous_day_admission_pediatric_covid_suspected', - 'previous_day_admission_pediatric_covid_suspected', int), - Columndef('previous_day_admission_pediatric_covid_suspected_coverage', - 'previous_day_admission_pediatric_covid_suspected_coverage', int), - Columndef('previous_day_deaths_covid_and_influenza', 'previous_day_deaths_covid_and_influenza', int), - Columndef('previous_day_deaths_covid_and_influenza_coverage', - 'previous_day_deaths_covid_and_influenza_coverage', int), - Columndef('previous_day_deaths_influenza', 'previous_day_deaths_influenza', int), - Columndef('previous_day_deaths_influenza_coverage', 'previous_day_deaths_influenza_coverage', int), - Columndef('previous_week_therapeutic_a_casirivimab_imdevimab_courses_used', - 'previous_week_therapeutic_a_casirivimab_imdevimab_courses_used', int), - Columndef('previous_week_therapeutic_b_bamlanivimab_courses_used', - 'previous_week_therapeutic_b_bamlanivimab_courses_used', int), - Columndef('previous_week_therapeutic_c_bamlanivimab_etesevimab_courses_used', - 'previous_week_therapeutic_c_bamlanivimab_etesevimab_courses_used', int), - Columndef('staffed_adult_icu_bed_occupancy', 'staffed_adult_icu_bed_occupancy', int), - Columndef('staffed_adult_icu_bed_occupancy_coverage', 'staffed_adult_icu_bed_occupancy_coverage', int), - Columndef('staffed_icu_adult_patients_confirmed_and_suspected_covid', - 'staffed_icu_adult_patients_confirmed_suspected_covid', int), - Columndef('staffed_icu_adult_patients_confirmed_and_suspected_covid_coverage', - 'staffed_icu_adult_patients_confirmed_suspected_covid_coverage', int), - Columndef('staffed_icu_adult_patients_confirmed_covid', 'staffed_icu_adult_patients_confirmed_covid', - int), - Columndef('staffed_icu_adult_patients_confirmed_covid_coverage', - 'staffed_icu_adult_patients_confirmed_covid_coverage', int), - Columndef('total_adult_patients_hospitalized_confirmed_and_suspected_covid', - 'total_adult_patients_hosp_confirmed_suspected_covid', int), - Columndef('total_adult_patients_hospitalized_confirmed_and_suspected_covid_coverage', - 'total_adult_patients_hosp_confirmed_suspected_covid_coverage', int), - Columndef('total_adult_patients_hospitalized_confirmed_covid', - 'total_adult_patients_hosp_confirmed_covid', int), - Columndef('total_adult_patients_hospitalized_confirmed_covid_coverage', - 'total_adult_patients_hosp_confirmed_covid_coverage', int), - Columndef('total_patients_hospitalized_confirmed_influenza', - 'total_patients_hospitalized_confirmed_influenza', int), - Columndef('total_patients_hospitalized_confirmed_influenza_coverage', - 'total_patients_hospitalized_confirmed_influenza_coverage', int), - Columndef('total_patients_hospitalized_confirmed_influenza_covid', - 'total_patients_hospitalized_confirmed_influenza_covid', int), - Columndef('total_patients_hospitalized_confirmed_influenza_covid_coverage', - 'total_patients_hospitalized_confirmed_influenza_covid_coverage', int), - 
Columndef('total_pediatric_patients_hospitalized_confirmed_and_suspected_covid', - 'total_pediatric_patients_hosp_confirmed_suspected_covid', int), - Columndef('total_pediatric_patients_hospitalized_confirmed_and_suspected_covid_coverage', - 'total_pediatric_patients_hosp_confirmed_suspected_covid_coverage', int), - Columndef('total_pediatric_patients_hospitalized_confirmed_covid', - 'total_pediatric_patients_hosp_confirmed_covid', int), - Columndef('total_pediatric_patients_hospitalized_confirmed_covid_coverage', - 'total_pediatric_patients_hosp_confirmed_covid_coverage', int), - Columndef('total_staffed_adult_icu_beds', 'total_staffed_adult_icu_beds', int), - Columndef('total_staffed_adult_icu_beds_coverage', 'total_staffed_adult_icu_beds_coverage', int), - ] + TABLE_NAME = "covid_hosp_state_timeseries" + KEY_COLS = ["state", "date"] + # These are 3-tuples of (CSV header name, SQL db column name, data type) for + # all the columns in the CSV file. + # Note that the corresponding database column names may be shorter + # due to constraints on the length of column names. See + # /src/ddl/covid_hosp.sql for more information. + # Additionally, all column names below are shared with state_daily, + # except for reporting_cutoff_start (there) and date (here). If you need + # to update a column name, do it in both places. + ORDERED_CSV_COLUMNS = [ + Columndef("state", "state", str), + Columndef("date", "date", Utils.int_from_date), + Columndef("adult_icu_bed_covid_utilization", "adult_icu_bed_covid_utilization", float), + Columndef("adult_icu_bed_covid_utilization_coverage", "adult_icu_bed_covid_utilization_coverage", int), + Columndef("adult_icu_bed_covid_utilization_denominator", "adult_icu_bed_covid_utilization_denominator", int), + Columndef("adult_icu_bed_covid_utilization_numerator", "adult_icu_bed_covid_utilization_numerator", int), + Columndef("adult_icu_bed_utilization", "adult_icu_bed_utilization", float), + Columndef("adult_icu_bed_utilization_coverage", "adult_icu_bed_utilization_coverage", int), + Columndef("adult_icu_bed_utilization_denominator", "adult_icu_bed_utilization_denominator", int), + Columndef("adult_icu_bed_utilization_numerator", "adult_icu_bed_utilization_numerator", int), + Columndef("critical_staffing_shortage_anticipated_within_week_no", "critical_staffing_shortage_anticipated_within_week_no", int), + Columndef("critical_staffing_shortage_anticipated_within_week_not_reported", "critical_staffing_shortage_anticipated_within_week_not_reported", int), + Columndef("critical_staffing_shortage_anticipated_within_week_yes", "critical_staffing_shortage_anticipated_within_week_yes", int), + Columndef("critical_staffing_shortage_today_no", "critical_staffing_shortage_today_no", int), + Columndef("critical_staffing_shortage_today_not_reported", "critical_staffing_shortage_today_not_reported", int), + Columndef("critical_staffing_shortage_today_yes", "critical_staffing_shortage_today_yes", int), + Columndef("deaths_covid", "deaths_covid", int), + Columndef("deaths_covid_coverage", "deaths_covid_coverage", int), + Columndef("geocoded_state", "geocoded_state", str), + Columndef("hospital_onset_covid", "hospital_onset_covid", int), + Columndef("hospital_onset_covid_coverage", "hospital_onset_covid_coverage", int), + Columndef("icu_patients_confirmed_influenza", "icu_patients_confirmed_influenza", int), + Columndef("icu_patients_confirmed_influenza_coverage", "icu_patients_confirmed_influenza_coverage", int), + Columndef("inpatient_bed_covid_utilization", 
"inpatient_bed_covid_utilization", float), + Columndef("inpatient_bed_covid_utilization_coverage", "inpatient_bed_covid_utilization_coverage", int), + Columndef("inpatient_bed_covid_utilization_denominator", "inpatient_bed_covid_utilization_denominator", int), + Columndef("inpatient_bed_covid_utilization_numerator", "inpatient_bed_covid_utilization_numerator", int), + Columndef("inpatient_beds", "inpatient_beds", int), + Columndef("inpatient_beds_coverage", "inpatient_beds_coverage", int), + Columndef("inpatient_beds_used", "inpatient_beds_used", int), + Columndef("inpatient_beds_used_coverage", "inpatient_beds_used_coverage", int), + Columndef("inpatient_beds_used_covid", "inpatient_beds_used_covid", int), + Columndef("inpatient_beds_used_covid_coverage", "inpatient_beds_used_covid_coverage", int), + Columndef("inpatient_beds_utilization", "inpatient_beds_utilization", float), + Columndef("inpatient_beds_utilization_coverage", "inpatient_beds_utilization_coverage", int), + Columndef("inpatient_beds_utilization_denominator", "inpatient_beds_utilization_denominator", int), + Columndef("inpatient_beds_utilization_numerator", "inpatient_beds_utilization_numerator", int), + Columndef("on_hand_supply_therapeutic_a_casirivimab_imdevimab_courses", "on_hand_supply_therapeutic_a_casirivimab_imdevimab_courses", int), + Columndef("on_hand_supply_therapeutic_b_bamlanivimab_courses", "on_hand_supply_therapeutic_b_bamlanivimab_courses", int), + Columndef("on_hand_supply_therapeutic_c_bamlanivimab_etesevimab_courses", "on_hand_supply_therapeutic_c_bamlanivimab_etesevimab_courses", int), + Columndef("percent_of_inpatients_with_covid", "percent_of_inpatients_with_covid", float), + Columndef("percent_of_inpatients_with_covid_coverage", "percent_of_inpatients_with_covid_coverage", int), + Columndef("percent_of_inpatients_with_covid_denominator", "percent_of_inpatients_with_covid_denominator", int), + Columndef("percent_of_inpatients_with_covid_numerator", "percent_of_inpatients_with_covid_numerator", int), + Columndef("previous_day_admission_adult_covid_confirmed", "previous_day_admission_adult_covid_confirmed", int), + Columndef("previous_day_admission_adult_covid_confirmed_18-19", "previous_day_admission_adult_covid_confirmed_18_19", int), + Columndef("previous_day_admission_adult_covid_confirmed_18-19_coverage", "previous_day_admission_adult_covid_confirmed_18_19_coverage", int), + Columndef("previous_day_admission_adult_covid_confirmed_20-29", "previous_day_admission_adult_covid_confirmed_20_29", int), + Columndef("previous_day_admission_adult_covid_confirmed_20-29_coverage", "previous_day_admission_adult_covid_confirmed_20_29_coverage", int), + Columndef("previous_day_admission_adult_covid_confirmed_30-39", "previous_day_admission_adult_covid_confirmed_30_39", int), + Columndef("previous_day_admission_adult_covid_confirmed_30-39_coverage", "previous_day_admission_adult_covid_confirmed_30_39_coverage", int), + Columndef("previous_day_admission_adult_covid_confirmed_40-49", "previous_day_admission_adult_covid_confirmed_40_49", int), + Columndef("previous_day_admission_adult_covid_confirmed_40-49_coverage", "previous_day_admission_adult_covid_confirmed_40_49_coverage", int), + Columndef("previous_day_admission_adult_covid_confirmed_50-59", "previous_day_admission_adult_covid_confirmed_50_59", int), + Columndef("previous_day_admission_adult_covid_confirmed_50-59_coverage", "previous_day_admission_adult_covid_confirmed_50_59_coverage", int), + Columndef("previous_day_admission_adult_covid_confirmed_60-69", 
"previous_day_admission_adult_covid_confirmed_60_69", int), + Columndef("previous_day_admission_adult_covid_confirmed_60-69_coverage", "previous_day_admission_adult_covid_confirmed_60_69_coverage", int), + Columndef("previous_day_admission_adult_covid_confirmed_70-79", "previous_day_admission_adult_covid_confirmed_70_79", int), + Columndef("previous_day_admission_adult_covid_confirmed_70-79_coverage", "previous_day_admission_adult_covid_confirmed_70_79_coverage", int), + Columndef("previous_day_admission_adult_covid_confirmed_80+", "previous_day_admission_adult_covid_confirmed_80plus", int), + Columndef("previous_day_admission_adult_covid_confirmed_80+_coverage", "previous_day_admission_adult_covid_confirmed_80plus_coverage", int), + Columndef("previous_day_admission_adult_covid_confirmed_coverage", "previous_day_admission_adult_covid_confirmed_coverage", int), + Columndef("previous_day_admission_adult_covid_confirmed_unknown", "previous_day_admission_adult_covid_confirmed_unknown", int), + Columndef("previous_day_admission_adult_covid_confirmed_unknown_coverage", "previous_day_admission_adult_covid_confirmed_unknown_coverage", int), + Columndef("previous_day_admission_adult_covid_suspected", "previous_day_admission_adult_covid_suspected", int), + Columndef("previous_day_admission_adult_covid_suspected_18-19", "previous_day_admission_adult_covid_suspected_18_19", int), + Columndef("previous_day_admission_adult_covid_suspected_18-19_coverage", "previous_day_admission_adult_covid_suspected_18_19_coverage", int), + Columndef("previous_day_admission_adult_covid_suspected_20-29", "previous_day_admission_adult_covid_suspected_20_29", int), + Columndef("previous_day_admission_adult_covid_suspected_20-29_coverage", "previous_day_admission_adult_covid_suspected_20_29_coverage", int), + Columndef("previous_day_admission_adult_covid_suspected_30-39", "previous_day_admission_adult_covid_suspected_30_39", int), + Columndef("previous_day_admission_adult_covid_suspected_30-39_coverage", "previous_day_admission_adult_covid_suspected_30_39_coverage", int), + Columndef("previous_day_admission_adult_covid_suspected_40-49", "previous_day_admission_adult_covid_suspected_40_49", int), + Columndef("previous_day_admission_adult_covid_suspected_40-49_coverage", "previous_day_admission_adult_covid_suspected_40_49_coverage", int), + Columndef("previous_day_admission_adult_covid_suspected_50-59", "previous_day_admission_adult_covid_suspected_50_59", int), + Columndef("previous_day_admission_adult_covid_suspected_50-59_coverage", "previous_day_admission_adult_covid_suspected_50_59_coverage", int), + Columndef("previous_day_admission_adult_covid_suspected_60-69", "previous_day_admission_adult_covid_suspected_60_69", int), + Columndef("previous_day_admission_adult_covid_suspected_60-69_coverage", "previous_day_admission_adult_covid_suspected_60_69_coverage", int), + Columndef("previous_day_admission_adult_covid_suspected_70-79", "previous_day_admission_adult_covid_suspected_70_79", int), + Columndef("previous_day_admission_adult_covid_suspected_70-79_coverage", "previous_day_admission_adult_covid_suspected_70_79_coverage", int), + Columndef("previous_day_admission_adult_covid_suspected_80+", "previous_day_admission_adult_covid_suspected_80plus", int), + Columndef("previous_day_admission_adult_covid_suspected_80+_coverage", "previous_day_admission_adult_covid_suspected_80plus_coverage", int), + Columndef("previous_day_admission_adult_covid_suspected_coverage", "previous_day_admission_adult_covid_suspected_coverage", 
int), + Columndef("previous_day_admission_adult_covid_suspected_unknown", "previous_day_admission_adult_covid_suspected_unknown", int), + Columndef("previous_day_admission_adult_covid_suspected_unknown_coverage", "previous_day_admission_adult_covid_suspected_unknown_coverage", int), + Columndef("previous_day_admission_influenza_confirmed", "previous_day_admission_influenza_confirmed", int), + Columndef("previous_day_admission_influenza_confirmed_coverage", "previous_day_admission_influenza_confirmed_coverage", int), + Columndef("previous_day_admission_pediatric_covid_confirmed", "previous_day_admission_pediatric_covid_confirmed", int), + Columndef("previous_day_admission_pediatric_covid_confirmed_coverage", "previous_day_admission_pediatric_covid_confirmed_coverage", int), + Columndef("previous_day_admission_pediatric_covid_suspected", "previous_day_admission_pediatric_covid_suspected", int), + Columndef("previous_day_admission_pediatric_covid_suspected_coverage", "previous_day_admission_pediatric_covid_suspected_coverage", int), + Columndef("previous_day_deaths_covid_and_influenza", "previous_day_deaths_covid_and_influenza", int), + Columndef("previous_day_deaths_covid_and_influenza_coverage", "previous_day_deaths_covid_and_influenza_coverage", int), + Columndef("previous_day_deaths_influenza", "previous_day_deaths_influenza", int), + Columndef("previous_day_deaths_influenza_coverage", "previous_day_deaths_influenza_coverage", int), + Columndef("previous_week_therapeutic_a_casirivimab_imdevimab_courses_used", "previous_week_therapeutic_a_casirivimab_imdevimab_courses_used", int), + Columndef("previous_week_therapeutic_b_bamlanivimab_courses_used", "previous_week_therapeutic_b_bamlanivimab_courses_used", int), + Columndef("previous_week_therapeutic_c_bamlanivimab_etesevimab_courses_used", "previous_week_therapeutic_c_bamlanivimab_etesevimab_courses_used", int), + Columndef("staffed_adult_icu_bed_occupancy", "staffed_adult_icu_bed_occupancy", int), + Columndef("staffed_adult_icu_bed_occupancy_coverage", "staffed_adult_icu_bed_occupancy_coverage", int), + Columndef("staffed_icu_adult_patients_confirmed_and_suspected_covid", "staffed_icu_adult_patients_confirmed_suspected_covid", int), + Columndef("staffed_icu_adult_patients_confirmed_and_suspected_covid_coverage", "staffed_icu_adult_patients_confirmed_suspected_covid_coverage", int), + Columndef("staffed_icu_adult_patients_confirmed_covid", "staffed_icu_adult_patients_confirmed_covid", int), + Columndef("staffed_icu_adult_patients_confirmed_covid_coverage", "staffed_icu_adult_patients_confirmed_covid_coverage", int), + Columndef("total_adult_patients_hospitalized_confirmed_and_suspected_covid", "total_adult_patients_hosp_confirmed_suspected_covid", int), + Columndef("total_adult_patients_hospitalized_confirmed_and_suspected_covid_coverage", "total_adult_patients_hosp_confirmed_suspected_covid_coverage", int), + Columndef("total_adult_patients_hospitalized_confirmed_covid", "total_adult_patients_hosp_confirmed_covid", int), + Columndef("total_adult_patients_hospitalized_confirmed_covid_coverage", "total_adult_patients_hosp_confirmed_covid_coverage", int), + Columndef("total_patients_hospitalized_confirmed_influenza", "total_patients_hospitalized_confirmed_influenza", int), + Columndef("total_patients_hospitalized_confirmed_influenza_coverage", "total_patients_hospitalized_confirmed_influenza_coverage", int), + Columndef("total_patients_hospitalized_confirmed_influenza_covid", "total_patients_hospitalized_confirmed_influenza_covid", int), + 
Columndef("total_patients_hospitalized_confirmed_influenza_covid_coverage", "total_patients_hospitalized_confirmed_influenza_covid_coverage", int), + Columndef("total_pediatric_patients_hospitalized_confirmed_and_suspected_covid", "total_pediatric_patients_hosp_confirmed_suspected_covid", int), + Columndef("total_pediatric_patients_hospitalized_confirmed_and_suspected_covid_coverage", "total_pediatric_patients_hosp_confirmed_suspected_covid_coverage", int), + Columndef("total_pediatric_patients_hospitalized_confirmed_covid", "total_pediatric_patients_hosp_confirmed_covid", int), + Columndef("total_pediatric_patients_hospitalized_confirmed_covid_coverage", "total_pediatric_patients_hosp_confirmed_covid_coverage", int), + Columndef("total_staffed_adult_icu_beds", "total_staffed_adult_icu_beds", int), + Columndef("total_staffed_adult_icu_beds_coverage", "total_staffed_adult_icu_beds_coverage", int), + ] - def __init__(self, *args, **kwargs): - super().__init__( - *args, - **kwargs, - table_name=Database.TABLE_NAME, - hhs_dataset_id=Network.DATASET_ID, - columns_and_types=Database.ORDERED_CSV_COLUMNS, - key_columns=Database.KEY_COLS, - additional_fields=[Columndef('T', 'record_type', None)]) + def __init__(self, *args, **kwargs): + super().__init__( + *args, + **kwargs, + table_name=Database.TABLE_NAME, + hhs_dataset_id=Network.DATASET_ID, + columns_and_types=Database.ORDERED_CSV_COLUMNS, + key_columns=Database.KEY_COLS, + additional_fields=[Columndef("T", "record_type", None)], + ) diff --git a/src/acquisition/covid_hosp/state_timeseries/network.py b/src/acquisition/covid_hosp/state_timeseries/network.py index 7bd5082a8..ff53900db 100644 --- a/src/acquisition/covid_hosp/state_timeseries/network.py +++ b/src/acquisition/covid_hosp/state_timeseries/network.py @@ -4,14 +4,13 @@ class Network(BaseNetwork): - DATASET_ID = 'g62h-syeh' - METADATA_ID = 'qqte-vkut' + DATASET_ID = "g62h-syeh" + METADATA_ID = "qqte-vkut" - def fetch_metadata(*args, **kwags): - """Download and return metadata. + def fetch_metadata(*args, **kwags): + """Download and return metadata. - See `fetch_metadata_for_dataset`. - """ + See `fetch_metadata_for_dataset`. + """ - return Network.fetch_metadata_for_dataset( - *args, **kwags, dataset_id=Network.METADATA_ID) + return Network.fetch_metadata_for_dataset(*args, **kwags, dataset_id=Network.METADATA_ID) diff --git a/src/acquisition/covid_hosp/state_timeseries/update.py b/src/acquisition/covid_hosp/state_timeseries/update.py index 7c8e79941..873de218b 100644 --- a/src/acquisition/covid_hosp/state_timeseries/update.py +++ b/src/acquisition/covid_hosp/state_timeseries/update.py @@ -11,17 +11,16 @@ class Update: + def run(network=Network): + """Acquire the most recent dataset, unless it was previously acquired. - def run(network=Network): - """Acquire the most recent dataset, unless it was previously acquired. + Returns + ------- + bool + Whether a new dataset was acquired. + """ - Returns - ------- - bool - Whether a new dataset was acquired. 
- """ - - return Utils.update_dataset(Database, network) + return Utils.update_dataset(Database, network) # main entry point diff --git a/src/acquisition/covidcast_nowcast/load_sensors.py b/src/acquisition/covidcast_nowcast/load_sensors.py index 73ce7eee5..6a181d236 100644 --- a/src/acquisition/covidcast_nowcast/load_sensors.py +++ b/src/acquisition/covidcast_nowcast/load_sensors.py @@ -82,8 +82,7 @@ def load_and_prepare_file(filepath: str, attributes: PathDetails) -> pd.DataFram def _move_after_processing(filepath, success): archive_dir = SUCCESS_DIR if success else FAIL_DIR - new_dir = os.path.dirname(filepath).replace( - "receiving", archive_dir) + new_dir = os.path.dirname(filepath).replace("receiving", archive_dir) os.makedirs(new_dir, exist_ok=True) move(filepath, filepath.replace("receiving", archive_dir)) print(f"{filepath} moved to {archive_dir}") @@ -96,10 +95,12 @@ def method(table, conn, keys, data_iter): meta, # specify lag column explicitly; lag is a reserved word sqlalchemy doesn't know about sqlalchemy.Column("lag", sqlalchemy.Integer, quote=True), - autoload=True) + autoload=True, + ) insert_stmt = sqlalchemy.dialects.mysql.insert(sql_table).values([dict(zip(keys, data)) for data in data_iter]) upsert_stmt = insert_stmt.on_duplicate_key_update({x.name: x for x in insert_stmt.inserted}) conn.execute(upsert_stmt) + return method diff --git a/src/acquisition/ecdc/ecdc_db_update.py b/src/acquisition/ecdc/ecdc_db_update.py index 63689c1d5..2a951b724 100644 --- a/src/acquisition/ecdc/ecdc_db_update.py +++ b/src/acquisition/ecdc/ecdc_db_update.py @@ -33,9 +33,8 @@ import argparse import datetime import glob -import subprocess -import random import os +import tempfile # third party import mysql.connector @@ -46,12 +45,14 @@ from delphi.utils.epiweek import delta_epiweeks from delphi.utils.epidate import EpiDate + def ensure_tables_exist(): - (u,p) = secrets.db.epi - cnx = mysql.connector.connect(user=u,password=p,database='epidata') + (u, p) = secrets.db.epi + cnx = mysql.connector.connect(user=u, password=p, database="epidata") try: cursor = cnx.cursor() - cursor.execute(''' + cursor.execute( + """ CREATE TABLE IF NOT EXISTS `ecdc_ili` ( `id` INT(11) NOT NULL PRIMARY KEY AUTO_INCREMENT, `release_date` DATE NOT NULL, @@ -62,58 +63,63 @@ def ensure_tables_exist(): `incidence_rate` DOUBLE NOT NULL, UNIQUE KEY (`issue`, `epiweek`, `region`) ); - '''); + """ + ) cnx.commit() finally: cnx.close() + def safe_float(f): try: - return float(f.replace(',','')) + return float(f.replace(",", "")) except: return 0 + def safe_int(i): try: - return int(i.replace(',','')) + return int(i.replace(",", "")) except: return 0 -def get_rows(cnx, table='ecdc_ili'): - # Count and return the number of rows in the `ecdc_ili` table. - select = cnx.cursor() - select.execute('SELECT count(1) num FROM %s' % table) - for (num,) in select: - pass - select.close() - return num + +def get_rows(cnx, table="ecdc_ili"): + # Count and return the number of rows in the `ecdc_ili` table. + select = cnx.cursor() + select.execute("SELECT count(1) num FROM %s" % table) + for (num,) in select: + pass + select.close() + return num + def update_from_file(issue, date, dir, test_mode=False): # Read ECDC data from CSVs and insert into (or update) the database. 
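The body below builds one INSERT ... ON DUPLICATE KEY UPDATE statement per CSV row, keyed on the unique (`issue`, `epiweek`, `region`) index created by ensure_tables_exist(), so re-running an acquisition refreshes `release_date` and `incidence_rate` instead of inserting duplicate rows. A minimal sketch of that upsert pattern, assuming the `ecdc_ili` schema above and an already-open mysql.connector connection `cnx` (the helper name `upsert_ili_row` is illustrative, not part of this module):

    def upsert_ili_row(cnx, release_date, issue, epiweek, region, lag, incidence_rate):
        # one parameterized statement per row; the unique key on
        # (issue, epiweek, region) turns repeated loads into updates
        sql = """
            INSERT INTO `ecdc_ili`
                (`release_date`, `issue`, `epiweek`, `region`, `lag`, `incidence_rate`)
            VALUES (%s, %s, %s, %s, %s, %s)
            ON DUPLICATE KEY UPDATE
                `release_date` = LEAST(`release_date`, VALUES(`release_date`)),
                `incidence_rate` = VALUES(`incidence_rate`)
        """
        cur = cnx.cursor()
        cur.execute(sql, (release_date, issue, epiweek, region, lag, incidence_rate))
        cur.close()

The get_rows() calls before and after the load exist for the same reason: the row-count delta reports how many rows were genuinely new rather than merely refreshed.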
# database connection u, p = secrets.db.epi - cnx = mysql.connector.connect(user=u, password=p, database='epidata') - rows1 = get_rows(cnx, 'ecdc_ili') - print('rows before: %d' % (rows1)) + cnx = mysql.connector.connect(user=u, password=p, database="epidata") + rows1 = get_rows(cnx, "ecdc_ili") + print("rows before: %d" % (rows1)) insert = cnx.cursor() # load the data, ignoring empty rows - files = glob.glob(os.path.join(dir,"*.csv")) + files = glob.glob(os.path.join(dir, "*.csv")) rows = [] for filename in files: - with open(filename,'r') as f: + with open(filename, "r") as f: for l in f: - data = list(map(lambda s: s.strip().replace('"',''),l.split(','))) + data = list(map(lambda s: s.strip().replace('"', ""), l.split(","))) row = {} - row['epiweek'] = int(data[1][:4] + data[1][5:]) - row['region'] = data[4] - row['incidence_rate'] = data[3] + row["epiweek"] = int(data[1][:4] + data[1][5:]) + row["region"] = data[4] + row["incidence_rate"] = data[3] rows.append(row) - print(' loaded %d rows' % len(rows)) + print(" loaded %d rows" % len(rows)) entries = [obj for obj in rows if obj] - print(' found %d entries' % len(entries)) + print(" found %d entries" % len(entries)) - sql = ''' + sql = """ INSERT INTO `ecdc_ili` (`release_date`, `issue`, `epiweek`, `region`, `lag`, `incidence_rate`) @@ -122,13 +128,13 @@ def update_from_file(issue, date, dir, test_mode=False): ON DUPLICATE KEY UPDATE `release_date` = least(`release_date`, '%s'), `incidence_rate` = %s - ''' + """ for row in entries: - lag = delta_epiweeks(row['epiweek'], issue) - data_args = [row['incidence_rate']] + lag = delta_epiweeks(row["epiweek"], issue) + data_args = [row["incidence_rate"]] - insert_args = [date,issue,row['epiweek'],row['region'],lag] + data_args + insert_args = [date, issue, row["epiweek"], row["region"], lag] + data_args update_args = [date] + data_args try: insert.execute(sql % tuple(insert_args + update_args)) @@ -138,39 +144,28 @@ def update_from_file(issue, date, dir, test_mode=False): # cleanup insert.close() if test_mode: - print('test mode, not committing') + print("test mode, not committing") rows2 = rows1 else: cnx.commit() rows2 = get_rows(cnx) - print('rows after: %d (added %d)' % (rows2,rows2-rows1)) + print("rows after: %d (added %d)" % (rows2, rows2 - rows1)) cnx.close() + def main(): # args and usage parser = argparse.ArgumentParser() - parser.add_argument( - '--test', - action='store_true', - help='do dry run only, do not update the database' - ) - parser.add_argument( - '--file', - type=str, - help='load an existing zip file (otherwise fetch current data)' - ) - parser.add_argument( - '--issue', - type=int, - help='issue of the file (e.g. 201740); used iff --file is given' - ) + parser.add_argument("--test", action="store_true", help="do dry run only, do not update the database") + parser.add_argument("--file", type=str, help="load an existing zip file (otherwise fetch current data)") + parser.add_argument("--issue", type=int, help="issue of the file (e.g. 
201740); used iff --file is given") args = parser.parse_args() if (args.file is None) != (args.issue is None): - raise Exception('--file and --issue must both be present or absent') + raise Exception("--file and --issue must both be present or absent") - date = datetime.datetime.now().strftime('%Y-%m-%d') - print('assuming release date is today, %s' % date) + date = datetime.datetime.now().strftime("%Y-%m-%d") + print("assuming release date is today, %s" % date) ensure_tables_exist() if args.file: @@ -182,29 +177,26 @@ def main(): max_tries = 5 while flag < max_tries: flag = flag + 1 - tmp_dir = ''.join(random.choice('0123456789abcdefghijklmnopqrstuvwxyz') for i in range(8)) - tmp_dir = 'downloads_' + tmp_dir - subprocess.call(["mkdir",tmp_dir]) - # Use temporary directory to avoid data from different time - # downloaded to same folder - download_ecdc_data(download_dir=tmp_dir) - issue = EpiDate.today().get_ew() - files = glob.glob('%s/*.csv' % tmp_dir) - for filename in files: - with open(filename,'r') as f: - _ = f.readline() - db_error = False - for filename in files: - try: - update_from_file(issue, date, filename, test_mode=args.test) - subprocess.call(["rm",filename]) - except: - db_error = True - subprocess.call(["rm","-r",tmp_dir]) - if not db_error: - break # Exit loop with success + with tempfile.TemporaryDirectory() as tmp_dir: + # Use temporary directory to avoid data from different time + # downloaded to same folder + download_ecdc_data(download_dir=tmp_dir) + issue = EpiDate.today().get_ew() + files = glob.glob(f"{tmp_dir}/*.csv") + for filename in files: + with open(filename, "r") as f: + _ = f.readline() + db_error = False + for filename in files: + try: + update_from_file(issue, date, filename, test_mode=args.test) + except: + db_error = True + if not db_error: + break # Exit loop with success if flag >= max_tries: - print('WARNING: Database `ecdc_ili` did not update successfully') + print("WARNING: Database `ecdc_ili` did not update successfully") + -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/src/acquisition/ecdc/ecdc_ili.py b/src/acquisition/ecdc/ecdc_ili.py index 1dd0505d1..bf11b9611 100644 --- a/src/acquisition/ecdc/ecdc_ili.py +++ b/src/acquisition/ecdc/ecdc_ili.py @@ -11,60 +11,60 @@ from bs4 import BeautifulSoup from selenium import webdriver from selenium.webdriver.support.ui import Select -from selenium.webdriver.support.ui import WebDriverWait +from selenium.webdriver.support.ui import WebDriverWait from selenium.webdriver.common.by import By from selenium.webdriver.support import expected_conditions as EC -def download_ecdc_data(download_dir = "downloads"): - url = 'https://flunewseurope.org/PrimaryCareData' +def download_ecdc_data(download_dir="downloads"): + url = "https://flunewseurope.org/PrimaryCareData" resp = requests.get(url) - soup = BeautifulSoup(resp.content, 'lxml') - mydivs = soup.findAll('div') + soup = BeautifulSoup(resp.content, "lxml") + mydivs = soup.findAll("div") for div in mydivs: dic = div.attrs - if dic.get('class')== ['graph-container'] and dic.get('id')== 'dinfl06': + if dic.get("class") == ["graph-container"] and dic.get("id") == "dinfl06": break # get new url of the ILI chunck - url = div.contents[1].attrs['src'] + url = div.contents[1].attrs["src"] opts = webdriver.firefox.options.Options() opts.set_headless() fp = webdriver.FirefoxProfile() - fp.set_preference("browser.download.folderList",2) - fp.set_preference("browser.download.manager.showWhenStarting",False) - 
fp.set_preference("browser.download.dir",os.path.abspath(download_dir)) - fp.set_preference("browser.helperApps.neverAsk.saveToDisk","text/csv") + fp.set_preference("browser.download.folderList", 2) + fp.set_preference("browser.download.manager.showWhenStarting", False) + fp.set_preference("browser.download.dir", os.path.abspath(download_dir)) + fp.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/csv") try: - driver = webdriver.Firefox(options=opts,firefox_profile=fp) + driver = webdriver.Firefox(options=opts, firefox_profile=fp) driver.get(url) for i in range(2, 54): # select country try: - WebDriverWait(driver,30).until(EC.element_to_be_clickable((By.ID,'fluNewsReportViewer_ctl04_ctl03_ddValue'))) - Select(driver.find_element_by_tag_name('select')).select_by_value(str(i)) + WebDriverWait(driver, 30).until(EC.element_to_be_clickable((By.ID, "fluNewsReportViewer_ctl04_ctl03_ddValue"))) + Select(driver.find_element_by_tag_name("select")).select_by_value(str(i)) time.sleep(3) - soup = BeautifulSoup(driver.page_source, 'html.parser') - options = soup.select('#fluNewsReportViewer_ctl04_ctl05_ddValue')[0].find_all('option') + soup = BeautifulSoup(driver.page_source, "html.parser") + options = soup.select("#fluNewsReportViewer_ctl04_ctl05_ddValue")[0].find_all("option") ind = 1 for j in range(len(options)): - if 'ILI' in str(options[j]): - pattern = re.compile(r'\d+') + if "ILI" in str(options[j]): + pattern = re.compile(r"\d+") ind = re.findall(pattern, str(options[j]))[0] break if type(ind) == str: # select clinical tyle - WebDriverWait(driver,30).until(EC.element_to_be_clickable((By.ID,'fluNewsReportViewer_ctl04_ctl05_ddValue'))) - Select(driver.find_element_by_id('fluNewsReportViewer_ctl04_ctl05_ddValue')).select_by_value(ind) - WebDriverWait(driver,30).until(EC.element_to_be_clickable((By.ID,'btnSelectExportType'))) - driver.find_element_by_id('btnSelectExportType').click() - WebDriverWait(driver,30).until(EC.element_to_be_clickable((By.ID,'btnExportToCsv'))) - driver.find_element_by_id('btnExportToCsv').click() + WebDriverWait(driver, 30).until(EC.element_to_be_clickable((By.ID, "fluNewsReportViewer_ctl04_ctl05_ddValue"))) + Select(driver.find_element_by_id("fluNewsReportViewer_ctl04_ctl05_ddValue")).select_by_value(ind) + WebDriverWait(driver, 30).until(EC.element_to_be_clickable((By.ID, "btnSelectExportType"))) + driver.find_element_by_id("btnSelectExportType").click() + WebDriverWait(driver, 30).until(EC.element_to_be_clickable((By.ID, "btnExportToCsv"))) + driver.find_element_by_id("btnExportToCsv").click() time.sleep(3) except: driver.get(url) except: - print('WARNING: ECDC Scraper may not have downloaded all of the available data.') - #cleanup - os.system('''pkill "firefox" ''') + print("WARNING: ECDC Scraper may not have downloaded all of the available data.") + # cleanup + os.system("""pkill "firefox" """) os.system('''pkill "(firefox-bin)"''') os.system('''pkill "geckodriver*"''') diff --git a/src/acquisition/flusurv/flusurv.py b/src/acquisition/flusurv/flusurv.py index 6b8d247ae..1e534b740 100644 --- a/src/acquisition/flusurv/flusurv.py +++ b/src/acquisition/flusurv/flusurv.py @@ -50,167 +50,170 @@ # all currently available FluSurv locations and their associated codes # the number pair represents NetworkID and CatchmentID location_codes = { - 'CA': (2, 1), - 'CO': (2, 2), - 'CT': (2, 3), - 'GA': (2, 4), - 'IA': (3, 5), - 'ID': (3, 6), - 'MD': (2, 7), - 'MI': (3, 8), - 'MN': (2, 9), - 'NM': (2, 11), - 'NY_albany': (2, 13), - 'NY_rochester': (2, 14), - 'OH': (3, 15), 
- 'OK': (3, 16), - 'OR': (2, 17), - 'RI': (3, 18), - 'SD': (3, 19), - 'TN': (2, 20), - 'UT': (3, 21), - 'network_all': (1, 22), - 'network_eip': (2, 22), - 'network_ihsp': (3, 22), + "CA": (2, 1), + "CO": (2, 2), + "CT": (2, 3), + "GA": (2, 4), + "IA": (3, 5), + "ID": (3, 6), + "MD": (2, 7), + "MI": (3, 8), + "MN": (2, 9), + "NM": (2, 11), + "NY_albany": (2, 13), + "NY_rochester": (2, 14), + "OH": (3, 15), + "OK": (3, 16), + "OR": (2, 17), + "RI": (3, 18), + "SD": (3, 19), + "TN": (2, 20), + "UT": (3, 21), + "network_all": (1, 22), + "network_eip": (2, 22), + "network_ihsp": (3, 22), } def fetch_json(path, payload, call_count=1, requests_impl=requests): - """Send a request to the server and return the parsed JSON response.""" - - # it's polite to self-identify this "bot" - delphi_url = 'https://delphi.cmu.edu/index.html' - user_agent = 'Mozilla/5.0 (compatible; delphibot/1.0; +%s)' % delphi_url - - # the FluSurv AMF server - flusurv_url = 'https://gis.cdc.gov/GRASP/Flu3/' + path - - # request headers - headers = { - 'Accept-Encoding': 'gzip', - 'User-Agent': user_agent, - } - if payload is not None: - headers['Content-Type'] = 'application/json;charset=UTF-8' - - # send the request and read the response - if payload is None: - method = requests_impl.get - data = None - else: - method = requests_impl.post - data = json.dumps(payload) - resp = method(flusurv_url, headers=headers, data=data) - - # check the HTTP status code - if resp.status_code == 500 and call_count <= 2: - # the server often fails with this status, so wait and retry - delay = 10 * call_count - print('got status %d, will retry in %d sec...' % (resp.status_code, delay)) - time.sleep(delay) - return fetch_json(path, payload, call_count=call_count + 1) - elif resp.status_code != 200: - raise Exception(['status code != 200', resp.status_code]) - - # check response mime type - if 'application/json' not in resp.headers.get('Content-Type', ''): - raise Exception('response is not json') - - # return the decoded json object - return resp.json() + """Send a request to the server and return the parsed JSON response.""" + + # it's polite to self-identify this "bot" + delphi_url = "https://delphi.cmu.edu/index.html" + user_agent = "Mozilla/5.0 (compatible; delphibot/1.0; +%s)" % delphi_url + + # the FluSurv AMF server + flusurv_url = "https://gis.cdc.gov/GRASP/Flu3/" + path + + # request headers + headers = { + "Accept-Encoding": "gzip", + "User-Agent": user_agent, + } + if payload is not None: + headers["Content-Type"] = "application/json;charset=UTF-8" + + # send the request and read the response + if payload is None: + method = requests_impl.get + data = None + else: + method = requests_impl.post + data = json.dumps(payload) + resp = method(flusurv_url, headers=headers, data=data) + + # check the HTTP status code + if resp.status_code == 500 and call_count <= 2: + # the server often fails with this status, so wait and retry + delay = 10 * call_count + print("got status %d, will retry in %d sec..." 
% (resp.status_code, delay)) + time.sleep(delay) + return fetch_json(path, payload, call_count=call_count + 1) + elif resp.status_code != 200: + raise Exception(["status code != 200", resp.status_code]) + + # check response mime type + if "application/json" not in resp.headers.get("Content-Type", ""): + raise Exception("response is not json") + + # return the decoded json object + return resp.json() def fetch_flusurv_object(location_code): - """Return decoded FluSurv JSON object for the given location.""" - return fetch_json('PostPhase03GetData', { - 'appversion': 'Public', - 'networkid': location_code[0], - 'cacthmentid': location_code[1], - }) + """Return decoded FluSurv JSON object for the given location.""" + return fetch_json( + "PostPhase03GetData", + { + "appversion": "Public", + "networkid": location_code[0], + "cacthmentid": location_code[1], + }, + ) def mmwrid_to_epiweek(mmwrid): - """Convert a CDC week index into an epiweek.""" + """Convert a CDC week index into an epiweek.""" - # Add the difference in IDs, which are sequential, to a reference epiweek, - # which is 2003w40 in this case. - epiweek_200340 = EpiDate(2003, 9, 28) - mmwrid_200340 = 2179 - return epiweek_200340.add_weeks(mmwrid - mmwrid_200340).get_ew() + # Add the difference in IDs, which are sequential, to a reference epiweek, + # which is 2003w40 in this case. + epiweek_200340 = EpiDate(2003, 9, 28) + mmwrid_200340 = 2179 + return epiweek_200340.add_weeks(mmwrid - mmwrid_200340).get_ew() def extract_from_object(data_in): - """ - Given a FluSurv data object, return hospitaliation rates. - - The returned object is indexed first by epiweek, then by zero-indexed age - group. - """ - - # an object to hold the result - data_out = {} - - # iterate over all seasons and age groups - for obj in data_in['busdata']['dataseries']: - if obj['age'] in (10, 11, 12): - # TODO(https://github.com/cmu-delphi/delphi-epidata/issues/242): - # capture as-of-yet undefined age groups 10, 11, and 12 - continue - age_index = obj['age'] - 1 - # iterage over weeks - for mmwrid, _, _, rate in obj['data']: - epiweek = mmwrid_to_epiweek(mmwrid) - if epiweek not in data_out: - # weekly rate of each age group - data_out[epiweek] = [None] * 9 - prev_rate = data_out[epiweek][age_index] - if prev_rate is None: - # this is the first time to see a rate for this epiweek/age - data_out[epiweek][age_index] = rate - elif prev_rate != rate: - # a different rate was already found for this epiweek/age - format_args = (epiweek, obj['age'], prev_rate, rate) - print('warning: %d %d %f != %f' % format_args) - - # sanity check the result - if len(data_out) == 0: - raise Exception('no data found') - - # print the result and return flu data - print('found data for %d weeks' % len(data_out)) - return data_out + """ + Given a FluSurv data object, return hospitaliation rates. + + The returned object is indexed first by epiweek, then by zero-indexed age + group. 
+ """ + + # an object to hold the result + data_out = {} + + # iterate over all seasons and age groups + for obj in data_in["busdata"]["dataseries"]: + if obj["age"] in (10, 11, 12): + # TODO(https://github.com/cmu-delphi/delphi-epidata/issues/242): + # capture as-of-yet undefined age groups 10, 11, and 12 + continue + age_index = obj["age"] - 1 + # iterage over weeks + for mmwrid, _, _, rate in obj["data"]: + epiweek = mmwrid_to_epiweek(mmwrid) + if epiweek not in data_out: + # weekly rate of each age group + data_out[epiweek] = [None] * 9 + prev_rate = data_out[epiweek][age_index] + if prev_rate is None: + # this is the first time to see a rate for this epiweek/age + data_out[epiweek][age_index] = rate + elif prev_rate != rate: + # a different rate was already found for this epiweek/age + format_args = (epiweek, obj["age"], prev_rate, rate) + print("warning: %d %d %f != %f" % format_args) + + # sanity check the result + if len(data_out) == 0: + raise Exception("no data found") + + # print the result and return flu data + print("found data for %d weeks" % len(data_out)) + return data_out def get_data(location_code): - """ - Fetch and parse flu data for the given location. + """ + Fetch and parse flu data for the given location. - This method performs the following operations: - - fetches FluSurv data from CDC - - extracts and returns hospitaliation rates - """ + This method performs the following operations: + - fetches FluSurv data from CDC + - extracts and returns hospitaliation rates + """ - # fetch - print('[fetching flusurv data...]') - data_in = fetch_flusurv_object(location_code) + # fetch + print("[fetching flusurv data...]") + data_in = fetch_flusurv_object(location_code) - # extract - print('[extracting values...]') - data_out = extract_from_object(data_in) + # extract + print("[extracting values...]") + data_out = extract_from_object(data_in) - # return - print('[scraped successfully]') - return data_out + # return + print("[scraped successfully]") + return data_out def get_current_issue(): - """Scrape the current issue from the FluSurv main page.""" + """Scrape the current issue from the FluSurv main page.""" - # fetch - data = fetch_json('GetPhase03InitApp?appVersion=Public', None) + # fetch + data = fetch_json("GetPhase03InitApp?appVersion=Public", None) - # extract - date = datetime.strptime(data['loaddatetime'], '%b %d, %Y') + # extract + date = datetime.strptime(data["loaddatetime"], "%b %d, %Y") - # convert and return - return EpiDate(date.year, date.month, date.day).get_ew() + # convert and return + return EpiDate(date.year, date.month, date.day).get_ew() diff --git a/src/acquisition/flusurv/flusurv_update.py b/src/acquisition/flusurv/flusurv_update.py index 35fadba05..0715bba37 100644 --- a/src/acquisition/flusurv/flusurv_update.py +++ b/src/acquisition/flusurv/flusurv_update.py @@ -82,108 +82,101 @@ def get_rows(cur): - """Return the number of rows in the `flusurv` table.""" + """Return the number of rows in the `flusurv` table.""" - # count all rows - cur.execute('SELECT count(1) `num` FROM `flusurv`') - for (num,) in cur: - return num + # count all rows + cur.execute("SELECT count(1) `num` FROM `flusurv`") + for (num,) in cur: + return num def update(issue, location_name, test_mode=False): - """Fetch and store the currently avialble weekly FluSurv dataset.""" - - # fetch data - location_code = flusurv.location_codes[location_name] - print('fetching data for', location_name, location_code) - data = flusurv.get_data(location_code) - - # metadata - epiweeks = 
sorted(data.keys()) - location = location_name - release_date = str(EpiDate.today()) - - # connect to the database - u, p = secrets.db.epi - cnx = mysql.connector.connect( - host=secrets.db.host, user=u, password=p, database='epidata') - cur = cnx.cursor() - rows1 = get_rows(cur) - print('rows before: %d' % rows1) - - # SQL for insert/update - sql = ''' - INSERT INTO `flusurv` ( - `release_date`, `issue`, `epiweek`, `location`, `lag`, `rate_age_0`, - `rate_age_1`, `rate_age_2`, `rate_age_3`, `rate_age_4`, `rate_overall`, - `rate_age_5`, `rate_age_6`, `rate_age_7` - ) - VALUES ( - %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s - ) - ON DUPLICATE KEY UPDATE - `release_date` = least(`release_date`, %s), - `rate_age_0` = coalesce(%s, `rate_age_0`), - `rate_age_1` = coalesce(%s, `rate_age_1`), - `rate_age_2` = coalesce(%s, `rate_age_2`), - `rate_age_3` = coalesce(%s, `rate_age_3`), - `rate_age_4` = coalesce(%s, `rate_age_4`), - `rate_overall` = coalesce(%s, `rate_overall`), - `rate_age_5` = coalesce(%s, `rate_age_5`), - `rate_age_6` = coalesce(%s, `rate_age_6`), - `rate_age_7` = coalesce(%s, `rate_age_7`) - ''' - - # insert/update each row of data (one per epiweek) - for epiweek in epiweeks: - lag = delta_epiweeks(epiweek, issue) - if lag > 52: - # Ignore values older than one year, as (1) they are assumed not to - # change, and (2) it would adversely affect database performance if all - # values (including duplicates) were stored on each run. - continue - args_meta = [release_date, issue, epiweek, location, lag] - args_insert = data[epiweek] - args_update = [release_date] + data[epiweek] - cur.execute(sql, tuple(args_meta + args_insert + args_update)) - - # commit and disconnect - rows2 = get_rows(cur) - print('rows after: %d (+%d)' % (rows2, rows2 - rows1)) - cur.close() - if test_mode: - print('test mode: not committing database changes') - else: - cnx.commit() - cnx.close() + """Fetch and store the currently avialble weekly FluSurv dataset.""" + + # fetch data + location_code = flusurv.location_codes[location_name] + print("fetching data for", location_name, location_code) + data = flusurv.get_data(location_code) + + # metadata + epiweeks = sorted(data.keys()) + location = location_name + release_date = str(EpiDate.today()) + + # connect to the database + u, p = secrets.db.epi + cnx = mysql.connector.connect(host=secrets.db.host, user=u, password=p, database="epidata") + cur = cnx.cursor() + rows1 = get_rows(cur) + print("rows before: %d" % rows1) + + # SQL for insert/update + sql = """ + INSERT INTO `flusurv` ( + `release_date`, `issue`, `epiweek`, `location`, `lag`, `rate_age_0`, + `rate_age_1`, `rate_age_2`, `rate_age_3`, `rate_age_4`, `rate_overall`, + `rate_age_5`, `rate_age_6`, `rate_age_7` + ) + VALUES ( + %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s + ) + ON DUPLICATE KEY UPDATE + `release_date` = least(`release_date`, %s), + `rate_age_0` = coalesce(%s, `rate_age_0`), + `rate_age_1` = coalesce(%s, `rate_age_1`), + `rate_age_2` = coalesce(%s, `rate_age_2`), + `rate_age_3` = coalesce(%s, `rate_age_3`), + `rate_age_4` = coalesce(%s, `rate_age_4`), + `rate_overall` = coalesce(%s, `rate_overall`), + `rate_age_5` = coalesce(%s, `rate_age_5`), + `rate_age_6` = coalesce(%s, `rate_age_6`), + `rate_age_7` = coalesce(%s, `rate_age_7`) + """ + + # insert/update each row of data (one per epiweek) + for epiweek in epiweeks: + lag = delta_epiweeks(epiweek, issue) + if lag > 52: + # Ignore values older than one year, as (1) they are assumed not to + # change, and (2) it would 
adversely affect database performance if all + # values (including duplicates) were stored on each run. + continue + args_meta = [release_date, issue, epiweek, location, lag] + args_insert = data[epiweek] + args_update = [release_date] + data[epiweek] + cur.execute(sql, tuple(args_meta + args_insert + args_update)) + + # commit and disconnect + rows2 = get_rows(cur) + print("rows after: %d (+%d)" % (rows2, rows2 - rows1)) + cur.close() + if test_mode: + print("test mode: not committing database changes") + else: + cnx.commit() + cnx.close() def main(): - # args and usage - parser = argparse.ArgumentParser() - parser.add_argument( - 'location', - help='location for which data should be scraped (e.g. "CA" or "all")' - ) - parser.add_argument( - '--test', '-t', - default=False, action='store_true', help='do not commit database changes' - ) - args = parser.parse_args() - - # scrape current issue from the main page - issue = flusurv.get_current_issue() - print('current issue: %d' % issue) - - # fetch flusurv data - if args.location == 'all': - # all locations - for location in flusurv.location_codes.keys(): - update(issue, location, args.test) - else: - # single location - update(issue, args.location, args.test) - - -if __name__ == '__main__': - main() + # args and usage + parser = argparse.ArgumentParser() + parser.add_argument("location", help='location for which data should be scraped (e.g. "CA" or "all")') + parser.add_argument("--test", "-t", default=False, action="store_true", help="do not commit database changes") + args = parser.parse_args() + + # scrape current issue from the main page + issue = flusurv.get_current_issue() + print("current issue: %d" % issue) + + # fetch flusurv data + if args.location == "all": + # all locations + for location in flusurv.location_codes.keys(): + update(issue, location, args.test) + else: + # single location + update(issue, args.location, args.test) + + +if __name__ == "__main__": + main() diff --git a/src/acquisition/fluview/fluview.py b/src/acquisition/fluview/fluview.py index d723cbc59..a3298c4ce 100644 --- a/src/acquisition/fluview/fluview.py +++ b/src/acquisition/fluview/fluview.py @@ -34,183 +34,188 @@ class Key: - """ - Constants for navigating the metadata object contained in the web response - from CDC. - """ + """ + Constants for navigating the metadata object contained in the web response + from CDC. 
+ """ - class TierType: - nat = 'National' - hhs = 'HHS Regions' - cen = 'Census Divisions' - sta = 'State' + class TierType: + nat = "National" + hhs = "HHS Regions" + cen = "Census Divisions" + sta = "State" - class TierListEntry: - hhs = 'hhsregion' - cen = 'censusregions' - sta = 'states' + class TierListEntry: + hhs = "hhsregion" + cen = "censusregions" + sta = "states" - class TierIdEntry: - hhs = 'hhsregionid' - cen = 'censusregionid' - sta = 'stateid' + class TierIdEntry: + hhs = "hhsregionid" + cen = "censusregionid" + sta = "stateid" def check_status(resp, status, content_type): - """Raise an exception if the status code or content type is unexpected.""" - if resp.status_code != status: - raise Exception('got unexpected status code: ' + str(resp.status_code)) - actual_type = resp.headers.get('Content-Type', None) - if actual_type is None or content_type not in actual_type.lower(): - raise Exception('got unexpected content type: ' + str(actual_type)) + """Raise an exception if the status code or content type is unexpected.""" + if resp.status_code != status: + raise Exception("got unexpected status code: " + str(resp.status_code)) + actual_type = resp.headers.get("Content-Type", None) + if actual_type is None or content_type not in actual_type.lower(): + raise Exception("got unexpected content type: " + str(actual_type)) def fetch_metadata(sess): - """ - Return metadata indicating the current issue and also numeric constants - representing the various locations. - """ - url = 'https://gis.cdc.gov/grasp/flu2/GetPhase02InitApp?appVersion=Public' - resp = sess.get(url) - check_status(resp, 200, 'application/json') - return resp.json() + """ + Return metadata indicating the current issue and also numeric constants + representing the various locations. 
+ """ + url = "https://gis.cdc.gov/grasp/flu2/GetPhase02InitApp?appVersion=Public" + resp = sess.get(url) + check_status(resp, 200, "application/json") + return resp.json() def get_issue_and_locations(data): - """Extract the issue and per-tier location lists from the metadata object.""" - - def get_tier_ids(name): - for row in data['regiontypes']: - if row['description'] == name: - return row['regiontypeid'] - raise Exception() - - tier_ids = dict((name, get_tier_ids(name)) for name in ( - Key.TierType.nat, - Key.TierType.hhs, - Key.TierType.cen, - Key.TierType.sta, - )) - - location_ids = { - Key.TierType.nat: [0], - Key.TierType.hhs: [], - Key.TierType.cen: [], - Key.TierType.sta: [], - } - - # add location ids for HHS - for row in data[Key.TierListEntry.hhs]: - location_ids[Key.TierType.hhs].append(row[Key.TierIdEntry.hhs]) - location_ids[Key.TierType.hhs] = sorted(set(location_ids[Key.TierType.hhs])) - num = len(location_ids[Key.TierType.hhs]) - if num != 10: - raise Exception('expected 10 hhs regions, found %d' % num) - - # add location ids for census divisions - for row in data[Key.TierListEntry.cen]: - location_ids[Key.TierType.cen].append(row[Key.TierIdEntry.cen]) - location_ids[Key.TierType.cen] = sorted(set(location_ids[Key.TierType.cen])) - num = len(location_ids[Key.TierType.cen]) - if num != 9: - raise Exception('expected 9 census divisions, found %d' % num) - - # add location ids for states - for row in data[Key.TierListEntry.sta]: - location_ids[Key.TierType.sta].append(row[Key.TierIdEntry.sta]) - location_ids[Key.TierType.sta] = sorted(set(location_ids[Key.TierType.sta])) - num = len(location_ids[Key.TierType.sta]) - if num != 57: - raise Exception('expected 57 states/territories/cities, found %d' % num) - - # return a useful subset of the metadata - # (latest epiweek, latest season, tier ids, location ids) - return { - 'epiweek': data['mmwr'][-1]['yearweek'], - 'season_id': data['mmwr'][-1]['seasonid'], - 'tier_ids': tier_ids, - 'location_ids': location_ids, - } + """Extract the issue and per-tier location lists from the metadata object.""" + + def get_tier_ids(name): + for row in data["regiontypes"]: + if row["description"] == name: + return row["regiontypeid"] + raise Exception() + + tier_ids = dict( + (name, get_tier_ids(name)) + for name in ( + Key.TierType.nat, + Key.TierType.hhs, + Key.TierType.cen, + Key.TierType.sta, + ) + ) + + location_ids = { + Key.TierType.nat: [0], + Key.TierType.hhs: [], + Key.TierType.cen: [], + Key.TierType.sta: [], + } + + # add location ids for HHS + for row in data[Key.TierListEntry.hhs]: + location_ids[Key.TierType.hhs].append(row[Key.TierIdEntry.hhs]) + location_ids[Key.TierType.hhs] = sorted(set(location_ids[Key.TierType.hhs])) + num = len(location_ids[Key.TierType.hhs]) + if num != 10: + raise Exception("expected 10 hhs regions, found %d" % num) + + # add location ids for census divisions + for row in data[Key.TierListEntry.cen]: + location_ids[Key.TierType.cen].append(row[Key.TierIdEntry.cen]) + location_ids[Key.TierType.cen] = sorted(set(location_ids[Key.TierType.cen])) + num = len(location_ids[Key.TierType.cen]) + if num != 9: + raise Exception("expected 9 census divisions, found %d" % num) + + # add location ids for states + for row in data[Key.TierListEntry.sta]: + location_ids[Key.TierType.sta].append(row[Key.TierIdEntry.sta]) + location_ids[Key.TierType.sta] = sorted(set(location_ids[Key.TierType.sta])) + num = len(location_ids[Key.TierType.sta]) + if num != 57: + raise Exception("expected 57 states/territories/cities, found 
%d" % num) + + # return a useful subset of the metadata + # (latest epiweek, latest season, tier ids, location ids) + return { + "epiweek": data["mmwr"][-1]["yearweek"], + "season_id": data["mmwr"][-1]["seasonid"], + "tier_ids": tier_ids, + "location_ids": location_ids, + } def download_data(tier_id, location_ids, season_ids, filename): - """Download zipped ILINet data for the given locations and seasons.""" - - def get_entry(num, name=None): - return {'ID': num, 'Name': (name if name else num)} - - # download the data (in memory) - url = 'https://gis.cdc.gov/grasp/flu2/PostPhase02DataDownload' - data = { - 'AppVersion': 'Public', - 'DatasourceDT': [get_entry(1, 'ILINet'), get_entry(0, 'WHO_NREVSS')], - 'RegionTypeId': tier_id, - 'SubRegionsDT': [get_entry(loc) for loc in sorted(location_ids)], - 'SeasonsDT': [get_entry(season) for season in sorted(season_ids)], - } - resp = requests.post(url, json=data) - check_status(resp, 200, 'application/octet-stream') - payload = resp.content - - # save the data to file and return the file length - with open(filename, 'wb') as f: - f.write(payload) - return len(payload) + """Download zipped ILINet data for the given locations and seasons.""" + + def get_entry(num, name=None): + return {"ID": num, "Name": (name if name else num)} + + # download the data (in memory) + url = "https://gis.cdc.gov/grasp/flu2/PostPhase02DataDownload" + data = { + "AppVersion": "Public", + "DatasourceDT": [get_entry(1, "ILINet"), get_entry(0, "WHO_NREVSS")], + "RegionTypeId": tier_id, + "SubRegionsDT": [get_entry(loc) for loc in sorted(location_ids)], + "SeasonsDT": [get_entry(season) for season in sorted(season_ids)], + } + resp = requests.post(url, json=data) + check_status(resp, 200, "application/octet-stream") + payload = resp.content + + # save the data to file and return the file length + with open(filename, "wb") as f: + f.write(payload) + return len(payload) def save_latest(path=None): - """ - Save the latest two seasons of data for all locations, separately for each - location tier (i.e. national, HHS, census, and states). 
- """ - - # set up the session - sess = requests.session() - sess.headers.update({ - # it's polite to self-identify this "bot" - 'User-Agent': 'delphibot/1.0 (+https://delphi.cmu.edu/)', - }) - - # get metatdata - print('looking up ilinet metadata') - data = fetch_metadata(sess) - info = get_issue_and_locations(data) - issue = info['epiweek'] - print('current issue: %d' % issue) - - # establish timing - dt = datetime.datetime.now().strftime('%Y%m%d_%H%M%S') - current_season = info['season_id'] - seasons = [s for s in range(current_season - 1, current_season + 1)] - - # make the destination path if it doesn't already exist - if path is not None: - os.makedirs(path, exist_ok=True) - - # download the data file for each tier - files = [] - for delphi_name, cdc_name in ( - ('nat', Key.TierType.nat), - ('hhs', Key.TierType.hhs), - ('cen', Key.TierType.cen), - ('sta', Key.TierType.sta), - ): - name = 'ilinet_%s_%d_%s.zip' % (delphi_name, issue, dt) - if path is None: - filename = name - else: - filename = os.path.join(path, name) - tier_id = info['tier_ids'][cdc_name] - locations = info['location_ids'][cdc_name] - - # download and show timing information - print('downloading %s' % delphi_name) - t0 = time.time() - size = download_data(tier_id, locations, seasons, filename) - t1 = time.time() - - print(' saved %s (%d bytes in %.1f seconds)' % (filename, size, t1 - t0)) - files.append(filename) - - # return the current issue and the list of downloaded files - return issue, files + """ + Save the latest two seasons of data for all locations, separately for each + location tier (i.e. national, HHS, census, and states). + """ + + # set up the session + sess = requests.session() + sess.headers.update( + { + # it's polite to self-identify this "bot" + "User-Agent": "delphibot/1.0 (+https://delphi.cmu.edu/)", + } + ) + + # get metatdata + print("looking up ilinet metadata") + data = fetch_metadata(sess) + info = get_issue_and_locations(data) + issue = info["epiweek"] + print("current issue: %d" % issue) + + # establish timing + dt = datetime.datetime.now().strftime("%Y%m%d_%H%M%S") + current_season = info["season_id"] + seasons = [s for s in range(current_season - 1, current_season + 1)] + + # make the destination path if it doesn't already exist + if path is not None: + os.makedirs(path, exist_ok=True) + + # download the data file for each tier + files = [] + for delphi_name, cdc_name in ( + ("nat", Key.TierType.nat), + ("hhs", Key.TierType.hhs), + ("cen", Key.TierType.cen), + ("sta", Key.TierType.sta), + ): + name = "ilinet_%s_%d_%s.zip" % (delphi_name, issue, dt) + if path is None: + filename = name + else: + filename = os.path.join(path, name) + tier_id = info["tier_ids"][cdc_name] + locations = info["location_ids"][cdc_name] + + # download and show timing information + print("downloading %s" % delphi_name) + t0 = time.time() + size = download_data(tier_id, locations, seasons, filename) + t1 = time.time() + + print(" saved %s (%d bytes in %.1f seconds)" % (filename, size, t1 - t0)) + files.append(filename) + + # return the current issue and the list of downloaded files + return issue, files diff --git a/src/acquisition/fluview/fluview_locations.py b/src/acquisition/fluview/fluview_locations.py index 9c851bc6f..e5ebe0fc3 100644 --- a/src/acquisition/fluview/fluview_locations.py +++ b/src/acquisition/fluview/fluview_locations.py @@ -15,100 +15,100 @@ # https://gis.cdc.gov/grasp/flu2/GetPhase02InitApp?appVersion=Public # The values are used in queries of Delphi's Epidata API. 
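For instance, with the mapping below and the case-insensitive get_location_name() helper at the bottom of this module, the ("REGION TYPE", "REGION") pair taken from a FluView download resolves to the short location code used in Epidata API queries; a couple of illustrative lookups:

    get_location_name("HHS Regions", "Region 5")   # -> "hhs5"
    get_location_name("States", "New York City")   # -> "jfk"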
cdc_to_delphi = { - 'national': { - 'x': 'nat', - }, - 'hhs regions': { - 'region 1': 'hhs1', - 'region 2': 'hhs2', - 'region 3': 'hhs3', - 'region 4': 'hhs4', - 'region 5': 'hhs5', - 'region 6': 'hhs6', - 'region 7': 'hhs7', - 'region 8': 'hhs8', - 'region 9': 'hhs9', - 'region 10': 'hhs10', - }, - 'census regions': { - 'new england': 'cen1', - 'mid-atlantic': 'cen2', - 'east north central': 'cen3', - 'west north central': 'cen4', - 'south atlantic': 'cen5', - 'east south central': 'cen6', - 'west south central': 'cen7', - 'mountain': 'cen8', - 'pacific': 'cen9', - }, - 'states': { - # states/territories: two-letter ISO 3166 - 'alabama': 'al', - 'alaska': 'ak', - 'arizona': 'az', - 'arkansas': 'ar', - 'california': 'ca', - 'colorado': 'co', - 'connecticut': 'ct', - 'delaware': 'de', - 'florida': 'fl', - 'georgia': 'ga', - 'hawaii': 'hi', - 'idaho': 'id', - 'illinois': 'il', - 'indiana': 'in', - 'iowa': 'ia', - 'kansas': 'ks', - 'kentucky': 'ky', - 'louisiana': 'la', - 'maine': 'me', - 'maryland': 'md', - 'massachusetts': 'ma', - 'michigan': 'mi', - 'minnesota': 'mn', - 'mississippi': 'ms', - 'missouri': 'mo', - 'montana': 'mt', - 'nebraska': 'ne', - 'nevada': 'nv', - 'new hampshire': 'nh', - 'new jersey': 'nj', - 'new mexico': 'nm', - # Even though it's called "New York", this location doesn't include New - # York City ("jfk"). New York ("ny") is actually this *plus* jfk. - 'new york': 'ny_minus_jfk', - 'north carolina': 'nc', - 'north dakota': 'nd', - 'ohio': 'oh', - 'oklahoma': 'ok', - 'oregon': 'or', - 'pennsylvania': 'pa', - 'rhode island': 'ri', - 'south carolina': 'sc', - 'south dakota': 'sd', - 'tennessee': 'tn', - 'texas': 'tx', - 'utah': 'ut', - 'vermont': 'vt', - 'virginia': 'va', - 'washington': 'wa', - 'west virginia': 'wv', - 'wisconsin': 'wi', - 'wyoming': 'wy', - 'american samoa': 'as', - 'commonwealth of the northern mariana islands': 'mp', - 'district of columbia': 'dc', - 'guam': 'gu', - 'puerto rico': 'pr', - 'virgin islands': 'vi', - # cities: three-letter IATA - 'chicago': 'ord', - 'los angeles': 'lax', - 'new york city': 'jfk', - }, + "national": { + "x": "nat", + }, + "hhs regions": { + "region 1": "hhs1", + "region 2": "hhs2", + "region 3": "hhs3", + "region 4": "hhs4", + "region 5": "hhs5", + "region 6": "hhs6", + "region 7": "hhs7", + "region 8": "hhs8", + "region 9": "hhs9", + "region 10": "hhs10", + }, + "census regions": { + "new england": "cen1", + "mid-atlantic": "cen2", + "east north central": "cen3", + "west north central": "cen4", + "south atlantic": "cen5", + "east south central": "cen6", + "west south central": "cen7", + "mountain": "cen8", + "pacific": "cen9", + }, + "states": { + # states/territories: two-letter ISO 3166 + "alabama": "al", + "alaska": "ak", + "arizona": "az", + "arkansas": "ar", + "california": "ca", + "colorado": "co", + "connecticut": "ct", + "delaware": "de", + "florida": "fl", + "georgia": "ga", + "hawaii": "hi", + "idaho": "id", + "illinois": "il", + "indiana": "in", + "iowa": "ia", + "kansas": "ks", + "kentucky": "ky", + "louisiana": "la", + "maine": "me", + "maryland": "md", + "massachusetts": "ma", + "michigan": "mi", + "minnesota": "mn", + "mississippi": "ms", + "missouri": "mo", + "montana": "mt", + "nebraska": "ne", + "nevada": "nv", + "new hampshire": "nh", + "new jersey": "nj", + "new mexico": "nm", + # Even though it's called "New York", this location doesn't include New + # York City ("jfk"). New York ("ny") is actually this *plus* jfk. 
+ "new york": "ny_minus_jfk", + "north carolina": "nc", + "north dakota": "nd", + "ohio": "oh", + "oklahoma": "ok", + "oregon": "or", + "pennsylvania": "pa", + "rhode island": "ri", + "south carolina": "sc", + "south dakota": "sd", + "tennessee": "tn", + "texas": "tx", + "utah": "ut", + "vermont": "vt", + "virginia": "va", + "washington": "wa", + "west virginia": "wv", + "wisconsin": "wi", + "wyoming": "wy", + "american samoa": "as", + "commonwealth of the northern mariana islands": "mp", + "district of columbia": "dc", + "guam": "gu", + "puerto rico": "pr", + "virgin islands": "vi", + # cities: three-letter IATA + "chicago": "ord", + "los angeles": "lax", + "new york city": "jfk", + }, } def get_location_name(region_type, region_name): - """Convert a CDC location type and name pair into a Delphi location name.""" - return cdc_to_delphi[region_type.lower()][region_name.lower()] + """Convert a CDC location type and name pair into a Delphi location name.""" + return cdc_to_delphi[region_type.lower()][region_name.lower()] diff --git a/src/acquisition/fluview/fluview_notify.py b/src/acquisition/fluview/fluview_notify.py index 13f0f3559..d4d426556 100644 --- a/src/acquisition/fluview/fluview_notify.py +++ b/src/acquisition/fluview/fluview_notify.py @@ -31,41 +31,41 @@ import delphi.operations.secrets as secrets -if __name__ == '__main__': - # Args and usage - parser = argparse.ArgumentParser() - parser.add_argument('-t', '--test', action='store_const', const=True, default=False, help="do dry run only, don't update the database") - args = parser.parse_args() +if __name__ == "__main__": + # Args and usage + parser = argparse.ArgumentParser() + parser.add_argument("-t", "--test", action="store_const", const=True, default=False, help="do dry run only, don't update the database") + args = parser.parse_args() - # connect - u, p = secrets.db.epi - cnx = mysql.connector.connect(user=u, password=p, database='epidata') - cur = cnx.cursor() + # connect + u, p = secrets.db.epi + cnx = mysql.connector.connect(user=u, password=p, database="epidata") + cur = cnx.cursor() - # get the last known issue from the automation table `variables` - cur.execute('SELECT `value` FROM automation.`variables` WHERE `name` = %s', ('most_recent_issue',)) - for (issue1,) in cur: - issue1 = int(issue1) - print('last known issue:', issue1) - # get the most recent issue from the epidata table `fluview` - cur.execute('SELECT max(`issue`) FROM `fluview`') - for (issue2,) in cur: - issue2 = int(issue2) - print('most recent issue:', issue2) + # get the last known issue from the automation table `variables` + cur.execute("SELECT `value` FROM automation.`variables` WHERE `name` = %s", ("most_recent_issue",)) + for (issue1,) in cur: + issue1 = int(issue1) + print("last known issue:", issue1) + # get the most recent issue from the epidata table `fluview` + cur.execute("SELECT max(`issue`) FROM `fluview`") + for (issue2,) in cur: + issue2 = int(issue2) + print("most recent issue:", issue2) - if issue2 > issue1: - print('new data is available!') - if args.test: - print('test mode - not making any changes') - else: - # update the variable - cur.execute('UPDATE automation.`variables` SET `value` = %s WHERE `name` = %s', (issue2, 'most_recent_issue')) - # queue the 'New FluView Available' flow - cur.execute('CALL automation.RunStep(36)') - elif issue2 < issue2: - raise Exception('most recent issue is older than the last known issue') + if issue2 > issue1: + print("new data is available!") + if args.test: + print("test mode - not making any 
changes") + else: + # update the variable + cur.execute("UPDATE automation.`variables` SET `value` = %s WHERE `name` = %s", (issue2, "most_recent_issue")) + # queue the 'New FluView Available' flow + cur.execute("CALL automation.RunStep(36)") + elif issue2 < issue1: + raise Exception("most recent issue is older than the last known issue") - # cleanup - cnx.commit() - cur.close() - cnx.close() + # cleanup + cnx.commit() + cur.close() + cnx.close() diff --git a/src/acquisition/fluview/fluview_update.py b/src/acquisition/fluview/fluview_update.py index 65bec7a40..bafa01855 100644 --- a/src/acquisition/fluview/fluview_update.py +++ b/src/acquisition/fluview/fluview_update.py @@ -130,398 +130,352 @@ from . import fluview_locations # sheet names -ILINET_SHEET = 'ILINet.csv' -PHL_SHEET = 'WHO_NREVSS_Public_Health_Labs.csv' -CL_SHEET = 'WHO_NREVSS_Clinical_Labs.csv' +ILINET_SHEET = "ILINet.csv" +PHL_SHEET = "WHO_NREVSS_Public_Health_Labs.csv" +CL_SHEET = "WHO_NREVSS_Clinical_Labs.csv" # table names -CL_TABLE = 'fluview_clinical' -PHL_TABLE = 'fluview_public' +CL_TABLE = "fluview_clinical" +PHL_TABLE = "fluview_public" + def optional_int(i): - return int(i) if i not in ('', 'X') else None + return int(i) if i not in ("", "X") else None + def optional_float(i, j): - return float(i) if i not in ('', 'X') else float(j) + return float(i) if i not in ("", "X") else float(j) + def nullable_float(i): - return float(i) if i not in ('', 'X') else None + return float(i) if i not in ("", "X") else None + def get_ilinet_data(row): - if row[0] == 'REGION TYPE' and row != [ - 'REGION TYPE', - 'REGION', - 'YEAR', - 'WEEK', - '% WEIGHTED ILI', - '%UNWEIGHTED ILI', - 'AGE 0-4', - 'AGE 25-49', - 'AGE 25-64', - 'AGE 5-24', - 'AGE 50-64', - 'AGE 65', - 'ILITOTAL', - 'NUM. 
OF PROVIDERS", + "TOTAL PATIENTS", + ]: + raise Exception("header row has changed") + if len(row) == 1 or row[0] == "REGION TYPE": + # this is a header row + return None + if row[5] == "X": + # ILI isn't reported, ignore this row + return None + return { + "location": fluview_locations.get_location_name(*row[:2]), + "epiweek": join_epiweek(int(row[2]), int(row[3])), + "wili": optional_float(*row[4:6]), + "ili": float(row[5]), + "age0": optional_int(row[6]), + "age1": optional_int(row[9]), + "age2": optional_int(row[8]), + "age3": optional_int(row[7]), + "age4": optional_int(row[10]), + "age5": optional_int(row[11]), + "n_ili": optional_int(row[12]), + "n_providers": optional_int(row[13]), + "n_patients": optional_int(row[14]), + } + def get_clinical_data(row): - if row[0] == 'REGION TYPE' and row != [ - 'REGION TYPE', - 'REGION', - 'YEAR', - 'WEEK', - 'TOTAL SPECIMENS', - 'TOTAL A', - 'TOTAL B', - 'PERCENT POSITIVE', - 'PERCENT A', - 'PERCENT B' - ]: - raise Exception('header row has changed for clinical lab data.') - if len(row) == 1 or row[0] == 'REGION TYPE': - # this is a header row - return None - if row[4] == 'X': - # data is not reported, ignore this row - return None - # ignore percentage calculations for now - return { - 'location': fluview_locations.get_location_name(*row[:2]), - 'epiweek': join_epiweek(int(row[2]), int(row[3])), - 'total_specimens': int(row[4]), - 'total_a': optional_int(row[5]), - 'total_b': optional_int(row[6]), - 'percent_positive': nullable_float(row[7]), - 'percent_a': nullable_float(row[8]), - 'percent_b': nullable_float(row[9]) - } + if row[0] == "REGION TYPE" and row != ["REGION TYPE", "REGION", "YEAR", "WEEK", "TOTAL SPECIMENS", "TOTAL A", "TOTAL B", "PERCENT POSITIVE", "PERCENT A", "PERCENT B"]: + raise Exception("header row has changed for clinical lab data.") + if len(row) == 1 or row[0] == "REGION TYPE": + # this is a header row + return None + if row[4] == "X": + # data is not reported, ignore this row + return None + # ignore percentage calculations for now + return { + "location": fluview_locations.get_location_name(*row[:2]), + "epiweek": join_epiweek(int(row[2]), int(row[3])), + "total_specimens": int(row[4]), + "total_a": optional_int(row[5]), + "total_b": optional_int(row[6]), + "percent_positive": nullable_float(row[7]), + "percent_a": nullable_float(row[8]), + "percent_b": nullable_float(row[9]), + } + def get_public_data(row): - hrow1 = [ - 'REGION TYPE', - 'REGION', - 'SEASON_DESCRIPTION', - 'TOTAL SPECIMENS', - 'A (2009 H1N1)', - 'A (H3)', - 'A (Subtyping not Performed)', - 'B', - 'BVic', - 'BYam', - 'H3N2v' - ] - hrow2 = [ - 'REGION TYPE', - 'REGION', - 'YEAR', - 'WEEK', - 'TOTAL SPECIMENS', - 'A (2009 H1N1)', - 'A (H3)', - 'A (Subtyping not Performed)', - 'B', - 'BVic', - 'BYam', - 'H3N2v' - ] - if row[0] == 'REGION TYPE' and row != hrow1 and row != hrow2: - raise Exception('header row has changed for public health lab data.') - if len(row) == 1 or row[0] == 'REGION TYPE': - # header row - return None - if row[3] == 'X': - # data is not reported, ignore this row - return None - # handle case where data is reported by season, not by epiweek - is_weekly = len(row) == len(hrow2) - # set epiweek - if is_weekly: - epiweek = join_epiweek(int(row[2]), int(row[3])) - else: - epiweek = int(row[2][7:11]) * 100 + 40 - # row offset - offset = 1 if is_weekly else 0 - return { - 'location': fluview_locations.get_location_name(*row[:2]), - 'epiweek': epiweek, - 'total_specimens': int(row[3 + offset]), - 'total_a_h1n1': optional_int(row[4+ offset]), - 
'total_a_h3': optional_int(row[5 + offset]), - 'total_a_h3n2v': optional_int(row[10 + offset]), - 'total_a_no_sub': optional_int(row[6 + offset]), - 'total_b': optional_int(row[7 + offset]), - 'total_b_vic': optional_int(row[8 + offset]), - 'total_b_yam': optional_int(row[9 + offset]) - } - -def load_zipped_csv(filename, sheetname='ILINet.csv'): - """Read rows from a zipped CSV, which is expected to be named as specified - by the sheetname parameter. Default is ILINet.csv, for the default flu data.""" - with zipfile.ZipFile(filename) as f: - with f.open(sheetname) as ff: - return [row for row in csv.reader(io.StringIO(str(ff.read(), 'utf-8')))] - -def get_rows(cnx, table='fluview'): - """Count and return the number of rows in the `fluview` table. - Looking at the fluview table by default, but may pass parameter - to look at public health or clinical lab data instead.""" - select = cnx.cursor() - select.execute('SELECT count(1) num FROM %s' % table) - for (num,) in select: - pass - select.close() - return num + hrow1 = ["REGION TYPE", "REGION", "SEASON_DESCRIPTION", "TOTAL SPECIMENS", "A (2009 H1N1)", "A (H3)", "A (Subtyping not Performed)", "B", "BVic", "BYam", "H3N2v"] + hrow2 = ["REGION TYPE", "REGION", "YEAR", "WEEK", "TOTAL SPECIMENS", "A (2009 H1N1)", "A (H3)", "A (Subtyping not Performed)", "B", "BVic", "BYam", "H3N2v"] + if row[0] == "REGION TYPE" and row != hrow1 and row != hrow2: + raise Exception("header row has changed for public health lab data.") + if len(row) == 1 or row[0] == "REGION TYPE": + # header row + return None + if row[3] == "X": + # data is not reported, ignore this row + return None + # handle case where data is reported by season, not by epiweek + is_weekly = len(row) == len(hrow2) + # set epiweek + if is_weekly: + epiweek = join_epiweek(int(row[2]), int(row[3])) + else: + epiweek = int(row[2][7:11]) * 100 + 40 + # row offset + offset = 1 if is_weekly else 0 + return { + "location": fluview_locations.get_location_name(*row[:2]), + "epiweek": epiweek, + "total_specimens": int(row[3 + offset]), + "total_a_h1n1": optional_int(row[4 + offset]), + "total_a_h3": optional_int(row[5 + offset]), + "total_a_h3n2v": optional_int(row[10 + offset]), + "total_a_no_sub": optional_int(row[6 + offset]), + "total_b": optional_int(row[7 + offset]), + "total_b_vic": optional_int(row[8 + offset]), + "total_b_yam": optional_int(row[9 + offset]), + } + + +def load_zipped_csv(filename, sheetname="ILINet.csv"): + """Read rows from a zipped CSV, which is expected to be named as specified + by the sheetname parameter. Default is ILINet.csv, for the default flu data.""" + with zipfile.ZipFile(filename) as f: + with f.open(sheetname) as ff: + return [row for row in csv.reader(io.StringIO(str(ff.read(), "utf-8")))] + + +def get_rows(cnx, table="fluview"): + """Count and return the number of rows in the `fluview` table. + Looking at the fluview table by default, but may pass parameter + to look at public health or clinical lab data instead.""" + select = cnx.cursor() + select.execute("SELECT count(1) num FROM %s" % table) + for (num,) in select: + pass + select.close() + return num + def update_from_file_clinical(issue, date, filename, test_mode=False): - """ - Read WHO/NREVSS data from a zipped CSV and insert into (or update) the database. 
- """ - - # database connection - u, p = secrets.db.epi - cnx = mysql.connector.connect(user=u, password=p, database='epidata') - rows1 = get_rows(cnx, CL_TABLE) - print('rows before: %d' % (rows1)) - insert = cnx.cursor() - - # load the data, ignoring empty rows - print('loading data from %s as issued on %d' % (filename, issue)) - rows = load_zipped_csv(filename, CL_SHEET) - print(' loaded %d rows' % len(rows)) - data = [get_clinical_data(row) for row in rows] - entries = [obj for obj in data if obj] - print(' found %d entries' % len(entries)) - - sql = ''' - INSERT INTO - `fluview_clinical` (`release_date`, `issue`, `epiweek`, `region`, `lag`, - `total_specimens`, `total_a`, `total_b`, `percent_positive`, `percent_a`, - `percent_b`) - VALUES - (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) - ON DUPLICATE KEY UPDATE - `release_date` = least(`release_date`, %s), - `total_specimens` = %s, - `total_a` = %s, - `total_b` = %s, - `percent_positive` = %s, - `percent_a` = %s, - `percent_b` = %s - ''' - - # insert each row - insert = cnx.cursor() - for row in entries: - lag = delta_epiweeks(row['epiweek'], issue) - args = [ - row['total_specimens'], row['total_a'], row['total_b'], - row['percent_positive'], row['percent_a'], row['percent_b'] - ] - ins_args = [date, issue, row['epiweek'], row['location'], lag] + args - upd_args = [date] + args - insert.execute(sql, ins_args + upd_args) - - # cleanup - insert.close() - if test_mode: - print('test mode, not committing') - rows2 = rows1 - else: - cnx.commit() - rows2 = get_rows(cnx) - print('rows after: %d (added %d)' % (rows2, rows2 - rows1)) - cnx.close() + """ + Read WHO/NREVSS data from a zipped CSV and insert into (or update) the database. + """ + + # database connection + u, p = secrets.db.epi + cnx = mysql.connector.connect(user=u, password=p, database="epidata") + rows1 = get_rows(cnx, CL_TABLE) + print("rows before: %d" % (rows1)) + insert = cnx.cursor() + + # load the data, ignoring empty rows + print("loading data from %s as issued on %d" % (filename, issue)) + rows = load_zipped_csv(filename, CL_SHEET) + print(" loaded %d rows" % len(rows)) + data = [get_clinical_data(row) for row in rows] + entries = [obj for obj in data if obj] + print(" found %d entries" % len(entries)) + + sql = """ + INSERT INTO + `fluview_clinical` (`release_date`, `issue`, `epiweek`, `region`, `lag`, + `total_specimens`, `total_a`, `total_b`, `percent_positive`, `percent_a`, + `percent_b`) + VALUES + (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) + ON DUPLICATE KEY UPDATE + `release_date` = least(`release_date`, %s), + `total_specimens` = %s, + `total_a` = %s, + `total_b` = %s, + `percent_positive` = %s, + `percent_a` = %s, + `percent_b` = %s + """ + + # insert each row + insert = cnx.cursor() + for row in entries: + lag = delta_epiweeks(row["epiweek"], issue) + args = [row["total_specimens"], row["total_a"], row["total_b"], row["percent_positive"], row["percent_a"], row["percent_b"]] + ins_args = [date, issue, row["epiweek"], row["location"], lag] + args + upd_args = [date] + args + insert.execute(sql, ins_args + upd_args) + + # cleanup + insert.close() + if test_mode: + print("test mode, not committing") + rows2 = rows1 + else: + cnx.commit() + rows2 = get_rows(cnx) + print("rows after: %d (added %d)" % (rows2, rows2 - rows1)) + cnx.close() + def update_from_file_public(issue, date, filename, test_mode=False): - """ - Read WHO/NREVSS data from a zipped CSV and insert into (or update) the database. 
- """ - - # database connection - u, p = secrets.db.epi - cnx = mysql.connector.connect(user=u, password=p, database='epidata') - rows1 = get_rows(cnx, PHL_TABLE) - print('rows before: %d' % (rows1)) - insert = cnx.cursor() - - # load the data, ignoring empty rows - print('loading data from %s as issued on %d' % (filename, issue)) - rows = load_zipped_csv(filename, PHL_SHEET) - print(' loaded %d rows' % len(rows)) - data = [get_public_data(row) for row in rows] - entries = [obj for obj in data if obj] - print(' found %d entries' % len(entries)) - - sql = ''' - INSERT INTO - `fluview_public` (`release_date`, `issue`, `epiweek`, `region`, `lag`, - `total_specimens`, `total_a_h1n1`, `total_a_h3`, `total_a_h3n2v`, - `total_a_no_sub`, `total_b`, `total_b_vic`, `total_b_yam`) - VALUES - (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) - ON DUPLICATE KEY UPDATE - `release_date` = least(`release_date`, %s), - `total_specimens` = %s, - `total_a_h1n1` = %s, - `total_a_h3` = %s, - `total_a_h3n2v` = %s, - `total_a_no_sub` = %s, - `total_b` = %s, - `total_b_vic` = %s, - `total_b_yam` = %s - ''' - - # insert each row - insert = cnx.cursor() - for row in entries: - lag = delta_epiweeks(row['epiweek'], issue) - args = [ - row['total_specimens'], row['total_a_h1n1'], row['total_a_h3'], - row['total_a_h3n2v'], row['total_a_no_sub'], row['total_b'], - row['total_b_vic'], row['total_b_yam'] - ] - ins_args = [date, issue, row['epiweek'], row['location'], lag] + args - upd_args = [date] + args - insert.execute(sql, ins_args + upd_args) - - # cleanup - insert.close() - if test_mode: - print('test mode, not committing') - rows2 = rows1 - else: - cnx.commit() - rows2 = get_rows(cnx) - print('rows after: %d (added %d)' % (rows2, rows2 - rows1)) - cnx.close() + """ + Read WHO/NREVSS data from a zipped CSV and insert into (or update) the database. 
+ """ + + # database connection + u, p = secrets.db.epi + cnx = mysql.connector.connect(user=u, password=p, database="epidata") + rows1 = get_rows(cnx, PHL_TABLE) + print("rows before: %d" % (rows1)) + insert = cnx.cursor() + + # load the data, ignoring empty rows + print("loading data from %s as issued on %d" % (filename, issue)) + rows = load_zipped_csv(filename, PHL_SHEET) + print(" loaded %d rows" % len(rows)) + data = [get_public_data(row) for row in rows] + entries = [obj for obj in data if obj] + print(" found %d entries" % len(entries)) + + sql = """ + INSERT INTO + `fluview_public` (`release_date`, `issue`, `epiweek`, `region`, `lag`, + `total_specimens`, `total_a_h1n1`, `total_a_h3`, `total_a_h3n2v`, + `total_a_no_sub`, `total_b`, `total_b_vic`, `total_b_yam`) + VALUES + (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) + ON DUPLICATE KEY UPDATE + `release_date` = least(`release_date`, %s), + `total_specimens` = %s, + `total_a_h1n1` = %s, + `total_a_h3` = %s, + `total_a_h3n2v` = %s, + `total_a_no_sub` = %s, + `total_b` = %s, + `total_b_vic` = %s, + `total_b_yam` = %s + """ + + # insert each row + insert = cnx.cursor() + for row in entries: + lag = delta_epiweeks(row["epiweek"], issue) + args = [row["total_specimens"], row["total_a_h1n1"], row["total_a_h3"], row["total_a_h3n2v"], row["total_a_no_sub"], row["total_b"], row["total_b_vic"], row["total_b_yam"]] + ins_args = [date, issue, row["epiweek"], row["location"], lag] + args + upd_args = [date] + args + insert.execute(sql, ins_args + upd_args) + + # cleanup + insert.close() + if test_mode: + print("test mode, not committing") + rows2 = rows1 + else: + cnx.commit() + rows2 = get_rows(cnx) + print("rows after: %d (added %d)" % (rows2, rows2 - rows1)) + cnx.close() + def update_from_file(issue, date, filename, test_mode=False): - """ - Read ILINet data from a zipped CSV and insert into (or update) the database. 
- """ - - # database connection - u, p = secrets.db.epi - cnx = mysql.connector.connect(user=u, password=p, database='epidata') - rows1 = get_rows(cnx) - print('rows before: %d' % (rows1)) - insert = cnx.cursor() - - # load the data, ignoring empty rows - print('loading data from %s as issued on %d' % (filename, issue)) - rows = load_zipped_csv(filename) - print(' loaded %d rows' % len(rows)) - data = [get_ilinet_data(row) for row in rows] - entries = [obj for obj in data if obj] - print(' found %d entries' % len(entries)) - - sql = ''' - INSERT INTO - `fluview` (`release_date`, `issue`, `epiweek`, `region`, `lag`, `num_ili`, - `num_patients`, `num_providers`, `wili`, `ili`, `num_age_0`, `num_age_1`, - `num_age_2`, `num_age_3`, `num_age_4`, `num_age_5`) - VALUES - (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) - ON DUPLICATE KEY UPDATE - `release_date` = least(`release_date`, %s), - `num_ili` = %s, - `num_patients` = %s, - `num_providers` = %s, - `wili` = %s, - `ili` = %s, - `num_age_0` = coalesce(%s, `num_age_0`), - `num_age_1` = coalesce(%s, `num_age_1`), - `num_age_2` = coalesce(%s, `num_age_2`), - `num_age_3` = coalesce(%s, `num_age_3`), - `num_age_4` = coalesce(%s, `num_age_4`), - `num_age_5` = coalesce(%s, `num_age_5`) - ''' - - # insert each row - insert = cnx.cursor() - for row in entries: - lag = delta_epiweeks(row['epiweek'], issue) - args = [ - row['n_ili'], row['n_patients'], row['n_providers'], row['wili'], - row['ili'], row['age0'], row['age1'], row['age2'], row['age3'], - row['age4'], row['age5'] - ] - ins_args = [date, issue, row['epiweek'], row['location'], lag] + args - upd_args = [date] + args - insert.execute(sql, ins_args + upd_args) - - # cleanup - insert.close() - if test_mode: - print('test mode, not committing') - rows2 = rows1 - else: - cnx.commit() - rows2 = get_rows(cnx) - print('rows after: %d (added %d)' % (rows2, rows2 - rows1)) - cnx.close() + """ + Read ILINet data from a zipped CSV and insert into (or update) the database. 
+ """ + + # database connection + u, p = secrets.db.epi + cnx = mysql.connector.connect(user=u, password=p, database="epidata") + rows1 = get_rows(cnx) + print("rows before: %d" % (rows1)) + insert = cnx.cursor() + + # load the data, ignoring empty rows + print("loading data from %s as issued on %d" % (filename, issue)) + rows = load_zipped_csv(filename) + print(" loaded %d rows" % len(rows)) + data = [get_ilinet_data(row) for row in rows] + entries = [obj for obj in data if obj] + print(" found %d entries" % len(entries)) + + sql = """ + INSERT INTO + `fluview` (`release_date`, `issue`, `epiweek`, `region`, `lag`, `num_ili`, + `num_patients`, `num_providers`, `wili`, `ili`, `num_age_0`, `num_age_1`, + `num_age_2`, `num_age_3`, `num_age_4`, `num_age_5`) + VALUES + (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) + ON DUPLICATE KEY UPDATE + `release_date` = least(`release_date`, %s), + `num_ili` = %s, + `num_patients` = %s, + `num_providers` = %s, + `wili` = %s, + `ili` = %s, + `num_age_0` = coalesce(%s, `num_age_0`), + `num_age_1` = coalesce(%s, `num_age_1`), + `num_age_2` = coalesce(%s, `num_age_2`), + `num_age_3` = coalesce(%s, `num_age_3`), + `num_age_4` = coalesce(%s, `num_age_4`), + `num_age_5` = coalesce(%s, `num_age_5`) + """ + + # insert each row + insert = cnx.cursor() + for row in entries: + lag = delta_epiweeks(row["epiweek"], issue) + args = [row["n_ili"], row["n_patients"], row["n_providers"], row["wili"], row["ili"], row["age0"], row["age1"], row["age2"], row["age3"], row["age4"], row["age5"]] + ins_args = [date, issue, row["epiweek"], row["location"], lag] + args + upd_args = [date] + args + insert.execute(sql, ins_args + upd_args) + + # cleanup + insert.close() + if test_mode: + print("test mode, not committing") + rows2 = rows1 + else: + cnx.commit() + rows2 = get_rows(cnx) + print("rows after: %d (added %d)" % (rows2, rows2 - rows1)) + cnx.close() + def main(): - # args and usage - parser = argparse.ArgumentParser() - parser.add_argument( - '--test', - action='store_true', - help='do dry run only, do not update the database' - ) - parser.add_argument( - '--file', - type=str, - help='load an existing zip file (otherwise fetch current data)' - ) - parser.add_argument( - '--issue', - type=int, - help='issue of the file (e.g. 
201740); used iff --file is given' - ) - args = parser.parse_args() - - if (args.file is None) != (args.issue is None): - raise Exception('--file and --issue must both be present or absent') - - date = datetime.datetime.now().strftime('%Y-%m-%d') - print('assuming release date is today, %s' % date) - - if args.file: - update_from_file(args.issue, date, args.file, test_mode=args.test) - update_from_file_clinical(args.issue, date, args.file, test_mode=args.test) - # TODO: header row has changed for public health lab data - # update_from_file_public(args.issue, date, args.file, test_mode=args.test) - else: - issue, files = fluview.save_latest(path='flu_data') - for filename in files: - update_from_file(issue, date, filename, test_mode=args.test) - update_from_file_clinical(issue, date, filename, test_mode=args.test) - # TODO: header row has changed for public health lab data - # update_from_file_public(issue, date, filename, test_mode=args.test) - -if __name__ == '__main__': - main() + # args and usage + parser = argparse.ArgumentParser() + parser.add_argument("--test", action="store_true", help="do dry run only, do not update the database") + parser.add_argument("--file", type=str, help="load an existing zip file (otherwise fetch current data)") + parser.add_argument("--issue", type=int, help="issue of the file (e.g. 201740); used iff --file is given") + args = parser.parse_args() + + if (args.file is None) != (args.issue is None): + raise Exception("--file and --issue must both be present or absent") + + date = datetime.datetime.now().strftime("%Y-%m-%d") + print("assuming release date is today, %s" % date) + + if args.file: + update_from_file(args.issue, date, args.file, test_mode=args.test) + update_from_file_clinical(args.issue, date, args.file, test_mode=args.test) + # TODO: header row has changed for public health lab data + # update_from_file_public(args.issue, date, args.file, test_mode=args.test) + else: + issue, files = fluview.save_latest(path="flu_data") + for filename in files: + update_from_file(issue, date, filename, test_mode=args.test) + update_from_file_clinical(issue, date, filename, test_mode=args.test) + # TODO: header row has changed for public health lab data + # update_from_file_public(issue, date, filename, test_mode=args.test) + + +if __name__ == "__main__": + main() diff --git a/src/acquisition/fluview/impute_missing_values.py b/src/acquisition/fluview/impute_missing_values.py index 7f9a23231..645daaba5 100644 --- a/src/acquisition/fluview/impute_missing_values.py +++ b/src/acquisition/fluview/impute_missing_values.py @@ -59,290 +59,281 @@ class Database: - """Database wrapper and abstraction layer.""" - - class Sql: - """Container for SQL constants.""" - - # Count the total number of imputed rows. - count_rows = ''' - SELECT - count(1) `num` - FROM - `fluview_imputed` - ''' - - # Find (issue, epiweek) pairs that exist in table `fluview` but not in - # table `fluview_imputed`. Note that only issues >= 201740 are selected - # because that's when CDC first started posting state-level ILINet data. - # This assumes that `fluview` is always missing at least one location. - find_missing_rows = ''' - SELECT - fv.`issue`, fv.`epiweek` - FROM ( + """Database wrapper and abstraction layer.""" + + class Sql: + """Container for SQL constants.""" + + # Count the total number of imputed rows. 
+ count_rows = """ SELECT - `issue`, `epiweek` + count(1) `num` FROM - `fluview` + `fluview_imputed` + """ + + # Find (issue, epiweek) pairs that exist in table `fluview` but not in + # table `fluview_imputed`. Note that only issues >= 201740 are selected + # because that's when CDC first started posting state-level ILINet data. + # This assumes that `fluview` is always missing at least one location. + find_missing_rows = """ + SELECT + fv.`issue`, fv.`epiweek` + FROM ( + SELECT + `issue`, `epiweek` + FROM + `fluview` + WHERE + `issue` >= 201740 + GROUP BY + `issue`, `epiweek` + ) fv + LEFT JOIN ( + SELECT + `issue`, `epiweek` + FROM + `fluview_imputed` + GROUP BY + `issue`, `epiweek` + ) fvi + ON + fvi.`issue` = fv.`issue` AND fvi.`epiweek` = fv.`epiweek` WHERE - `issue` >= 201740 - GROUP BY - `issue`, `epiweek` - ) fv - LEFT JOIN ( + fvi.`issue` IS NULL + """ + + # Read all location rows from the `fluview` table for a given issue and + # epiweek. + get_known_values = """ SELECT - `issue`, `epiweek` + `region`, `num_ili`, `num_patients`, `num_providers` FROM - `fluview_imputed` - GROUP BY - `issue`, `epiweek` - ) fvi - ON - fvi.`issue` = fv.`issue` AND fvi.`epiweek` = fv.`epiweek` - WHERE - fvi.`issue` IS NULL - ''' - - # Read all location rows from the `fluview` table for a given issue and - # epiweek. - get_known_values = ''' - SELECT - `region`, `num_ili`, `num_patients`, `num_providers` - FROM - `fluview` - WHERE - `issue` = %s AND `epiweek` = %s - ''' - - # Insert location rows into the `fluview_imputed` table for a given issue - # and epiweek. - add_imputed_values = ''' - INSERT INTO - `fluview_imputed` ( - `issue`, - `epiweek`, - `region`, - `lag`, - `num_ili`, - `num_patients`, - `num_providers`, - `ili` - ) - VALUES - (%s, %s, %s, %s, %s, %s, %s, %s) - ''' - - def connect(self): - """Connect to the database.""" - u, p = secrets.db.epi - self.cnx = mysql.connector.connect(user=u, password=p, database='epidata') - self.cur = self.cnx.cursor() - - def close(self, commit): - """ - Close the connection to the database, committing or rolling back changes as - indicated. - """ - self.cur.close() - if commit: - self.cnx.commit() - else: - print('test mode, not committing') - self.cnx.close() - - def count_rows(self): - """Count and return the number of rows in the `fluview_imputed` table.""" - self.cur.execute(Database.Sql.count_rows) - for (num,) in self.cur: - return num - - def find_missing_rows(self): - """ - Find rows that still have missing values. Each missing row is uniquely - identified by an (issue, epiweek, location) tuple. This function finds the - first two. - """ + `fluview` + WHERE + `issue` = %s AND `epiweek` = %s + """ + + # Insert location rows into the `fluview_imputed` table for a given issue + # and epiweek. + add_imputed_values = """ + INSERT INTO + `fluview_imputed` ( + `issue`, + `epiweek`, + `region`, + `lag`, + `num_ili`, + `num_patients`, + `num_providers`, + `ili` + ) + VALUES + (%s, %s, %s, %s, %s, %s, %s, %s) + """ + + def connect(self): + """Connect to the database.""" + u, p = secrets.db.epi + self.cnx = mysql.connector.connect(user=u, password=p, database="epidata") + self.cur = self.cnx.cursor() + + def close(self, commit): + """ + Close the connection to the database, committing or rolling back changes as + indicated. 
+ """ + self.cur.close() + if commit: + self.cnx.commit() + else: + print("test mode, not committing") + self.cnx.close() + + def count_rows(self): + """Count and return the number of rows in the `fluview_imputed` table.""" + self.cur.execute(Database.Sql.count_rows) + for (num,) in self.cur: + return num + + def find_missing_rows(self): + """ + Find rows that still have missing values. Each missing row is uniquely + identified by an (issue, epiweek, location) tuple. This function finds the + first two. + """ + + self.cur.execute(Database.Sql.find_missing_rows) + return [(issue, epiweek) for (issue, epiweek) in self.cur] + + def get_known_values(self, issue, epiweek): + """ + Fetch ILINet data for all locations available for the given issue and + epiweek. The returned value is a dict mapping from locations to ILI data. + """ + + self.cur.execute(Database.Sql.get_known_values, (issue, epiweek)) + return dict([(loc, (n_ili, n_pat, n_prov)) for (loc, n_ili, n_pat, n_prov) in self.cur]) + + def add_imputed_values(self, issue, epiweek, imputed): + """ + Store imputed ILINet data for the given locations on the given issue and + epiweek. The imputed value is a dict mapping from locations to ILI data. + """ + + for loc in imputed.keys(): + lag, n_ili, n_pat, n_prov, ili = imputed[loc] + args = (issue, epiweek, loc, lag, n_ili, n_pat, n_prov, ili) + self.cur.execute(Database.Sql.add_imputed_values, args) - self.cur.execute(Database.Sql.find_missing_rows) - return [(issue, epiweek) for (issue, epiweek) in self.cur] - def get_known_values(self, issue, epiweek): - """ - Fetch ILINet data for all locations available for the given issue and - epiweek. The returned value is a dict mapping from locations to ILI data. - """ +class StatespaceException(Exception): + """Used to indicate that imputation is not possible with the given inputs.""" - self.cur.execute(Database.Sql.get_known_values, (issue, epiweek)) - return dict([ - (loc, (n_ili, n_pat, n_prov)) - for - (loc, n_ili, n_pat, n_prov) - in self.cur - ]) - def add_imputed_values(self, issue, epiweek, imputed): +def get_location_graph(): """ - Store imputed ILINet data for the given locations on the given issue and - epiweek. The imputed value is a dict mapping from locations to ILI data. + Return a matrix where rows represent regions, columns represent atoms, and + each entry is a 1 if the region contains the atom, otherwise 0. The + corresponding lists of regions and atoms are also returned. """ - for loc in imputed.keys(): - lag, n_ili, n_pat, n_prov, ili = imputed[loc] - args = (issue, epiweek, loc, lag, n_ili, n_pat, n_prov, ili) - self.cur.execute(Database.Sql.add_imputed_values, args) - - -class StatespaceException(Exception): - """Used to indicate that imputation is not possible with the given inputs.""" - - -def get_location_graph(): - """ - Return a matrix where rows represent regions, columns represent atoms, and - each entry is a 1 if the region contains the atom, otherwise 0. The - corresponding lists of regions and atoms are also returned. 
- """ - - regions = sorted(Locations.region_list) - atoms = sorted(Locations.atom_list) - graph = np.zeros((len(regions), len(atoms))) - for i, r in enumerate(regions): - for a in Locations.region_map[r]: - j = atoms.index(a) - graph[i, j] = 1 - return graph, regions, atoms + regions = sorted(Locations.region_list) + atoms = sorted(Locations.atom_list) + graph = np.zeros((len(regions), len(atoms))) + for i, r in enumerate(regions): + for a in Locations.region_map[r]: + j = atoms.index(a) + graph[i, j] = 1 + return graph, regions, atoms def get_fusion_parameters(known_locations): - """ - Return a matrix that fuses known ILI values into unknown ILI values. The - corresponding lists of known and unknown locations are also returned. + """ + Return a matrix that fuses known ILI values into unknown ILI values. The + corresponding lists of known and unknown locations are also returned. - The goal is to infer ILI data in all locations, given ILI data in some - partial set of locations. This function takes a sensor fusion approach. + The goal is to infer ILI data in all locations, given ILI data in some + partial set of locations. This function takes a sensor fusion approach. - Let $z$ be a column vector of values in reported locations. Let $y$ be the - desired column vector of values in unreported locations. With matrices $H$ - (mapping from latent state to reported values), $W$ (mapping from latent - state to unreported values), and $R = I$ (covariance, which is identity): + Let $z$ be a column vector of values in reported locations. Let $y$ be the + desired column vector of values in unreported locations. With matrices $H$ + (mapping from latent state to reported values), $W$ (mapping from latent + state to unreported values), and $R = I$ (covariance, which is identity): - $y = W (H^T R^{-1} H)^{-1} H^T R^{-1} z$ - $y = W (H^T H)^{-1} H^T z$ + $y = W (H^T R^{-1} H)^{-1} H^T R^{-1} z$ + $y = W (H^T H)^{-1} H^T z$ - This is equavalent to OLS regression with an added translation from atomic - locations to missing locations. Unknown values are computed as a linear - combination of known values. - """ + This is equavalent to OLS regression with an added translation from atomic + locations to missing locations. Unknown values are computed as a linear + combination of known values. 
+ """ - graph, regions, atoms = get_location_graph() - is_known = np.array([r in known_locations for r in regions]) - is_unknown = np.logical_not(is_known) - if not np.any(is_known): - raise StatespaceException('no values are known') - if not np.any(is_unknown): - raise StatespaceException('no values are unknown') + graph, regions, atoms = get_location_graph() + is_known = np.array([r in known_locations for r in regions]) + is_unknown = np.logical_not(is_known) + if not np.any(is_known): + raise StatespaceException("no values are known") + if not np.any(is_unknown): + raise StatespaceException("no values are unknown") - H = graph[is_known, :] - W = graph[is_unknown, :] - if np.linalg.matrix_rank(H) != len(atoms): - raise StatespaceException('system is underdetermined') + H = graph[is_known, :] + W = graph[is_unknown, :] + if np.linalg.matrix_rank(H) != len(atoms): + raise StatespaceException("system is underdetermined") - HtH = np.dot(H.T, H) - HtH_inv = np.linalg.inv(HtH) - H_pseudo_inv = np.dot(HtH_inv, H.T) - fuser = np.dot(W, H_pseudo_inv) + HtH = np.dot(H.T, H) + HtH_inv = np.linalg.inv(HtH) + H_pseudo_inv = np.dot(HtH_inv, H.T) + fuser = np.dot(W, H_pseudo_inv) - locations = np.array(regions) - filter_locations = lambda selected: list(map(str, locations[selected])) - return fuser, filter_locations(is_known), filter_locations(is_unknown) + locations = np.array(regions) + filter_locations = lambda selected: list(map(str, locations[selected])) + return fuser, filter_locations(is_known), filter_locations(is_unknown) def get_lag_and_ili(issue, epiweek, num_ili, num_patients): - """ - Compute and return reporting lag and percent ILI from imputed ILINet data. - """ - lag = delta_epiweeks(epiweek, issue) - ili = 100.0 * (0 if num_patients == 0 else num_ili / num_patients) - return lag, ili + """ + Compute and return reporting lag and percent ILI from imputed ILINet data. + """ + lag = delta_epiweeks(epiweek, issue) + ili = 100.0 * (0 if num_patients == 0 else num_ili / num_patients) + return lag, ili def impute_missing_values(database, test_mode=False): - """ - Determine whether values are missing for any states and territories. If so, - impute them and store them in the database. - """ - - # database connection - database.connect() - rows1 = database.count_rows() - print('rows before: %d' % (rows1)) - - # iterate over missing epiweeks - missing_rows = database.find_missing_rows() - print('missing data for %d epiweeks' % len(missing_rows)) - for issue, epiweek in missing_rows: - print('i=%d e=%d' % (issue, epiweek)) - - # get known values from table `fluview` - known_values = database.get_known_values(issue, epiweek) - - # Unlike most other state-level data, which typically begins publicly on - # 2010w40, data for PR begins on 2013w40. Before this, there are no reports - # for PR. Here we assume that no report is equivalent to a report of all - # zeros (number of ILI, patients, and providers). That's mostly true, with - # the notable exception of wILI, but that's not relevant here. By assuming - # that PR reports zero on those weeks, it's possible to impute values for - # VI, which are otherwise not reported until 2015w40. 
- assume_pr_zero = epiweek < 201340 and 'pr' not in known_values - if assume_pr_zero: - known_values['pr'] = (0, 0, 0) - - # get the imputation matrix and lists of known and unknown locations - F, known, unknown = get_fusion_parameters(known_values.keys()) - - # finally, impute the missing values - z = np.array([known_values[k] for k in known]) - y = np.dot(F, z) - - # possibly also record the assumptions made for PR - if assume_pr_zero: - unknown.append('pr') - y = np.vstack((y, [known_values['pr']])) - - # add lag and percent ILI to the data for each imputed location - imputed_values = {} - for loc, values in zip(unknown, y): - n_ili, n_pat, n_prov = map(int, np.rint(values)) - lag, ili = get_lag_and_ili(issue, epiweek, n_ili, n_pat) - imputed_values[loc] = (lag, n_ili, n_pat, n_prov, ili) - print(' %s: %s' % (loc, str(imputed_values[loc]))) - - # save all imputed values in table `fluview_imputed` - database.add_imputed_values(issue, epiweek, imputed_values) - - # database cleanup - rows2 = database.count_rows() - print('rows after: %d (added %d)' % (rows2, rows2 - rows1)) - commit = not test_mode - database.close(commit) + """ + Determine whether values are missing for any states and territories. If so, + impute them and store them in the database. + """ + + # database connection + database.connect() + rows1 = database.count_rows() + print("rows before: %d" % (rows1)) + + # iterate over missing epiweeks + missing_rows = database.find_missing_rows() + print("missing data for %d epiweeks" % len(missing_rows)) + for issue, epiweek in missing_rows: + print("i=%d e=%d" % (issue, epiweek)) + + # get known values from table `fluview` + known_values = database.get_known_values(issue, epiweek) + + # Unlike most other state-level data, which typically begins publicly on + # 2010w40, data for PR begins on 2013w40. Before this, there are no reports + # for PR. Here we assume that no report is equivalent to a report of all + # zeros (number of ILI, patients, and providers). That's mostly true, with + # the notable exception of wILI, but that's not relevant here. By assuming + # that PR reports zero on those weeks, it's possible to impute values for + # VI, which are otherwise not reported until 2015w40. 
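# Illustrative sketch (not part of this diff): the fusion step a few lines below
# computes y = np.dot(F, z), where F = W (H^T H)^{-1} H^T is returned by
# get_fusion_parameters(). A toy case with two atoms and three known regions:
#
#   import numpy as np
#   H = np.array([[1., 0.], [0., 1.], [1., 1.]])  # known regions over the atoms
#   W = np.array([[1., 1.]])                      # one unknown region covering both atoms
#   z = np.array([3., 5., 8.])                    # values reported for the known regions
#   F = W @ np.linalg.inv(H.T @ H) @ H.T          # fusion matrix
#   y = F @ z                                     # -> array([8.]) for the unknown region
#
# The least-squares atom estimates are [3, 5], so the unknown region spanning
# both atoms is imputed as 8. The real code builds H and W from Locations.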
+ assume_pr_zero = epiweek < 201340 and "pr" not in known_values + if assume_pr_zero: + known_values["pr"] = (0, 0, 0) + + # get the imputation matrix and lists of known and unknown locations + F, known, unknown = get_fusion_parameters(known_values.keys()) + + # finally, impute the missing values + z = np.array([known_values[k] for k in known]) + y = np.dot(F, z) + + # possibly also record the assumptions made for PR + if assume_pr_zero: + unknown.append("pr") + y = np.vstack((y, [known_values["pr"]])) + + # add lag and percent ILI to the data for each imputed location + imputed_values = {} + for loc, values in zip(unknown, y): + n_ili, n_pat, n_prov = map(int, np.rint(values)) + lag, ili = get_lag_and_ili(issue, epiweek, n_ili, n_pat) + imputed_values[loc] = (lag, n_ili, n_pat, n_prov, ili) + print(" %s: %s" % (loc, str(imputed_values[loc]))) + + # save all imputed values in table `fluview_imputed` + database.add_imputed_values(issue, epiweek, imputed_values) + + # database cleanup + rows2 = database.count_rows() + print("rows after: %d (added %d)" % (rows2, rows2 - rows1)) + commit = not test_mode + database.close(commit) def get_argument_parser(): - """Set up command line arguments and usage.""" - parser = argparse.ArgumentParser() - parser.add_argument( - '--test', - action='store_true', - help='do dry run only, do not update the database' - ) - return parser + """Set up command line arguments and usage.""" + parser = argparse.ArgumentParser() + parser.add_argument("--test", action="store_true", help="do dry run only, do not update the database") + return parser def main(): - """Run this script from the command line.""" - args = get_argument_parser().parse_args() - impute_missing_values(Database(), test_mode=args.test) + """Run this script from the command line.""" + args = get_argument_parser().parse_args() + impute_missing_values(Database(), test_mode=args.test) -if __name__ == '__main__': - main() +if __name__ == "__main__": + main() diff --git a/src/acquisition/ght/ght_update.py b/src/acquisition/ght/ght_update.py index c1e9b8d94..1cd5c5693 100644 --- a/src/acquisition/ght/ght_update.py +++ b/src/acquisition/ght/ght_update.py @@ -1,4 +1,4 @@ -''' +""" =============== === Purpose === =============== @@ -63,7 +63,7 @@ * fixed multiple-word queries (surround with quotes) 2015-12-01 * Original version -''' +""" # standard library import argparse @@ -88,304 +88,304 @@ # 2010-04-19 and 2015-05-05 # see: https://www.google.com/trends/correlate TERMS = [ - '/m/0cycc', - 'influenza type a', - 'flu duration', - 'flu fever', - 'treating flu', - 'fever flu', - 'flu recovery', - 'braun thermoscan', - 'oscillococcinum', - 'treating the flu', - 'cold or flu', - 'flu versus cold', - 'flu remedies', - 'contagious flu', - 'type a influenza', - 'flu or cold', - 'duration of flu', - 'cold versus flu', - 'flu cough', - 'flu headache', - 'thermoscan', - 'influenza incubation period', - 'flu lasts', - 'length of flu', - 'flu stomach', - 'cold vs flu', - 'flu and fever', - 'getting over the flu', - 'influenza a', - 'treatment for flu', - 'flu length', - 'treatment for the flu', - 'influenza symptoms', - 'over the counter flu', - 'flu complications', - 'cold and flu symptoms', - 'influenza incubation', - 'treatment of flu', - 'human temperature', - 'low body', - 'flu contagious', - 'robitussin ac', - 'flu how long', - 'ear thermometer', - 'flu contagious period', - 'treat flu', - 'cough flu', - 'low body temperature', - 'expectorant', - 'flu and cold', - 'rapid flu', - 'flu vs. 
cold', - 'how to treat the flu', - 'how long does the flu last?', - 'viral pneumonia', - 'flu in kids', - 'type a flu', - 'influenza treatment', - 'fighting the flu', - 'flu relief', - 'treat the flu', - 'flu medicine', - 'dangerous fever', - 'what is influenza', - 'tussin', - 'low body temp', - 'flu care', - 'flu in infants', - 'flu dizziness', - 'feed a fever', - 'flu vs cold', - 'flu vomiting', - 'bacterial pneumonia', - 'flu activity', - 'flu chills', - 'anas barbariae', - 'flu germs', - 'tylenol cold', - 'how to get over the flu', - 'flu in children', - 'influenza a and b', - 'duration of the flu', - 'cold symptoms', - 'flu report', - 'rapid flu test', - 'flu relapse', - 'get over the flu', - 'flu during pregnancy', - 'flu recovery time', - 'cure for flu', - 'tamiflu and breastfeeding', - 'flu chest pain', - 'flu treatment', - 'flu nausea', - 'remedies for the flu', - 'tamiflu in pregnancy', - 'side effects of tamiflu', - 'how to treat flu', - 'viral bronchitis', - 'flu how long contagious', - 'flu remedy', + "/m/0cycc", + "influenza type a", + "flu duration", + "flu fever", + "treating flu", + "fever flu", + "flu recovery", + "braun thermoscan", + "oscillococcinum", + "treating the flu", + "cold or flu", + "flu versus cold", + "flu remedies", + "contagious flu", + "type a influenza", + "flu or cold", + "duration of flu", + "cold versus flu", + "flu cough", + "flu headache", + "thermoscan", + "influenza incubation period", + "flu lasts", + "length of flu", + "flu stomach", + "cold vs flu", + "flu and fever", + "getting over the flu", + "influenza a", + "treatment for flu", + "flu length", + "treatment for the flu", + "influenza symptoms", + "over the counter flu", + "flu complications", + "cold and flu symptoms", + "influenza incubation", + "treatment of flu", + "human temperature", + "low body", + "flu contagious", + "robitussin ac", + "flu how long", + "ear thermometer", + "flu contagious period", + "treat flu", + "cough flu", + "low body temperature", + "expectorant", + "flu and cold", + "rapid flu", + "flu vs. 
cold", + "how to treat the flu", + "how long does the flu last?", + "viral pneumonia", + "flu in kids", + "type a flu", + "influenza treatment", + "fighting the flu", + "flu relief", + "treat the flu", + "flu medicine", + "dangerous fever", + "what is influenza", + "tussin", + "low body temp", + "flu care", + "flu in infants", + "flu dizziness", + "feed a fever", + "flu vs cold", + "flu vomiting", + "bacterial pneumonia", + "flu activity", + "flu chills", + "anas barbariae", + "flu germs", + "tylenol cold", + "how to get over the flu", + "flu in children", + "influenza a and b", + "duration of the flu", + "cold symptoms", + "flu report", + "rapid flu test", + "flu relapse", + "get over the flu", + "flu during pregnancy", + "flu recovery time", + "cure for flu", + "tamiflu and breastfeeding", + "flu chest pain", + "flu treatment", + "flu nausea", + "remedies for the flu", + "tamiflu in pregnancy", + "side effects of tamiflu", + "how to treat flu", + "viral bronchitis", + "flu how long contagious", + "flu remedy", ] # a list of all US states, including DC and the US as a whole LOCATIONS = [ - 'US', - 'AL', - 'AK', - 'AZ', - 'AR', - 'CA', - 'CO', - 'CT', - 'DC', - 'DE', - 'FL', - 'GA', - 'HI', - 'ID', - 'IL', - 'IN', - 'IA', - 'KS', - 'KY', - 'LA', - 'ME', - 'MD', - 'MA', - 'MI', - 'MN', - 'MS', - 'MO', - 'MT', - 'NE', - 'NV', - 'NH', - 'NJ', - 'NM', - 'NY', - 'NC', - 'ND', - 'OH', - 'OK', - 'OR', - 'PA', - 'RI', - 'SC', - 'SD', - 'TN', - 'TX', - 'UT', - 'VT', - 'VA', - 'WA', - 'WV', - 'WI', - 'WY', + "US", + "AL", + "AK", + "AZ", + "AR", + "CA", + "CO", + "CT", + "DC", + "DE", + "FL", + "GA", + "HI", + "ID", + "IL", + "IN", + "IA", + "KS", + "KY", + "LA", + "ME", + "MD", + "MA", + "MI", + "MN", + "MS", + "MO", + "MT", + "NE", + "NV", + "NH", + "NJ", + "NM", + "NY", + "NC", + "ND", + "OH", + "OK", + "OR", + "PA", + "RI", + "SC", + "SD", + "TN", + "TX", + "UT", + "VT", + "VA", + "WA", + "WV", + "WI", + "WY", ] -def update(locations, terms, first=None, last=None, countries=['US']): - # connect to the database - u, p = secrets.db.epi - cnx = mysql.connector.connect(user=u, password=p, database='epidata') - cur = cnx.cursor() +def update(locations, terms, first=None, last=None, countries=["US"]): + # connect to the database + u, p = secrets.db.epi + cnx = mysql.connector.connect(user=u, password=p, database="epidata") + cur = cnx.cursor() - def get_num_rows(): - cur.execute('SELECT count(1) `num` FROM `ght`') - for (num,) in cur: - pass - return num + def get_num_rows(): + cur.execute("SELECT count(1) `num` FROM `ght`") + for (num,) in cur: + pass + return num - # check from 4 weeks preceeding the last week with data through this week - cur.execute('SELECT max(`epiweek`) `ew0`, yearweek(now(), 6) `ew1` FROM `ght`') - for (ew0, ew1) in cur: - ew0 = 200401 if ew0 is None else flu.add_epiweeks(ew0, -4) - ew0 = ew0 if first is None else first - ew1 = ew1 if last is None else last - print('Checking epiweeks between %d and %d...' % (ew0, ew1)) + # check from 4 weeks preceeding the last week with data through this week + cur.execute("SELECT max(`epiweek`) `ew0`, yearweek(now(), 6) `ew1` FROM `ght`") + for (ew0, ew1) in cur: + ew0 = 200401 if ew0 is None else flu.add_epiweeks(ew0, -4) + ew0 = ew0 if first is None else first + ew1 = ew1 if last is None else last + print("Checking epiweeks between %d and %d..." 
% (ew0, ew1)) - # keep track of how many rows were added - rows_before = get_num_rows() + # keep track of how many rows were added + rows_before = get_num_rows() - # check Google Trends for new and/or revised data - sql = ''' + # check Google Trends for new and/or revised data + sql = """ INSERT INTO `ght` (`query`, `location`, `epiweek`, `value`) VALUES (%s, %s, %s, %s) ON DUPLICATE KEY UPDATE `value` = %s - ''' - total_rows = 0 - ght = GHT(API_KEY) - for term in terms: - print(' [%s] using term' % term) - ll, cl = len(locations), len(countries) - for i in range(max(ll,cl)): - location = locations[i] if i < ll else locations[0] - country = countries[i] if i < cl else countries[0] - try: - #term2 = ('"%s"' % term) if ' ' in term else term - term2 = term - attempt = 0 - while True: - attempt += 1 - try: - result = ght.get_data(ew0, ew1, location, term2, country=country) - break - except Exception as ex: - if attempt >= 5: - raise ex - else: - delay = 2 ** attempt - print(' [%s|%s] caught exception (will retry in %ds):' % (term, location, delay), ex) - time.sleep(delay) - values = [p['value'] for p in result['data']['lines'][0]['points']] - ew = result['start_week'] - num_missing = 0 - for v in values: - # Default SQL location value for US country for backwards compatibility - # i.e. California's location is still stored as 'CA', - # and having location == 'US' is still stored as 'US' - sql_location = location if location != NO_LOCATION_STR else country - - # Change SQL location for non-US countries - if country != 'US': - # Underscore added to distinguish countries from 2-letter US states - sql_location = country + "_" - if location != NO_LOCATION_STR: - sql_location = sql_location + location - sql_data = (term, sql_location, ew, v, v) - cur.execute(sql, sql_data) - total_rows += 1 - if v == 0: - num_missing += 1 - #print(' [%s|%s|%d] missing value' % (term, location, ew)) - ew = flu.add_epiweeks(ew, 1) - if num_missing > 0: - print(' [%s|%s] missing %d/%d value(s)' % (term, location, num_missing, len(values))) - except Exception as ex: - print(' [%s|%s] caught exception (will NOT retry):' % (term, location), ex) - - # keep track of how many rows were added - rows_after = get_num_rows() - print('Inserted %d/%d row(s)'%(rows_after - rows_before, total_rows)) - - # cleanup - cur.close() - cnx.commit() - cnx.close() + """ + total_rows = 0 + ght = GHT(API_KEY) + for term in terms: + print(" [%s] using term" % term) + ll, cl = len(locations), len(countries) + for i in range(max(ll, cl)): + location = locations[i] if i < ll else locations[0] + country = countries[i] if i < cl else countries[0] + try: + # term2 = ('"%s"' % term) if ' ' in term else term + term2 = term + attempt = 0 + while True: + attempt += 1 + try: + result = ght.get_data(ew0, ew1, location, term2, country=country) + break + except Exception as ex: + if attempt >= 5: + raise ex + else: + delay = 2**attempt + print(" [%s|%s] caught exception (will retry in %ds):" % (term, location, delay), ex) + time.sleep(delay) + values = [p["value"] for p in result["data"]["lines"][0]["points"]] + ew = result["start_week"] + num_missing = 0 + for v in values: + # Default SQL location value for US country for backwards compatibility + # i.e. 
California's location is still stored as 'CA', + # and having location == 'US' is still stored as 'US' + sql_location = location if location != NO_LOCATION_STR else country + + # Change SQL location for non-US countries + if country != "US": + # Underscore added to distinguish countries from 2-letter US states + sql_location = country + "_" + if location != NO_LOCATION_STR: + sql_location = sql_location + location + sql_data = (term, sql_location, ew, v, v) + cur.execute(sql, sql_data) + total_rows += 1 + if v == 0: + num_missing += 1 + # print(' [%s|%s|%d] missing value' % (term, location, ew)) + ew = flu.add_epiweeks(ew, 1) + if num_missing > 0: + print(" [%s|%s] missing %d/%d value(s)" % (term, location, num_missing, len(values))) + except Exception as ex: + print(" [%s|%s] caught exception (will NOT retry):" % (term, location), ex) + + # keep track of how many rows were added + rows_after = get_num_rows() + print("Inserted %d/%d row(s)" % (rows_after - rows_before, total_rows)) + + # cleanup + cur.close() + cnx.commit() + cnx.close() def main(): - # args and usage - parser = argparse.ArgumentParser() - parser.add_argument('location', action='store', type=str, default=None, help='location(s) (ex: all; US; TX; CA,LA,WY)') - parser.add_argument('term', action='store', type=str, default=None, help='term/query/topic (ex: all; /m/0cycc; "flu fever")') - parser.add_argument('--first', '-f', default=None, type=int, help='first epiweek override') - parser.add_argument('--last', '-l', default=None, type=int, help='last epiweek override') - parser.add_argument('--country', '-c', default='US', type=str, help='location country (ex: US; BR)') - args = parser.parse_args() - - # sanity check - first, last = args.first, args.last - if first is not None: - flu.check_epiweek(first) - if last is not None: - flu.check_epiweek(last) - if first is not None and last is not None and first > last: - raise Exception('epiweeks in the wrong order') - - # decide what to update - if args.location.lower() == 'all': - locations = LOCATIONS - elif args.location.lower() == 'none': - locations = [NO_LOCATION_STR] - else: - locations = args.location.upper().split(',') - if args.term.lower() == 'all': - terms = TERMS - else: - terms = [args.term] - - # country argument - # Check that country follows ISO 1366 Alpha-2 code. - # See https://www.iso.org/obp/ui/#search. 
- countries = args.country.upper().split(',') - if not all(map(lambda x: len(x) == 2, countries)): - raise Exception('country name must be two letters (ISO 1366 Alpha-2)') - - # if length of locations and countries is > 1, need to be the same - if len(locations) > 1 and len(countries) > 1 and len(locations) != len(countries): - raise Exception('locations and countries must be length 1, or same length') - - # run the update - update(locations, terms, first, last, countries) - - -if __name__ == '__main__': - main() + # args and usage + parser = argparse.ArgumentParser() + parser.add_argument("location", action="store", type=str, default=None, help="location(s) (ex: all; US; TX; CA,LA,WY)") + parser.add_argument("term", action="store", type=str, default=None, help='term/query/topic (ex: all; /m/0cycc; "flu fever")') + parser.add_argument("--first", "-f", default=None, type=int, help="first epiweek override") + parser.add_argument("--last", "-l", default=None, type=int, help="last epiweek override") + parser.add_argument("--country", "-c", default="US", type=str, help="location country (ex: US; BR)") + args = parser.parse_args() + + # sanity check + first, last = args.first, args.last + if first is not None: + flu.check_epiweek(first) + if last is not None: + flu.check_epiweek(last) + if first is not None and last is not None and first > last: + raise Exception("epiweeks in the wrong order") + + # decide what to update + if args.location.lower() == "all": + locations = LOCATIONS + elif args.location.lower() == "none": + locations = [NO_LOCATION_STR] + else: + locations = args.location.upper().split(",") + if args.term.lower() == "all": + terms = TERMS + else: + terms = [args.term] + + # country argument + # Check that country follows ISO 1366 Alpha-2 code. + # See https://www.iso.org/obp/ui/#search. 
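# Clarifying aside (not part of this diff): the two-letter check below matches
# what are presumably ISO 3166-1 alpha-2 country codes (the page linked above is
# the ISO 3166 online browsing platform). For example, "--country US,BR" parses
# to countries == ["US", "BR"], and each entry passes the len(x) == 2 test.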
+ countries = args.country.upper().split(",") + if not all(map(lambda x: len(x) == 2, countries)): + raise Exception("country name must be two letters (ISO 1366 Alpha-2)") + + # if length of locations and countries is > 1, need to be the same + if len(locations) > 1 and len(countries) > 1 and len(locations) != len(countries): + raise Exception("locations and countries must be length 1, or same length") + + # run the update + update(locations, terms, first, last, countries) + + +if __name__ == "__main__": + main() diff --git a/src/acquisition/ght/google_health_trends.py b/src/acquisition/ght/google_health_trends.py index 66a11c227..29d2a5f8e 100644 --- a/src/acquisition/ght/google_health_trends.py +++ b/src/acquisition/ght/google_health_trends.py @@ -1,4 +1,4 @@ -''' +""" =============== === Purpose === =============== @@ -18,7 +18,7 @@ + sample command line usage + extract array of values from returned data * separated GHT class from ght_update.py -''' +""" # standard library import argparse @@ -31,109 +31,110 @@ from delphi.utils.epidate import EpiDate import delphi.utils.epiweek as flu -NO_LOCATION_STR = 'none' +NO_LOCATION_STR = "none" + class GHT: - # Google Trends API endpoint - DISCOVERY_URL = 'https://www.googleapis.com/discovery/v1/apis/trends/v1beta/rest' - - def __init__(self, key, delay=1): - self.service = build('trends', 'v1beta', developerKey=key, discoveryServiceUrl=GHT.DISCOVERY_URL) - self.delay = delay - - # converts a YYYYWW week into a YYYY-MM-DD date (using Sunday of the week) - @staticmethod - def _ew2date(ew): - # parse the epiweek - year, week = flu.split_epiweek(ew) - # get the date object (middle of the week; Wednesday) - date = EpiDate.from_epiweek(year, week) - # go to the first day of the week (Sunday) - date = date.add_days(-3) - # date as string - return str(date) - - # get data from Google APIs - # see: https://developers.google.com/apis-explorer/#p/trends/v1beta/trends.getTimelinesForHealth - def get_data(self, start_week, end_week, location, term, resolution='week', country='US'): - start_date = GHT._ew2date(start_week) - end_date = GHT._ew2date(end_week) - num_weeks = flu.delta_epiweeks(start_week, end_week) + 1 - - # getTimelinesForHealth parameters - params = { - 'terms': term, - 'time_startDate': start_date, - 'time_endDate': end_date, - 'timelineResolution': resolution, - } - # We have a special check for the US for backwards compatibility. - # i.e. if the country is 'US' AND the location is 'US', just put the geo-restriction for country. - # In contrast, another country might have a sub-region with initials 'US' and we want the region restriction instead. 
- if country == 'US': - if location == 'US' or location == NO_LOCATION_STR: - params['geoRestriction_country'] = 'US' - else: - params['geoRestriction_region'] = 'US-' + location - else: - if location == NO_LOCATION_STR: - params['geoRestriction_country'] = country - else: - params['geoRestriction_region'] = country + '-' + location - - # make the API call - data = self.service.getTimelinesForHealth(**params).execute() - - # extract the values - try: - values = [p['value'] for p in data['lines'][0]['points']] - except: - values = None - - # throttle request rate - time.sleep(self.delay) - - # return the results - return { - 'start_week': start_week, - 'end_week': end_week, - 'num_weeks': num_weeks, - 'location': location, - 'country' : country, - 'term': term, - 'resolution': resolution, - 'data': data, - 'values': values, - } + # Google Trends API endpoint + DISCOVERY_URL = "https://www.googleapis.com/discovery/v1/apis/trends/v1beta/rest" + + def __init__(self, key, delay=1): + self.service = build("trends", "v1beta", developerKey=key, discoveryServiceUrl=GHT.DISCOVERY_URL) + self.delay = delay + + # converts a YYYYWW week into a YYYY-MM-DD date (using Sunday of the week) + @staticmethod + def _ew2date(ew): + # parse the epiweek + year, week = flu.split_epiweek(ew) + # get the date object (middle of the week; Wednesday) + date = EpiDate.from_epiweek(year, week) + # go to the first day of the week (Sunday) + date = date.add_days(-3) + # date as string + return str(date) + + # get data from Google APIs + # see: https://developers.google.com/apis-explorer/#p/trends/v1beta/trends.getTimelinesForHealth + def get_data(self, start_week, end_week, location, term, resolution="week", country="US"): + start_date = GHT._ew2date(start_week) + end_date = GHT._ew2date(end_week) + num_weeks = flu.delta_epiweeks(start_week, end_week) + 1 + + # getTimelinesForHealth parameters + params = { + "terms": term, + "time_startDate": start_date, + "time_endDate": end_date, + "timelineResolution": resolution, + } + # We have a special check for the US for backwards compatibility. + # i.e. if the country is 'US' AND the location is 'US', just put the geo-restriction for country. + # In contrast, another country might have a sub-region with initials 'US' and we want the region restriction instead. 
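# Worked examples (illustrative, not part of this diff) of how the branch below
# resolves the geo restriction, given NO_LOCATION_STR == "none":
#   country="US", location="US"   -> geoRestriction_country = "US"
#   country="US", location="CA"   -> geoRestriction_region  = "US-CA"
#   country="BR", location="none" -> geoRestriction_country = "BR"
#   country="BR", location="SP"   -> geoRestriction_region  = "BR-SP"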
+ if country == "US": + if location == "US" or location == NO_LOCATION_STR: + params["geoRestriction_country"] = "US" + else: + params["geoRestriction_region"] = "US-" + location + else: + if location == NO_LOCATION_STR: + params["geoRestriction_country"] = country + else: + params["geoRestriction_region"] = country + "-" + location + + # make the API call + data = self.service.getTimelinesForHealth(**params).execute() + + # extract the values + try: + values = [p["value"] for p in data["lines"][0]["points"]] + except: + values = None + + # throttle request rate + time.sleep(self.delay) + + # return the results + return { + "start_week": start_week, + "end_week": end_week, + "num_weeks": num_weeks, + "location": location, + "country": country, + "term": term, + "resolution": resolution, + "data": data, + "values": values, + } def main(): - # args and usage - parser = argparse.ArgumentParser() - parser.add_argument('apikey', action='store', type=str, default=None, help='API key') - parser.add_argument('startweek', action='store', type=int, default=None, help='first week (ex: 201440)') - parser.add_argument('endweek', action='store', type=int, default=None, help='last week (ex: 201520)') - parser.add_argument('location', action='store', type=str, default=None, help='location (ex: US)') - parser.add_argument('term', action='store', type=str, default=None, help='term/query/topic (ex: /m/0cycc)') - args = parser.parse_args() - - # get the data - ght = GHT(args.apikey) - result = ght.get_data(args.startweek, args.endweek, args.location, args.term) - values = result['values'] - - # sanity check - expected_weeks = result['num_weeks'] - received_weeks = len([v for v in values if v is not None and type(v) == float and v >= 0]) - if expected_weeks != received_weeks: - raise Exception('expected %d weeks, received %d' % (expected_weeks, received_weeks)) - - # results - epiweeks = [ew for ew in flu.range_epiweeks(args.startweek, args.endweek, inclusive=True)] - for (epiweek, value) in zip(epiweeks, values): - print('%6d: %.3f' % (epiweek, value)) - - -if __name__ == '__main__': - main() + # args and usage + parser = argparse.ArgumentParser() + parser.add_argument("apikey", action="store", type=str, default=None, help="API key") + parser.add_argument("startweek", action="store", type=int, default=None, help="first week (ex: 201440)") + parser.add_argument("endweek", action="store", type=int, default=None, help="last week (ex: 201520)") + parser.add_argument("location", action="store", type=str, default=None, help="location (ex: US)") + parser.add_argument("term", action="store", type=str, default=None, help="term/query/topic (ex: /m/0cycc)") + args = parser.parse_args() + + # get the data + ght = GHT(args.apikey) + result = ght.get_data(args.startweek, args.endweek, args.location, args.term) + values = result["values"] + + # sanity check + expected_weeks = result["num_weeks"] + received_weeks = len([v for v in values if v is not None and type(v) == float and v >= 0]) + if expected_weeks != received_weeks: + raise Exception("expected %d weeks, received %d" % (expected_weeks, received_weeks)) + + # results + epiweeks = [ew for ew in flu.range_epiweeks(args.startweek, args.endweek, inclusive=True)] + for (epiweek, value) in zip(epiweeks, values): + print("%6d: %.3f" % (epiweek, value)) + + +if __name__ == "__main__": + main() diff --git a/src/acquisition/kcdc/kcdc_update.py b/src/acquisition/kcdc/kcdc_update.py index 70c167738..2b01bbcf9 100644 --- a/src/acquisition/kcdc/kcdc_update.py +++ 
b/src/acquisition/kcdc/kcdc_update.py @@ -42,12 +42,14 @@ from delphi.utils.epiweek import delta_epiweeks, range_epiweeks, add_epiweeks from delphi.utils.epidate import EpiDate + def ensure_tables_exist(): - (u,p) = secrets.db.epi - cnx = mysql.connector.connect(user=u,password=p,database='epidata') + (u, p) = secrets.db.epi + cnx = mysql.connector.connect(user=u, password=p, database="epidata") try: cursor = cnx.cursor() - cursor.execute(''' + cursor.execute( + """ CREATE TABLE IF NOT EXISTS `kcdc_ili` ( `id` INT(11) NOT NULL PRIMARY KEY AUTO_INCREMENT, `release_date` DATE NOT NULL, @@ -58,69 +60,71 @@ def ensure_tables_exist(): `ili` DOUBLE NOT NULL, UNIQUE KEY (`issue`, `epiweek`, `region`) ); - '''); + """ + ) cnx.commit() finally: cnx.close() + def safe_float(f): try: - return float(f.replace(',','')) + return float(f.replace(",", "")) except: return 0 + def safe_int(i): try: - return int(i.replace(',','')) + return int(i.replace(",", "")) except: return 0 -def get_rows(cnx, table='kcdc_ili'): - # Count and return the number of rows in the `kcdc_ili` table. - select = cnx.cursor() - select.execute('SELECT count(1) num FROM %s' % table) - for (num,) in select: - pass - select.close() - return num + +def get_rows(cnx, table="kcdc_ili"): + # Count and return the number of rows in the `kcdc_ili` table. + select = cnx.cursor() + select.execute("SELECT count(1) num FROM %s" % table) + for (num,) in select: + pass + select.close() + return num + def get_kcdc_data(): issue = EpiDate.today().get_ew() - last_season = issue//100 + (1 if issue % 100 > 35 else 0) - url = 'http://www.cdc.go.kr/npt/biz/npp/iss/influenzaListAjax.do' - params = { - 'icdNm': 'influenza', - 'startYear': '2004', # Started in 2004 - 'endYear': str(last_season) - } + last_season = issue // 100 + (1 if issue % 100 > 35 else 0) + url = "https://www.cdc.go.kr/npt/biz/npp/iss/influenzaListAjax.do" + params = {"icdNm": "influenza", "startYear": "2004", "endYear": str(last_season)} # Started in 2004 response = requests.post(url, params) datas = response.json() - data = datas['data'] + data = datas["data"] ews = [] ilis = [] ew1 = 200436 - for year in range(2004,last_season): - year_data = data[year-2004] + for year in range(2004, last_season): + year_data = data[year - 2004] if year > 2004: ew1 = ews[-1] + 1 - ili_yr = year_data["VALUE"].split('`') - ili_yr = [float(f) for f in ili_yr if f != ''] - ew2 = add_epiweeks(ew1,len(ili_yr)) - new_ews = list(range_epiweeks(ew1,ew2)) + ili_yr = year_data["VALUE"].split("`") + ili_yr = [float(f) for f in ili_yr if f != ""] + ew2 = add_epiweeks(ew1, len(ili_yr)) + new_ews = list(range_epiweeks(ew1, ew2)) for i in range(len(new_ews)): j = float(ili_yr[i]) ilis.append(j) ews.append(new_ews[i]) return ews, ilis + def update_from_data(ews, ilis, date, issue, test_mode=False): u, p = secrets.db.epi - cnx = mysql.connector.connect(user=u, password=p, database='epidata') + cnx = mysql.connector.connect(user=u, password=p, database="epidata") rows1 = get_rows(cnx) - print('rows before: %d' % (rows1)) + print("rows before: %d" % (rows1)) insert = cnx.cursor() - sql = ''' + sql = """ INSERT INTO `kcdc_ili` (`release_date`, `issue`, `epiweek`, `region`, `lag`, `ili`) @@ -129,15 +133,15 @@ def update_from_data(ews, ilis, date, issue, test_mode=False): ON DUPLICATE KEY UPDATE `release_date` = least(`release_date`, '%s'), `ili` = %s - ''' + """ for i in range(len(ews)): ew = ews[i] ili = ilis[i] lag = delta_epiweeks(ews[i], issue) - insert_args = [date,issue,ew,'ROK',lag,ili] - update_args = [date,ili] 
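The kcdc_ili upsert above builds its statement by %-interpolating the values straight into the SQL string (insert.execute(sql % tuple(insert_args + update_args))). As an aside, mysql.connector can bind the same values as parameters; the sketch below shows that alternative under the kcdc_ili schema shown above. The helper name is hypothetical and this is not part of the change in this diff, which only reformats the existing code.

# Sketch only: the same upsert with bound parameters instead of string interpolation.
import mysql.connector

def upsert_ili(cnx, date, issue, ew, lag, ili):
    # `upsert_ili` is a hypothetical helper, not part of this module
    sql = """
        INSERT INTO `kcdc_ili`
            (`release_date`, `issue`, `epiweek`, `region`, `lag`, `ili`)
        VALUES
            (%s, %s, %s, %s, %s, %s)
        ON DUPLICATE KEY UPDATE
            `release_date` = LEAST(`release_date`, %s),
            `ili` = %s
    """
    cur = cnx.cursor()
    # values are passed separately, so quoting/escaping is handled by the driver
    cur.execute(sql, (date, issue, ew, "ROK", lag, ili, date, ili))
    cur.close()

# usage (credentials assumed):
# cnx = mysql.connector.connect(user=u, password=p, database="epidata")
# upsert_ili(cnx, "2020-01-01", 202001, 201952, 1, 0.5)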
+ insert_args = [date, issue, ew, "ROK", lag, ili] + update_args = [date, ili] try: insert.execute(sql % tuple(insert_args + update_args)) except Exception: @@ -146,34 +150,31 @@ def update_from_data(ews, ilis, date, issue, test_mode=False): # cleanup insert.close() if test_mode: - print('test mode, not committing') + print("test mode, not committing") rows2 = rows1 else: cnx.commit() rows2 = get_rows(cnx) - print('rows after: %d (added %d)' % (rows2,rows2-rows1)) + print("rows after: %d (added %d)" % (rows2, rows2 - rows1)) cnx.close() + def main(): # args and usage parser = argparse.ArgumentParser() - parser.add_argument( - '--test', - action='store_true', - help='do dry run only, do not update the database' - ) + parser.add_argument("--test", action="store_true", help="do dry run only, do not update the database") args = parser.parse_args() - date = datetime.datetime.now().strftime('%Y-%m-%d') - print('assuming release date is today, %s' % date) + date = datetime.datetime.now().strftime("%Y-%m-%d") + print("assuming release date is today, %s" % date) issue = EpiDate.today().get_ew() ensure_tables_exist() - ews,ilis = get_kcdc_data() + ews, ilis = get_kcdc_data() update_from_data(ews, ilis, date, issue, test_mode=args.test) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/src/acquisition/nidss/taiwan_nidss.py b/src/acquisition/nidss/taiwan_nidss.py index 27da863e1..d55ddf7e5 100644 --- a/src/acquisition/nidss/taiwan_nidss.py +++ b/src/acquisition/nidss/taiwan_nidss.py @@ -4,7 +4,7 @@ =============== Scrapes weekly flu data from Taiwan's National Infectious Disease Statistics -System (NIDSS): http://nidss.cdc.gov.tw/en/ +System (NIDSS): https://nidss.cdc.gov.tw/en/ ================= @@ -37,233 +37,230 @@ class NIDSS: - """An API for scraping the NIDSS site.""" + """An API for scraping the NIDSS site.""" - # The page where the flu data is kept - FLU_URL = 'https://nidss.cdc.gov.tw/en/CDCWNH01.aspx?dc=wnh' + # The page where the flu data is kept + FLU_URL = "https://nidss.cdc.gov.tw/en/CDCWNH01.aspx?dc=wnh" - # Link to the dengue data - DENGUE_URL = 'http://nidss.cdc.gov.tw/Download/Weekly_Age_County_Gender_061.csv' + # Link to the dengue data + DENGUE_URL = "https://nidss.cdc.gov.tw/Download/Weekly_Age_County_Gender_061.csv" - # Translate location names to English - # https://en.wikipedia.org/wiki/List_of_administrative_divisions_of_Taiwan - _TRANSLATED = { - b'5Y2X5oqV57ij': 'Nantou_County', - b'5Y+w5Lit5biC': 'Taichung_City', - b'5Y+w5YyX5biC': 'Taipei_City', - b'5Y+w5Y2X5biC': 'Tainan_City', - b'5Y+w5p2x57ij': 'Taitung_County', - b'5ZiJ576p5biC': 'Chiayi_City', - b'5ZiJ576p57ij': 'Chiayi_County', - b'5Z+66ZqG5biC': 'Keelung_City', - b'5a6c6Jit57ij': 'Yilan_County', - b'5bGP5p2x57ij': 'Pingtung_County', - b'5b2w5YyW57ij': 'Changhua_County', - b'5paw5YyX5biC': 'New_Taipei_City', - b'5paw56u55biC': 'Hsinchu_City', - b'5paw56u557ij': 'Hsinchu_County', - b'5qGD5ZyS5biC': 'Taoyuan_City', - b'5r6O5rmW57ij': 'Penghu_County', - b'6Iqx6JOu57ij': 'Hualien_County', - b'6IuX5qCX57ij': 'Miaoli_County', - b'6YeR6ZaA57ij': 'Kinmen_County', - b'6Zuy5p6X57ij': 'Yunlin_County', - b'6auY6ZuE5biC': 'Kaohsiung_City', - b'6YCj5rGf57ij': 'Lienchiang_County', - } + # Translate location names to English + # https://en.wikipedia.org/wiki/List_of_administrative_divisions_of_Taiwan + _TRANSLATED = { + b"5Y2X5oqV57ij": "Nantou_County", + b"5Y+w5Lit5biC": "Taichung_City", + b"5Y+w5YyX5biC": "Taipei_City", + b"5Y+w5Y2X5biC": "Tainan_City", + b"5Y+w5p2x57ij": "Taitung_County", + 
b"5ZiJ576p5biC": "Chiayi_City", + b"5ZiJ576p57ij": "Chiayi_County", + b"5Z+66ZqG5biC": "Keelung_City", + b"5a6c6Jit57ij": "Yilan_County", + b"5bGP5p2x57ij": "Pingtung_County", + b"5b2w5YyW57ij": "Changhua_County", + b"5paw5YyX5biC": "New_Taipei_City", + b"5paw56u55biC": "Hsinchu_City", + b"5paw56u557ij": "Hsinchu_County", + b"5qGD5ZyS5biC": "Taoyuan_City", + b"5r6O5rmW57ij": "Penghu_County", + b"6Iqx6JOu57ij": "Hualien_County", + b"6IuX5qCX57ij": "Miaoli_County", + b"6YeR6ZaA57ij": "Kinmen_County", + b"6Zuy5p6X57ij": "Yunlin_County", + b"6auY6ZuE5biC": "Kaohsiung_City", + b"6YCj5rGf57ij": "Lienchiang_County", + } - # Map locations to regions - # https://en.wikipedia.org/wiki/List_of_administrative_divisions_of_Taiwan - # https://en.wikipedia.org/wiki/Regions_of_Taiwan#Hexchotomy - LOCATION_TO_REGION = { - # Taipei - 'Taipei_City': 'Taipei', - 'Keelung_City': 'Taipei', - 'New_Taipei_City': 'Taipei', - 'Yilan_County': 'Taipei', - 'Kinmen_County': 'Taipei', - 'Lienchiang_County': 'Taipei', - # Northern - 'Hsinchu_City': 'Northern', - 'Taoyuan_City': 'Northern', - 'Hsinchu_County': 'Northern', - 'Miaoli_County': 'Northern', - # Central - 'Taichung_City': 'Central', - 'Changhua_County': 'Central', - 'Nantou_County': 'Central', - # Southern - 'Tainan_City': 'Southern', - 'Chiayi_City': 'Southern', - 'Yunlin_County': 'Southern', - 'Chiayi_County': 'Southern', - # Kaoping - 'Kaohsiung_City': 'Kaoping', - 'Pingtung_County': 'Kaoping', - 'Penghu_County': 'Kaoping', - # Eastern - 'Hualien_County': 'Eastern', - 'Taitung_County': 'Eastern', - } + # Map locations to regions + # https://en.wikipedia.org/wiki/List_of_administrative_divisions_of_Taiwan + # https://en.wikipedia.org/wiki/Regions_of_Taiwan#Hexchotomy + LOCATION_TO_REGION = { + # Taipei + "Taipei_City": "Taipei", + "Keelung_City": "Taipei", + "New_Taipei_City": "Taipei", + "Yilan_County": "Taipei", + "Kinmen_County": "Taipei", + "Lienchiang_County": "Taipei", + # Northern + "Hsinchu_City": "Northern", + "Taoyuan_City": "Northern", + "Hsinchu_County": "Northern", + "Miaoli_County": "Northern", + # Central + "Taichung_City": "Central", + "Changhua_County": "Central", + "Nantou_County": "Central", + # Southern + "Tainan_City": "Southern", + "Chiayi_City": "Southern", + "Yunlin_County": "Southern", + "Chiayi_County": "Southern", + # Kaoping + "Kaohsiung_City": "Kaoping", + "Pingtung_County": "Kaoping", + "Penghu_County": "Kaoping", + # Eastern + "Hualien_County": "Eastern", + "Taitung_County": "Eastern", + } - @staticmethod - def _get_metadata(html): - issue_pattern = re.compile('^.*Latest available data: Week (\\d+), (\\d{4})\\..*$') - release_pattern = re.compile('^.*Data as of \\d+:\\d+:\\d+, (\\d{4})/(\\d{2})/(\\d{2})\\..*$') - issue, release = None, None - for line in html.split('\n'): - match = issue_pattern.match(line) - if match is not None: - year, week = int(match.group(2)), int(match.group(1)) - issue = year * 100 + week - match = release_pattern.match(line) - if match is not None: - year, month, day = int(match.group(1)), int(match.group(2)), int(match.group(3)) - release = '%04d-%02d-%02d' % (year, month, day) - if issue is None or release is None: - raise Exception('metadata not found') - return issue, release + @staticmethod + def _get_metadata(html): + issue_pattern = re.compile("^.*Latest available data: Week (\\d+), (\\d{4})\\..*$") + release_pattern = re.compile("^.*Data as of \\d+:\\d+:\\d+, (\\d{4})/(\\d{2})/(\\d{2})\\..*$") + issue, release = None, None + for line in html.split("\n"): + match = issue_pattern.match(line) + if 
match is not None: + year, week = int(match.group(2)), int(match.group(1)) + issue = year * 100 + week + match = release_pattern.match(line) + if match is not None: + year, month, day = int(match.group(1)), int(match.group(2)), int(match.group(3)) + release = "%04d-%02d-%02d" % (year, month, day) + if issue is None or release is None: + raise Exception("metadata not found") + return issue, release - @staticmethod - def _get_flu_data(html): - week_pattern = re.compile('^categories: \\[(.*)\\],$') - value_pattern = re.compile('^series: \\[(.*)\\],$') - data = {} - parsing_ili = True - for line in html.split('\n'): - line = line.strip() - match = week_pattern.match(line) - if match is not None: - weeks = [int(x[1:-1]) for x in match.group(1).split(',')] - for week in weeks: - check_epiweek(week) - if week not in data: - data[week] = {} - match = value_pattern.match(line) - if match is not None: - for item in match.group(1).split('},{'): - parts = item.replace('{', '').replace('}', '').strip().split(' ') - location = parts[1][1:-2] - def num(value): - if parsing_ili: - return float(value) - else: - if '.' in value: - raise Exception('expected type int for visits') - return int(value) - values = [num(x) for x in parts[3][1:-1].split(',')] - unit = 'ili' if parsing_ili else 'visits' - if len(weeks) != len(values): - raise Exception('len(weeks) != len(values)') - for week, value in zip(weeks, values): - if location not in data[week]: - data[week][location] = {} - data[week][location][unit] = value - parsing_ili = False - if len(data) == 0: - raise Exception('no data') - return data + @staticmethod + def _get_flu_data(html): + week_pattern = re.compile("^categories: \\[(.*)\\],$") + value_pattern = re.compile("^series: \\[(.*)\\],$") + data = {} + parsing_ili = True + for line in html.split("\n"): + line = line.strip() + match = week_pattern.match(line) + if match is not None: + weeks = [int(x[1:-1]) for x in match.group(1).split(",")] + for week in weeks: + check_epiweek(week) + if week not in data: + data[week] = {} + match = value_pattern.match(line) + if match is not None: + for item in match.group(1).split("},{"): + parts = item.replace("{", "").replace("}", "").strip().split(" ") + location = parts[1][1:-2] + + def num(value): + if parsing_ili: + return float(value) + else: + if "." 
in value: + raise Exception("expected type int for visits") + return int(value) - @staticmethod - def get_flu_data(): - # Fetch the flu page - response = requests.get(NIDSS.FLU_URL) - if response.status_code != 200: - raise Exception('request failed [%d]' % response.status_code) - html = response.text - # Parse metadata - latest_week, release_date = NIDSS._get_metadata(html) - # Parse flu data - data = NIDSS._get_flu_data(html) - # Return results indexed by week and location - return latest_week, release_date, data + values = [num(x) for x in parts[3][1:-1].split(",")] + unit = "ili" if parsing_ili else "visits" + if len(weeks) != len(values): + raise Exception("len(weeks) != len(values)") + for week, value in zip(weeks, values): + if location not in data[week]: + data[week][location] = {} + data[week][location][unit] = value + parsing_ili = False + if len(data) == 0: + raise Exception("no data") + return data - @staticmethod - def get_dengue_data(first_week, last_week): - # Check week order - if first_week > last_week: - first_week, last_week = last_week, first_week - # Bounds check - if first_week < 200301 or last_week < 200301: - raise Exception('week out of range') - # Initialize data by week and location (zeroes are not reported) - data = {} - for week in range_epiweeks(first_week, add_epiweeks(last_week, 1)): - data[week] = {} - for location in NIDSS.LOCATION_TO_REGION.keys(): - data[week][location] = 0 - # Download CSV - response = requests.get(NIDSS.DENGUE_URL) - if response.status_code != 200: - raise Exception('export Dengue failed [%d]' % response.status_code) - csv = response.content.decode('big5-tw') - # Parse the data - lines = [l.strip() for l in csv.split('\n')[1:] if l.strip() != ''] - for line in lines: - fields = line.split(',') - location_b64 = base64.b64encode(fields[3].encode('utf-8')) - location = NIDSS._TRANSLATED[location_b64] - # Fields currently unused: - # region = NIDSS.LOCATION_TO_REGION[location] - # imported_b64 = base64.b64encode(fields[6].encode('utf-8')) - # imported = imported_b64 == b'5piv' - # sex = fields[5] - # age = fields[7] - count = int(fields[8]) - year = int(fields[1]) - week = int(fields[2]) - # Week 53 was reported each year in 2003-2007 - if year < 2008 and year != 2003 and week > 52: - week = 52 - # Epiweek system change in 2009 - # See also: http://research.undefinedx.com/forum/index.php?topic=300.0 - if year == 2009: - week -= 1 - if week == 0: - year, week = 2008, 53 - epiweek = year * 100 + week - if epiweek < first_week or epiweek > last_week: - # Outside of the requested range - continue - if epiweek not in data or location not in data[epiweek]: - # Not a vaild U.S. 
epiweek - raise Exception('data missing %d-%s' % (epiweek, location)) - # Add the counts to the location on this epiweek - data[epiweek][location] += count - # Return results indexed by week and location - return data + @staticmethod + def get_flu_data(): + # Fetch the flu page + response = requests.get(NIDSS.FLU_URL) + if response.status_code != 200: + raise Exception("request failed [%d]" % response.status_code) + html = response.text + # Parse metadata + latest_week, release_date = NIDSS._get_metadata(html) + # Parse flu data + data = NIDSS._get_flu_data(html) + # Return results indexed by week and location + return latest_week, release_date, data + + @staticmethod + def get_dengue_data(first_week, last_week): + # Check week order + if first_week > last_week: + first_week, last_week = last_week, first_week + # Bounds check + if first_week < 200301 or last_week < 200301: + raise Exception("week out of range") + # Initialize data by week and location (zeroes are not reported) + data = {} + for week in range_epiweeks(first_week, add_epiweeks(last_week, 1)): + data[week] = {} + for location in NIDSS.LOCATION_TO_REGION.keys(): + data[week][location] = 0 + # Download CSV + response = requests.get(NIDSS.DENGUE_URL) + if response.status_code != 200: + raise Exception("export Dengue failed [%d]" % response.status_code) + csv = response.content.decode("big5-tw") + # Parse the data + lines = [l.strip() for l in csv.split("\n")[1:] if l.strip() != ""] + for line in lines: + fields = line.split(",") + location_b64 = base64.b64encode(fields[3].encode("utf-8")) + location = NIDSS._TRANSLATED[location_b64] + # Fields currently unused: + # region = NIDSS.LOCATION_TO_REGION[location] + # imported_b64 = base64.b64encode(fields[6].encode('utf-8')) + # imported = imported_b64 == b'5piv' + # sex = fields[5] + # age = fields[7] + count = int(fields[8]) + year = int(fields[1]) + week = int(fields[2]) + # Week 53 was reported each year in 2003-2007 + if year < 2008 and year != 2003 and week > 52: + week = 52 + # Epiweek system change in 2009 + # See also: https://research.undefinedx.com/forum/index.php?topic=300.0 + if year == 2009: + week -= 1 + if week == 0: + year, week = 2008, 53 + epiweek = year * 100 + week + if epiweek < first_week or epiweek > last_week: + # Outside of the requested range + continue + if epiweek not in data or location not in data[epiweek]: + # Not a vaild U.S. 
epiweek + raise Exception("data missing %d-%s" % (epiweek, location)) + # Add the counts to the location on this epiweek + data[epiweek][location] += count + # Return results indexed by week and location + return data def main(): - # Args and usage - parser = argparse.ArgumentParser() - parser.add_argument( - 'epiweek', - action='store', - type=int, - help='fetch data on this epiweek (ex: 201537)' - ) - args = parser.parse_args() - ew = args.epiweek + # Args and usage + parser = argparse.ArgumentParser() + parser.add_argument("epiweek", action="store", type=int, help="fetch data on this epiweek (ex: 201537)") + args = parser.parse_args() + ew = args.epiweek - # Get the data - latest_week, release_date, fdata = NIDSS.get_flu_data() - ddata = NIDSS.get_dengue_data(ew, ew) + # Get the data + latest_week, release_date, fdata = NIDSS.get_flu_data() + ddata = NIDSS.get_dengue_data(ew, ew) - # Print the results - print('*** Meta ***') - print('latest_week:', latest_week) - print('release_date:', release_date) - print('*** Flu ***') - for region in sorted(list(fdata[ew].keys())): - visits, ili = fdata[ew][region]['visits'], fdata[ew][region]['ili'] - print('region=%s | visits=%d | ili=%.3f' % (region, visits, ili)) - print('*** Dengue ***') - for location in sorted(list(ddata[ew].keys())): - region = NIDSS.LOCATION_TO_REGION[location] - count = ddata[ew][location] - print('location=%s | region=%s | count=%d' % (location, region, count)) + # Print the results + print("*** Meta ***") + print("latest_week:", latest_week) + print("release_date:", release_date) + print("*** Flu ***") + for region in sorted(list(fdata[ew].keys())): + visits, ili = fdata[ew][region]["visits"], fdata[ew][region]["ili"] + print("region=%s | visits=%d | ili=%.3f" % (region, visits, ili)) + print("*** Dengue ***") + for location in sorted(list(ddata[ew].keys())): + region = NIDSS.LOCATION_TO_REGION[location] + count = ddata[ew][location] + print("location=%s | region=%s | count=%d" % (location, region, count)) -if __name__ == '__main__': - main() +if __name__ == "__main__": + main() diff --git a/src/acquisition/nidss/taiwan_update.py b/src/acquisition/nidss/taiwan_update.py index 830a7738d..4ba8b1778 100644 --- a/src/acquisition/nidss/taiwan_update.py +++ b/src/acquisition/nidss/taiwan_update.py @@ -87,92 +87,86 @@ # Get a row count just to know how many new rows are inserted def get_rows(cnx): - select = cnx.cursor() - select.execute('SELECT count(1) num FROM nidss_flu') - for (num,) in select: - rows_flu = num - select.execute('SELECT count(1) num FROM nidss_dengue') - for (num,) in select: - rows_dengue = num - select.close() - return (rows_flu, rows_dengue) + select = cnx.cursor() + select.execute("SELECT count(1) num FROM nidss_flu") + for (num,) in select: + rows_flu = num + select.execute("SELECT count(1) num FROM nidss_dengue") + for (num,) in select: + rows_dengue = num + select.close() + return (rows_flu, rows_dengue) def update(test_mode=False): - # test mode - if test_mode: - print('test mode enabled: changes will not be saved') - - # Database connection - u, p = secrets.db.epi - cnx = mysql.connector.connect(user=u, password=p, database='epidata') - rows1 = get_rows(cnx) - print('rows before (flu): %d' % (rows1[0])) - print('rows before (dengue): %d' % (rows1[1])) - insert = cnx.cursor() - sql_flu = ''' - INSERT INTO - `nidss_flu` (`release_date`, `issue`, `epiweek`, `region`, `lag`, `visits`, `ili`) - VALUES - (%s, %s, %s, %s, %s, %s, %s) - ON DUPLICATE KEY UPDATE - `release_date` = least(`release_date`, %s), 
`visits` = %s, `ili` = %s - ''' - sql_dengue = ''' - INSERT INTO - `nidss_dengue` (`epiweek`, `location`, `region`, `count`) - VALUES - (%s, %s, %s, %s) - ON DUPLICATE KEY UPDATE - `count` = %s - ''' - - # Scrape flu data - current_week, release_date, data = NIDSS.get_flu_data() - for epiweek in sorted(list(data.keys())): - lag = delta_epiweeks(epiweek, current_week) - for region in data[epiweek].keys(): - visits, ili = data[epiweek][region]['visits'], data[epiweek][region]['ili'] - params1 = [release_date, current_week, epiweek, region, lag, visits, ili] - params2 = [release_date, visits, ili] - insert.execute(sql_flu, tuple(params1 + params2)) - - # Scrape dengue data from the past year - data = NIDSS.get_dengue_data(add_epiweeks(current_week, -51), current_week) - for epiweek in sorted(list(data.keys())): - for location in sorted(list(data[epiweek].keys())): - region = NIDSS.LOCATION_TO_REGION[location] - count = data[epiweek][location] - params = (epiweek, location, region, count, count) - insert.execute(sql_dengue, params) - - # Cleanup - insert.close() - rows2 = get_rows(cnx) - print('rows after (flu): %d (added %d)' % (rows2[0], rows2[0] - rows1[0])) - print('rows after (dengue): %d (added %d)' % (rows2[1], rows2[1] - rows1[1])) - if test_mode: - print('test mode: changes not commited') - else: - cnx.commit() - cnx.close() + # test mode + if test_mode: + print("test mode enabled: changes will not be saved") + + # Database connection + u, p = secrets.db.epi + cnx = mysql.connector.connect(user=u, password=p, database="epidata") + rows1 = get_rows(cnx) + print("rows before (flu): %d" % (rows1[0])) + print("rows before (dengue): %d" % (rows1[1])) + insert = cnx.cursor() + sql_flu = """ + INSERT INTO + `nidss_flu` (`release_date`, `issue`, `epiweek`, `region`, `lag`, `visits`, `ili`) + VALUES + (%s, %s, %s, %s, %s, %s, %s) + ON DUPLICATE KEY UPDATE + `release_date` = least(`release_date`, %s), `visits` = %s, `ili` = %s + """ + sql_dengue = """ + INSERT INTO + `nidss_dengue` (`epiweek`, `location`, `region`, `count`) + VALUES + (%s, %s, %s, %s) + ON DUPLICATE KEY UPDATE + `count` = %s + """ + + # Scrape flu data + current_week, release_date, data = NIDSS.get_flu_data() + for epiweek in sorted(list(data.keys())): + lag = delta_epiweeks(epiweek, current_week) + for region in data[epiweek].keys(): + visits, ili = data[epiweek][region]["visits"], data[epiweek][region]["ili"] + params1 = [release_date, current_week, epiweek, region, lag, visits, ili] + params2 = [release_date, visits, ili] + insert.execute(sql_flu, tuple(params1 + params2)) + + # Scrape dengue data from the past year + data = NIDSS.get_dengue_data(add_epiweeks(current_week, -51), current_week) + for epiweek in sorted(list(data.keys())): + for location in sorted(list(data[epiweek].keys())): + region = NIDSS.LOCATION_TO_REGION[location] + count = data[epiweek][location] + params = (epiweek, location, region, count, count) + insert.execute(sql_dengue, params) + + # Cleanup + insert.close() + rows2 = get_rows(cnx) + print("rows after (flu): %d (added %d)" % (rows2[0], rows2[0] - rows1[0])) + print("rows after (dengue): %d (added %d)" % (rows2[1], rows2[1] - rows1[1])) + if test_mode: + print("test mode: changes not commited") + else: + cnx.commit() + cnx.close() def main(): - # args and usage - parser = argparse.ArgumentParser() - parser.add_argument( - '--test', - '-t', - action='store_true', - default=False, - help='test mode, do not commit changes' - ) - args = parser.parse_args() - - # fetch and store NIDSS data - 
update(args.test) - - -if __name__ == '__main__': - main() + # args and usage + parser = argparse.ArgumentParser() + parser.add_argument("--test", "-t", action="store_true", default=False, help="test mode, do not commit changes") + args = parser.parse_args() + + # fetch and store NIDSS data + update(args.test) + + +if __name__ == "__main__": + main() diff --git a/src/acquisition/norostat/norostat_add_history.py b/src/acquisition/norostat/norostat_add_history.py index 64fd11ff7..05c29d69b 100644 --- a/src/acquisition/norostat/norostat_add_history.py +++ b/src/acquisition/norostat/norostat_add_history.py @@ -18,28 +18,31 @@ from . import norostat_raw - def main(): - norostat_sql.ensure_tables_exist() - snapshot_dir = os.path.expanduser("~/norostat_history/wayback/websites/www.cdc.gov/norovirus/reporting/norostat/data-table.html/") - snapshot_version_counter = collections.Counter() - for subdir in os.listdir(snapshot_dir): - if re.match(r'[0-9]+', subdir) is not None: - # appears to be snapshot dir - snapshot_version_counter[subdir] = 0 # register that loop found this snapshot directory - for norostat_capitalization in ["norostat","noroSTAT"]: - time.sleep(0.002) # ensure parse times are unique, assuming OS can accurately sleep and measure to ms precision - path = os.path.join(snapshot_dir,subdir,"norovirus","reporting",norostat_capitalization,"data-table.html") - if os.path.isfile(path): - print("Processing file ", path) - with open(path, 'r') as datatable_file: - content = datatable_file.read() - wide_raw = norostat_raw.parse_content_to_wide_raw(content) - long_raw = norostat_raw.melt_wide_raw_to_long_raw(wide_raw) - norostat_sql.record_long_raw(long_raw) - snapshot_version_counter[subdir] += 1 - print('Successfully uploaded the following snapshots, with the count indicating the number of data-table versions found inside each snapshot (expected to be 1, or maybe 2 if there was a change in capitalization; 0 indicates the NoroSTAT page was not found within a snapshot directory); just "Counter()" indicates no snapshot directories were found:', snapshot_version_counter) - norostat_sql.update_point() + norostat_sql.ensure_tables_exist() + snapshot_dir = os.path.expanduser("~/norostat_history/wayback/websites/www.cdc.gov/norovirus/reporting/norostat/data-table.html/") + snapshot_version_counter = collections.Counter() + for subdir in os.listdir(snapshot_dir): + if re.match(r"[0-9]+", subdir) is not None: + # appears to be snapshot dir + snapshot_version_counter[subdir] = 0 # register that loop found this snapshot directory + for norostat_capitalization in ["norostat", "noroSTAT"]: + time.sleep(0.002) # ensure parse times are unique, assuming OS can accurately sleep and measure to ms precision + path = os.path.join(snapshot_dir, subdir, "norovirus", "reporting", norostat_capitalization, "data-table.html") + if os.path.isfile(path): + print("Processing file ", path) + with open(path, "r") as datatable_file: + content = datatable_file.read() + wide_raw = norostat_raw.parse_content_to_wide_raw(content) + long_raw = norostat_raw.melt_wide_raw_to_long_raw(wide_raw) + norostat_sql.record_long_raw(long_raw) + snapshot_version_counter[subdir] += 1 + print( + 'Successfully uploaded the following snapshots, with the count indicating the number of data-table versions found inside each snapshot (expected to be 1, or maybe 2 if there was a change in capitalization; 0 indicates the NoroSTAT page was not found within a snapshot directory); just "Counter()" indicates no snapshot directories were found:', + 
snapshot_version_counter, + ) + norostat_sql.update_point() + -if __name__ == '__main__': - main() +if __name__ == "__main__": + main() diff --git a/src/acquisition/norostat/norostat_raw.py b/src/acquisition/norostat/norostat_raw.py index 582de9684..db7f5ace9 100644 --- a/src/acquisition/norostat/norostat_raw.py +++ b/src/acquisition/norostat/norostat_raw.py @@ -8,7 +8,6 @@ """ - # standard library import datetime import re @@ -22,91 +21,88 @@ # first party from .norostat_utils import * + def fetch_content(norostat_datatable_url="https://www.cdc.gov/norovirus/reporting/norostat/data-table.html"): - """Download NoroSTAT data-table. Returns the html content.""" - headers = { - 'User-Agent': 'delphibot/1.0 (+https://delphi.cmu.edu/)', - } - resp = requests.get(norostat_datatable_url, headers=headers) - expect_value_eq(resp.status_code, 200, - 'Wanted status code {}. Received: ') - expect_value_eq(resp.headers.get("Content-Type"), "text/html", - 'Expected Content-Type "{}"; Received ') - return resp.content + """Download NoroSTAT data-table. Returns the html content.""" + headers = { + "User-Agent": "delphibot/1.0 (+https://delphi.cmu.edu/)", + } + resp = requests.get(norostat_datatable_url, headers=headers) + expect_value_eq(resp.status_code, 200, "Wanted status code {}. Received: ") + expect_value_eq(resp.headers.get("Content-Type"), "text/html", 'Expected Content-Type "{}"; Received ') + return resp.content + def save_sample_content(content, f="sample_content.pickle"): - """Save the content from fetch_content into a pickle file for most testing (don't download unnecessarily).""" - with open(f, "wb") as handle: - pickle.dump(content, handle) + """Save the content from fetch_content into a pickle file for most testing (don't download unnecessarily).""" + with open(f, "wb") as handle: + pickle.dump(content, handle) + def load_sample_content(f="sample_content.pickle"): - """Load data from a past call to fetch_content from a pickle file for most testing (don't download unnecessarily).""" - with open(f, "rb") as handle: - content = pickle.load(handle) - return content + """Load data from a past call to fetch_content from a pickle file for most testing (don't download unnecessarily).""" + with open(f, "rb") as handle: + content = pickle.load(handle) + return content + def parse_content_to_wide_raw(content): - """Convert the html content for the data-table into a wide data frame, then stick it in a tuple along with the release_date, parse_time, and (constant) location.""" - parse_time = datetime.datetime.now() - html_root = lxml.html.fromstring(content) - # Extract the release date, a.k.a. dateModified, a.k.a. 
"Page last updated" date; ~Dec 2018 this is only available in a meta tag; previously, it was available in a visible span - dateModified_meta_elts = html_root.xpath('//meta[@property="cdc:last_updated"]') - dateModified_span_elts = html_root.xpath('//span[@itemprop="dateModified"]') - if len(dateModified_meta_elts) == 1: - [dateModified_meta_elt] = dateModified_meta_elts - dateModified = dateModified_meta_elt.attrib['content'] - elif len(dateModified_span_elts) == 1: - [dateModified_span_elt] = dateModified_span_elts - dateModified = dateModified_span_elt.text - else: - raise Exception("Could not find the expected number of dateModified meta or span tags.") - # FIXME check/enforce locale - release_date = datetime.datetime.strptime(dateModified, "%B %d, %Y").date() - # Check that table description still specifies suspected&confirmed norovirus - # outbreaks (insensitive to case of certain letters and allowing for both old - # "to the" and new "through the" text), then extract list of states from the - # description: - [description_elt] = html_root.xpath('''//p[ + """Convert the html content for the data-table into a wide data frame, then stick it in a tuple along with the release_date, parse_time, and (constant) location.""" + parse_time = datetime.datetime.now() + html_root = lxml.html.fromstring(content) + # Extract the release date, a.k.a. dateModified, a.k.a. "Page last updated" date; ~Dec 2018 this is only available in a meta tag; previously, it was available in a visible span + dateModified_meta_elts = html_root.xpath('//meta[@property="cdc:last_updated"]') + dateModified_span_elts = html_root.xpath('//span[@itemprop="dateModified"]') + if len(dateModified_meta_elts) == 1: + [dateModified_meta_elt] = dateModified_meta_elts + dateModified = dateModified_meta_elt.attrib["content"] + elif len(dateModified_span_elts) == 1: + [dateModified_span_elt] = dateModified_span_elts + dateModified = dateModified_span_elt.text + else: + raise Exception("Could not find the expected number of dateModified meta or span tags.") + # FIXME check/enforce locale + release_date = datetime.datetime.strptime(dateModified, "%B %d, %Y").date() + # Check that table description still specifies suspected&confirmed norovirus + # outbreaks (insensitive to case of certain letters and allowing for both old + # "to the" and new "through the" text), then extract list of states from the + # description: + [description_elt] = html_root.xpath( + """//p[ contains(translate(text(), "SCNORHD", "scnorhd"), "suspected and confirmed norovirus outbreaks reported by state health departments in") and ( contains(text(), "to the") or contains(text(), "through the") ) - ]''') - location = re.match(".*?[Dd]epartments in (.*?) (?:to)|(?:through) the.*$", description_elt.text).group(1) - # Attempt to find exactly 1 table (note: it would be nice to filter on the - # associated caption, but no such caption is present in earlier versions): - [table] = html_root.xpath('//table') - # Convert html table to DataFrame: - # Directly reading in the table with pd.read_html performs unwanted dtype - # inference, but reveals the column names: - [wide_raw_df_with_unwanted_conversions] = pd.read_html(lxml.html.tostring(table)) - # We want all columns to be string columns. However, there does not appear - # to be an option to disable dtype inference in pd.read_html. 
Hide all - # entries inside 1-tuple wrappers using pre-dtype-inference converters, - # then unpack afterward (the entries fed to the converters should already - # be strings, but "convert" them to strings just in case): - [wide_raw_df_with_wrappers] = pd.read_html( - lxml.html.tostring(table), - converters= {col: lambda entry: (str(entry),) - for col in wide_raw_df_with_unwanted_conversions.columns} - ) - # Unwrap entries: - wide_raw_df = wide_raw_df_with_wrappers.applymap(lambda wrapper: wrapper[0]) - # Check format: - expect_value_eq(wide_raw_df.columns[0], "Week", - 'Expected raw_colnames[0] to be "{}"; encountered ') - for colname in wide_raw_df.columns: - expect_result_eq(dtype_kind, wide_raw_df[colname].head(), "O", - 'Expected (head of) "%s" column to have dtype kind "{}"; instead had dtype kind & head '%(colname)) - # Pack up df with metadata: - wide_raw = (wide_raw_df, release_date, parse_time, location) - return wide_raw + ]""" + ) + location = re.match(".*?[Dd]epartments in (.*?) (?:to)|(?:through) the.*$", description_elt.text).group(1) + # Attempt to find exactly 1 table (note: it would be nice to filter on the + # associated caption, but no such caption is present in earlier versions): + [table] = html_root.xpath("//table") + # Convert html table to DataFrame: + # Directly reading in the table with pd.read_html performs unwanted dtype + # inference, but reveals the column names: + [wide_raw_df_with_unwanted_conversions] = pd.read_html(lxml.html.tostring(table)) + # We want all columns to be string columns. However, there does not appear + # to be an option to disable dtype inference in pd.read_html. Hide all + # entries inside 1-tuple wrappers using pre-dtype-inference converters, + # then unpack afterward (the entries fed to the converters should already + # be strings, but "convert" them to strings just in case): + [wide_raw_df_with_wrappers] = pd.read_html(lxml.html.tostring(table), converters={col: lambda entry: (str(entry),) for col in wide_raw_df_with_unwanted_conversions.columns}) + # Unwrap entries: + wide_raw_df = wide_raw_df_with_wrappers.applymap(lambda wrapper: wrapper[0]) + # Check format: + expect_value_eq(wide_raw_df.columns[0], "Week", 'Expected raw_colnames[0] to be "{}"; encountered ') + for colname in wide_raw_df.columns: + expect_result_eq(dtype_kind, wide_raw_df[colname].head(), "O", 'Expected (head of) "%s" column to have dtype kind "{}"; instead had dtype kind & head ' % (colname)) + # Pack up df with metadata: + wide_raw = (wide_raw_df, release_date, parse_time, location) + return wide_raw + def melt_wide_raw_to_long_raw(wide_raw): - (wide_raw_df, release_date, parse_time, location) = wide_raw - long_raw_df = wide_raw_df \ - .melt(id_vars=["Week"], var_name="measurement_type", value_name="value") \ - .rename(index=str, columns={"Week": "week"}) - long_raw = (long_raw_df, release_date, parse_time, location) - return long_raw + (wide_raw_df, release_date, parse_time, location) = wide_raw + long_raw_df = wide_raw_df.melt(id_vars=["Week"], var_name="measurement_type", value_name="value").rename(index=str, columns={"Week": "week"}) + long_raw = (long_raw_df, release_date, parse_time, location) + return long_raw diff --git a/src/acquisition/norostat/norostat_sql.py b/src/acquisition/norostat/norostat_sql.py index 168e275eb..4e9e1ffba 100644 --- a/src/acquisition/norostat/norostat_sql.py +++ b/src/acquisition/norostat/norostat_sql.py @@ -61,363 +61,430 @@ # if there is no such version, this table will not be created or used; uses # interned string id's + def 
ensure_tables_exist(): - (u, p) = secrets.db.epi - cnx = mysql.connector.connect(user=u, password=p, database='epidata') - try: - cursor = cnx.cursor() - cursor.execute(''' - CREATE TABLE IF NOT EXISTS `norostat_raw_datatable_version_list` ( - `release_date` DATE NOT NULL, - `parse_time` DATETIME(6) NOT NULL, - PRIMARY KEY (`release_date`, `parse_time`) - ); - ''') - cursor.execute(''' - CREATE TABLE IF NOT EXISTS `norostat_raw_datatable_measurement_type_pool` ( - `measurement_type_id` INT NOT NULL PRIMARY KEY AUTO_INCREMENT, - `measurement_type` NVARCHAR(255) NOT NULL UNIQUE KEY - ); - ''') - cursor.execute(''' - CREATE TABLE IF NOT EXISTS `norostat_raw_datatable_location_pool` ( - `location_id` INT NOT NULL PRIMARY KEY AUTO_INCREMENT, - `location` NVARCHAR(255) NOT NULL UNIQUE KEY - ); - ''') - cursor.execute(''' - CREATE TABLE IF NOT EXISTS `norostat_raw_datatable_week_pool` ( - `week_id` INT NOT NULL PRIMARY KEY AUTO_INCREMENT, - `week` NVARCHAR(255) NOT NULL UNIQUE KEY - ); - ''') - cursor.execute(''' - CREATE TABLE IF NOT EXISTS `norostat_raw_datatable_diffs` ( - `release_date` DATE NOT NULL, - `parse_time` DATETIME(6) NOT NULL, - `measurement_type_id` INT NOT NULL, - `location_id` INT NOT NULL, - `week_id` INT NOT NULL, - `new_value` NVARCHAR(255), -- allow NULL, with meaning "removed" - FOREIGN KEY (`release_date`,`parse_time`) REFERENCES `norostat_raw_datatable_version_list` (`release_date`,`parse_time`), - FOREIGN KEY (`measurement_type_id`) REFERENCES `norostat_raw_datatable_measurement_type_pool` (`measurement_type_id`), - FOREIGN KEY (`location_id`) REFERENCES `norostat_raw_datatable_location_pool` (`location_id`), - FOREIGN KEY (`week_id`) REFERENCES `norostat_raw_datatable_week_pool` (`week_id`), - UNIQUE KEY (`measurement_type_id`, `location_id`, `week_id`, `release_date`, `parse_time`, `new_value`), - PRIMARY KEY (`release_date`, `parse_time`, `measurement_type_id`, `location_id`, `week_id`) - -- (the indices here are larger than the data, but reducing the key - -- sizes and adding an id somehow seems to result in larger index sizes - -- somehow) - ); - ''') - cnx.commit() - finally: - cnx.close() + (u, p) = secrets.db.epi + cnx = mysql.connector.connect(user=u, password=p, database="epidata") + try: + cursor = cnx.cursor() + cursor.execute( + """ + CREATE TABLE IF NOT EXISTS `norostat_raw_datatable_version_list` ( + `release_date` DATE NOT NULL, + `parse_time` DATETIME(6) NOT NULL, + PRIMARY KEY (`release_date`, `parse_time`) + ); + """ + ) + cursor.execute( + """ + CREATE TABLE IF NOT EXISTS `norostat_raw_datatable_measurement_type_pool` ( + `measurement_type_id` INT NOT NULL PRIMARY KEY AUTO_INCREMENT, + `measurement_type` NVARCHAR(255) NOT NULL UNIQUE KEY + ); + """ + ) + cursor.execute( + """ + CREATE TABLE IF NOT EXISTS `norostat_raw_datatable_location_pool` ( + `location_id` INT NOT NULL PRIMARY KEY AUTO_INCREMENT, + `location` NVARCHAR(255) NOT NULL UNIQUE KEY + ); + """ + ) + cursor.execute( + """ + CREATE TABLE IF NOT EXISTS `norostat_raw_datatable_week_pool` ( + `week_id` INT NOT NULL PRIMARY KEY AUTO_INCREMENT, + `week` NVARCHAR(255) NOT NULL UNIQUE KEY + ); + """ + ) + cursor.execute( + """ + CREATE TABLE IF NOT EXISTS `norostat_raw_datatable_diffs` ( + `release_date` DATE NOT NULL, + `parse_time` DATETIME(6) NOT NULL, + `measurement_type_id` INT NOT NULL, + `location_id` INT NOT NULL, + `week_id` INT NOT NULL, + `new_value` NVARCHAR(255), -- allow NULL, with meaning "removed" + FOREIGN KEY (`release_date`,`parse_time`) REFERENCES 
`norostat_raw_datatable_version_list` (`release_date`,`parse_time`), + FOREIGN KEY (`measurement_type_id`) REFERENCES `norostat_raw_datatable_measurement_type_pool` (`measurement_type_id`), + FOREIGN KEY (`location_id`) REFERENCES `norostat_raw_datatable_location_pool` (`location_id`), + FOREIGN KEY (`week_id`) REFERENCES `norostat_raw_datatable_week_pool` (`week_id`), + UNIQUE KEY (`measurement_type_id`, `location_id`, `week_id`, `release_date`, `parse_time`, `new_value`), + PRIMARY KEY (`release_date`, `parse_time`, `measurement_type_id`, `location_id`, `week_id`) + -- (the indices here are larger than the data, but reducing the key + -- sizes and adding an id somehow seems to result in larger index sizes + -- somehow) + ); + """ + ) + cnx.commit() + finally: + cnx.close() + def dangerously_drop_all_norostat_tables(): - (u, p) = secrets.db.epi - cnx = mysql.connector.connect(user=u, password=p, database='epidata') - try: - cursor = cnx.cursor() - # Drop tables in reverse order (to avoid foreign key related errors): - cursor.execute(''' - DROP TABLE IF EXISTS `norostat_point_diffs`, - `norostat_point_version_list`, - `norostat_raw_datatable_diffs`, - `norostat_raw_datatable_week_pool`, - `norostat_raw_datatable_location_pool`, - `norostat_raw_datatable_measurement_type_pool`, - `norostat_raw_datatable_version_list`; - ''') - cnx.commit() # (might do nothing; each DROP commits itself anyway) - finally: - cnx.close() + (u, p) = secrets.db.epi + cnx = mysql.connector.connect(user=u, password=p, database="epidata") + try: + cursor = cnx.cursor() + # Drop tables in reverse order (to avoid foreign key related errors): + cursor.execute( + """ + DROP TABLE IF EXISTS `norostat_point_diffs`, + `norostat_point_version_list`, + `norostat_raw_datatable_diffs`, + `norostat_raw_datatable_week_pool`, + `norostat_raw_datatable_location_pool`, + `norostat_raw_datatable_measurement_type_pool`, + `norostat_raw_datatable_version_list`; + """ + ) + cnx.commit() # (might do nothing; each DROP commits itself anyway) + finally: + cnx.close() + def record_long_raw(long_raw): - (long_raw_df, release_date, parse_time, location) = long_raw - (u, p) = secrets.db.epi - cnx = mysql.connector.connect(user=u, password=p, database='epidata') - try: - cursor = cnx.cursor() - cnx.start_transaction(isolation_level='SERIALIZABLE') - # Create, populate `norostat_raw_datatable_parsed`: - cursor.execute(''' - CREATE TEMPORARY TABLE `norostat_raw_datatable_parsed` ( - `measurement_type` NVARCHAR(255) NOT NULL, - `location` NVARCHAR(255) NOT NULL, - `week` NVARCHAR(255) NOT NULL, - `value` NVARCHAR(255) NOT NULL, -- forbid NULL; has special external meaning (see above) - PRIMARY KEY (`measurement_type`, `location`, `week`) - ) ENGINE=MEMORY; - ''') - cursor.executemany(''' - INSERT INTO `norostat_raw_datatable_parsed` (`week`,`measurement_type`,`value`,`location`) - VALUES (%s, %s, %s, %s); - ''', [(week, measurement_type, value, location) for - (week, measurement_type, value) in long_raw_df[["week","measurement_type","value"]].astype(str).itertuples(index=False, name=None) - ]) - # Create, populate `norostat_raw_datatable_previous`: - cursor.execute(''' - CREATE TEMPORARY TABLE `norostat_raw_datatable_previous` ( - `measurement_type_id` INT NOT NULL, - `location_id` INT NOT NULL, - `week_id` INT NOT NULL, - `value` NVARCHAR(255) NOT NULL, -- forbid NULL; has special external meaning (see above) - -- would like but not allowed: FOREIGN KEY (`measurement_type_id`) REFERENCES `norostat_raw_datatable_measurement_type_pool` 
(`measurement_type_id`), - -- would like but not allowed: FOREIGN KEY (`location_id`) REFERENCES `norostat_raw_datatable_location_pool` (`location_id`), - -- would like but not allowed: FOREIGN KEY (`week_id`) REFERENCES `norostat_raw_datatable_week_pool` (`week_id`), - PRIMARY KEY (`measurement_type_id`, `location_id`, `week_id`) - ) ENGINE=MEMORY; - ''') - cursor.execute(''' - INSERT INTO `norostat_raw_datatable_previous` (`measurement_type_id`, `location_id`, `week_id`, `value`) - SELECT `latest`.`measurement_type_id`, `latest`.`location_id`, `latest`.`week_id`, `latest`.`new_value` - FROM `norostat_raw_datatable_diffs` AS `latest` - -- Get the latest `new_value` by "group" (measurement_type, location, week) - -- using the fact that there are no later measurements belonging to the - -- same group (find NULL entries in `later`.{release_date,parse_time} - -- in the LEFT JOIN below); if the latest `new_value` is NULL, don't - -- include it in the result; it means that the corresponding cell/entry - -- has been removed from the data-table: - LEFT JOIN ( - SELECT * FROM `norostat_raw_datatable_diffs` - WHERE (`release_date`,`parse_time`) <= (%s,%s) - ) `later` - ON `latest`.`measurement_type_id` = `later`.`measurement_type_id` AND - `latest`.`location_id` = `later`.`location_id` AND - `latest`.`week_id` = `later`.`week_id` AND - (`latest`.`release_date`, `latest`.`parse_time`) < - (`later`.`release_date`, `later`.`parse_time`) - WHERE (`latest`.`release_date`, `latest`.`parse_time`) <= (%s, %s) AND - `later`.`parse_time` IS NULL AND - `latest`.`new_value` IS NOT NULL; - ''', (release_date, parse_time, release_date, parse_time)) - # Find next recorded `release_date`, `parse_time` if any; create, populate - # `norostat_raw_datatable_next` if there is such a version: - cursor.execute(''' - SELECT `release_date`, `parse_time` - FROM `norostat_raw_datatable_version_list` - WHERE (`release_date`, `parse_time`) > (%s,%s) - ORDER BY `release_date`, `parse_time` - LIMIT 1 - ''', (release_date, parse_time)) - next_version_if_any = cursor.fetchall() - expect_result_in(len, next_version_if_any, (0,1), - 'Bug: expected next-version query to return a number of results in {}; instead have len & val ') - if len(next_version_if_any) != 0: - cursor.execute(''' - CREATE TEMPORARY TABLE `norostat_raw_datatable_next` ( - `measurement_type_id` INT NOT NULL, - `location_id` INT NOT NULL, - `week_id` INT NOT NULL, - `value` NVARCHAR(255) NOT NULL, -- forbid NULL; has special external meaning (see above) - -- would like but not allowed: FOREIGN KEY (`measurement_type_id`) REFERENCES `norostat_raw_datatable_measurement_type_pool` (`measurement_type_id`), - -- would like but not allowed: FOREIGN KEY (`location_id`) REFERENCES `norostat_raw_datatable_location_pool` (`location_id`), - -- would like but not allowed: FOREIGN KEY (`week_id`) REFERENCES `norostat_raw_datatable_week_pool` (`week_id`), - PRIMARY KEY (`measurement_type_id`, `location_id`, `week_id`) - ) ENGINE=MEMORY; - ''') - cursor.execute(''' - INSERT INTO `norostat_raw_datatable_next` (`measurement_type_id`, `location_id`, `week_id`, `value`) - SELECT `latest`.`measurement_type_id`, `latest`.`location_id`, `latest`.`week_id`, `latest`.`new_value` - FROM `norostat_raw_datatable_diffs` AS `latest` - -- Get the latest `new_value` by "group" (measurement_type, location, week) - -- using the fact that there are no later measurements belonging to the - -- same group (find NULL entries in `later`.{release_date,parse_time} - -- in the LEFT JOIN below); if the 
latest `new_value` is NULL, don't - -- include it in the result; it means that the corresponding cell/entry - -- has been removed from the data-table: - LEFT JOIN ( - SELECT * FROM `norostat_raw_datatable_diffs` - WHERE (`release_date`,`parse_time`) <= (%s, %s) - ) `later` - ON `latest`.`measurement_type_id` = `later`.`measurement_type_id` AND - `latest`.`location_id` = `later`.`location_id` AND - `latest`.`week_id` = `later`.`week_id` AND - (`latest`.`release_date`, `latest`.`parse_time`) < - (`later`.`release_date`, `later`.`parse_time`) - WHERE (`latest`.`release_date`, `latest`.`parse_time`) <= (%s, %s) AND - `later`.`parse_time` IS NULL AND - `latest`.`new_value` IS NOT NULL -- NULL means value was removed - ''', next_version_if_any[0]+next_version_if_any[0]) - # Register new version in version list: + (long_raw_df, release_date, parse_time, location) = long_raw + (u, p) = secrets.db.epi + cnx = mysql.connector.connect(user=u, password=p, database="epidata") try: - cursor.execute(''' - INSERT INTO `norostat_raw_datatable_version_list` (`release_date`, `parse_time`) - VALUES (%s, %s) - ''', (release_date, parse_time)) - except mysql.connector.errors.IntegrityError as e: - raise Exception(['Encountered an IntegrityError when updating the norostat_raw_datatable_version_list table; this probably indicates that a version with the same `release_date` and `parse_time` was already added to the database; parse_time has limited resolution, so this can happen from populating the database too quickly when there are duplicate release dates; original error: ', e]) - # Add any new measurement_type, location, or week strings to the associated - # string pools: - cursor.execute(''' - INSERT INTO `norostat_raw_datatable_measurement_type_pool` (`measurement_type`) - SELECT DISTINCT `measurement_type` - FROM `norostat_raw_datatable_parsed` - WHERE `measurement_type` NOT IN ( - SELECT `norostat_raw_datatable_measurement_type_pool`.`measurement_type` - FROM `norostat_raw_datatable_measurement_type_pool` - ); - ''') - cursor.execute(''' - INSERT INTO `norostat_raw_datatable_location_pool` (`location`) - SELECT DISTINCT `location` - FROM `norostat_raw_datatable_parsed` - WHERE `location` NOT IN ( - SELECT `norostat_raw_datatable_location_pool`.`location` - FROM `norostat_raw_datatable_location_pool` - ); - ''') - cursor.execute(''' - INSERT INTO `norostat_raw_datatable_week_pool` (`week`) - SELECT DISTINCT `week` - FROM `norostat_raw_datatable_parsed` - WHERE `week` NOT IN ( - SELECT `norostat_raw_datatable_week_pool`.`week` - FROM `norostat_raw_datatable_week_pool` - ); - ''') - # Record diff: [newly parsed version "minus" previous version] (first, - # record additions/updates, then record deletions): - cursor.execute(''' - INSERT INTO `norostat_raw_datatable_diffs` (`measurement_type_id`, `location_id`, `week_id`, `release_date`, `parse_time`, `new_value`) - SELECT `measurement_type_id`, `location_id`, `week_id`, %s, %s, `value` - FROM `norostat_raw_datatable_parsed` - LEFT JOIN `norostat_raw_datatable_measurement_type_pool` USING (`measurement_type`) - LEFT JOIN `norostat_raw_datatable_location_pool` USING (`location`) - LEFT JOIN `norostat_raw_datatable_week_pool` USING (`week`) - WHERE (`measurement_type_id`, `location_id`, `week_id`, `value`) NOT IN ( - SELECT `norostat_raw_datatable_previous`.`measurement_type_id`, - `norostat_raw_datatable_previous`.`location_id`, - `norostat_raw_datatable_previous`.`week_id`, - `norostat_raw_datatable_previous`.`value` - FROM `norostat_raw_datatable_previous` - ); - 
''', (release_date, parse_time)) - cursor.execute(''' - INSERT INTO `norostat_raw_datatable_diffs` (`measurement_type_id`, `location_id`, `week_id`, `release_date`, `parse_time`, `new_value`) - SELECT `measurement_type_id`, `location_id`, `week_id`, %s, %s, NULL - FROM `norostat_raw_datatable_previous` - WHERE (`measurement_type_id`, `location_id`, `week_id`) NOT IN ( - SELECT `norostat_raw_datatable_measurement_type_pool`.`measurement_type_id`, - `norostat_raw_datatable_location_pool`.`location_id`, - `norostat_raw_datatable_week_pool`.`week_id` - FROM `norostat_raw_datatable_parsed` - LEFT JOIN `norostat_raw_datatable_measurement_type_pool` USING (`measurement_type`) - LEFT JOIN `norostat_raw_datatable_location_pool` USING (`location`) - LEFT JOIN `norostat_raw_datatable_week_pool` USING (`week`) - ); - ''', (release_date, parse_time)) - # If there is an already-recorded next version, its diff is invalidated by - # the insertion of the newly parsed version; delete the [next version - # "minus" previous version] diff and record the [next version "minus" newly - # parsed] diff: - if len(next_version_if_any) != 0: - cursor.execute(''' - DELETE FROM `norostat_raw_datatable_diffs` - WHERE `release_date`=%s AND `parse_time`=%s; - ''', next_version_if_any[0]) - cursor.execute(''' - INSERT INTO `norostat_raw_datatable_diffs` (`measurement_type_id`, `location_id`, `week_id`, `release_date`, `parse_time`, `new_value`) - SELECT `measurement_type_id`, `location_id`, `week_id`, %s, %s, `value` - FROM `norostat_raw_datatable_next` - WHERE (`measurement_type_id`, `location_id`, `week_id`, `value`) NOT IN ( - SELECT - `norostat_raw_datatable_measurement_type_pool`.`measurement_type_id`, - `norostat_raw_datatable_location_pool`.`location_id`, - `norostat_raw_datatable_week_pool`.`week_id`, - `norostat_raw_datatable_parsed`.`value` + cursor = cnx.cursor() + cnx.start_transaction(isolation_level="SERIALIZABLE") + # Create, populate `norostat_raw_datatable_parsed`: + cursor.execute( + """ + CREATE TEMPORARY TABLE `norostat_raw_datatable_parsed` ( + `measurement_type` NVARCHAR(255) NOT NULL, + `location` NVARCHAR(255) NOT NULL, + `week` NVARCHAR(255) NOT NULL, + `value` NVARCHAR(255) NOT NULL, -- forbid NULL; has special external meaning (see above) + PRIMARY KEY (`measurement_type`, `location`, `week`) + ) ENGINE=MEMORY; + """ + ) + cursor.executemany( + """ + INSERT INTO `norostat_raw_datatable_parsed` (`week`,`measurement_type`,`value`,`location`) + VALUES (%s, %s, %s, %s); + """, + [(week, measurement_type, value, location) for (week, measurement_type, value) in long_raw_df[["week", "measurement_type", "value"]].astype(str).itertuples(index=False, name=None)], + ) + # Create, populate `norostat_raw_datatable_previous`: + cursor.execute( + """ + CREATE TEMPORARY TABLE `norostat_raw_datatable_previous` ( + `measurement_type_id` INT NOT NULL, + `location_id` INT NOT NULL, + `week_id` INT NOT NULL, + `value` NVARCHAR(255) NOT NULL, -- forbid NULL; has special external meaning (see above) + -- would like but not allowed: FOREIGN KEY (`measurement_type_id`) REFERENCES `norostat_raw_datatable_measurement_type_pool` (`measurement_type_id`), + -- would like but not allowed: FOREIGN KEY (`location_id`) REFERENCES `norostat_raw_datatable_location_pool` (`location_id`), + -- would like but not allowed: FOREIGN KEY (`week_id`) REFERENCES `norostat_raw_datatable_week_pool` (`week_id`), + PRIMARY KEY (`measurement_type_id`, `location_id`, `week_id`) + ) ENGINE=MEMORY; + """ + ) + cursor.execute( + """ + INSERT INTO 
`norostat_raw_datatable_previous` (`measurement_type_id`, `location_id`, `week_id`, `value`) + SELECT `latest`.`measurement_type_id`, `latest`.`location_id`, `latest`.`week_id`, `latest`.`new_value` + FROM `norostat_raw_datatable_diffs` AS `latest` + -- Get the latest `new_value` by "group" (measurement_type, location, week) + -- using the fact that there are no later measurements belonging to the + -- same group (find NULL entries in `later`.{release_date,parse_time} + -- in the LEFT JOIN below); if the latest `new_value` is NULL, don't + -- include it in the result; it means that the corresponding cell/entry + -- has been removed from the data-table: + LEFT JOIN ( + SELECT * FROM `norostat_raw_datatable_diffs` + WHERE (`release_date`,`parse_time`) <= (%s,%s) + ) `later` + ON `latest`.`measurement_type_id` = `later`.`measurement_type_id` AND + `latest`.`location_id` = `later`.`location_id` AND + `latest`.`week_id` = `later`.`week_id` AND + (`latest`.`release_date`, `latest`.`parse_time`) < + (`later`.`release_date`, `later`.`parse_time`) + WHERE (`latest`.`release_date`, `latest`.`parse_time`) <= (%s, %s) AND + `later`.`parse_time` IS NULL AND + `latest`.`new_value` IS NOT NULL; + """, + (release_date, parse_time, release_date, parse_time), + ) + # Find next recorded `release_date`, `parse_time` if any; create, populate + # `norostat_raw_datatable_next` if there is such a version: + cursor.execute( + """ + SELECT `release_date`, `parse_time` + FROM `norostat_raw_datatable_version_list` + WHERE (`release_date`, `parse_time`) > (%s,%s) + ORDER BY `release_date`, `parse_time` + LIMIT 1 + """, + (release_date, parse_time), + ) + next_version_if_any = cursor.fetchall() + expect_result_in(len, next_version_if_any, (0, 1), "Bug: expected next-version query to return a number of results in {}; instead have len & val ") + if len(next_version_if_any) != 0: + cursor.execute( + """ + CREATE TEMPORARY TABLE `norostat_raw_datatable_next` ( + `measurement_type_id` INT NOT NULL, + `location_id` INT NOT NULL, + `week_id` INT NOT NULL, + `value` NVARCHAR(255) NOT NULL, -- forbid NULL; has special external meaning (see above) + -- would like but not allowed: FOREIGN KEY (`measurement_type_id`) REFERENCES `norostat_raw_datatable_measurement_type_pool` (`measurement_type_id`), + -- would like but not allowed: FOREIGN KEY (`location_id`) REFERENCES `norostat_raw_datatable_location_pool` (`location_id`), + -- would like but not allowed: FOREIGN KEY (`week_id`) REFERENCES `norostat_raw_datatable_week_pool` (`week_id`), + PRIMARY KEY (`measurement_type_id`, `location_id`, `week_id`) + ) ENGINE=MEMORY; + """ + ) + cursor.execute( + """ + INSERT INTO `norostat_raw_datatable_next` (`measurement_type_id`, `location_id`, `week_id`, `value`) + SELECT `latest`.`measurement_type_id`, `latest`.`location_id`, `latest`.`week_id`, `latest`.`new_value` + FROM `norostat_raw_datatable_diffs` AS `latest` + -- Get the latest `new_value` by "group" (measurement_type, location, week) + -- using the fact that there are no later measurements belonging to the + -- same group (find NULL entries in `later`.{release_date,parse_time} + -- in the LEFT JOIN below); if the latest `new_value` is NULL, don't + -- include it in the result; it means that the corresponding cell/entry + -- has been removed from the data-table: + LEFT JOIN ( + SELECT * FROM `norostat_raw_datatable_diffs` + WHERE (`release_date`,`parse_time`) <= (%s, %s) + ) `later` + ON `latest`.`measurement_type_id` = `later`.`measurement_type_id` AND + `latest`.`location_id` = 
`later`.`location_id` AND + `latest`.`week_id` = `later`.`week_id` AND + (`latest`.`release_date`, `latest`.`parse_time`) < + (`later`.`release_date`, `later`.`parse_time`) + WHERE (`latest`.`release_date`, `latest`.`parse_time`) <= (%s, %s) AND + `later`.`parse_time` IS NULL AND + `latest`.`new_value` IS NOT NULL -- NULL means value was removed + """, + next_version_if_any[0] + next_version_if_any[0], + ) + # Register new version in version list: + try: + cursor.execute( + """ + INSERT INTO `norostat_raw_datatable_version_list` (`release_date`, `parse_time`) + VALUES (%s, %s) + """, + (release_date, parse_time), + ) + except mysql.connector.errors.IntegrityError as e: + raise Exception( + [ + "Encountered an IntegrityError when updating the norostat_raw_datatable_version_list table; this probably indicates that a version with the same `release_date` and `parse_time` was already added to the database; parse_time has limited resolution, so this can happen from populating the database too quickly when there are duplicate release dates; original error: ", + e, + ] + ) + # Add any new measurement_type, location, or week strings to the associated + # string pools: + cursor.execute( + """ + INSERT INTO `norostat_raw_datatable_measurement_type_pool` (`measurement_type`) + SELECT DISTINCT `measurement_type` + FROM `norostat_raw_datatable_parsed` + WHERE `measurement_type` NOT IN ( + SELECT `norostat_raw_datatable_measurement_type_pool`.`measurement_type` + FROM `norostat_raw_datatable_measurement_type_pool` + ); + """ + ) + cursor.execute( + """ + INSERT INTO `norostat_raw_datatable_location_pool` (`location`) + SELECT DISTINCT `location` + FROM `norostat_raw_datatable_parsed` + WHERE `location` NOT IN ( + SELECT `norostat_raw_datatable_location_pool`.`location` + FROM `norostat_raw_datatable_location_pool` + ); + """ + ) + cursor.execute( + """ + INSERT INTO `norostat_raw_datatable_week_pool` (`week`) + SELECT DISTINCT `week` + FROM `norostat_raw_datatable_parsed` + WHERE `week` NOT IN ( + SELECT `norostat_raw_datatable_week_pool`.`week` + FROM `norostat_raw_datatable_week_pool` + ); + """ + ) + # Record diff: [newly parsed version "minus" previous version] (first, + # record additions/updates, then record deletions): + cursor.execute( + """ + INSERT INTO `norostat_raw_datatable_diffs` (`measurement_type_id`, `location_id`, `week_id`, `release_date`, `parse_time`, `new_value`) + SELECT `measurement_type_id`, `location_id`, `week_id`, %s, %s, `value` FROM `norostat_raw_datatable_parsed` LEFT JOIN `norostat_raw_datatable_measurement_type_pool` USING (`measurement_type`) LEFT JOIN `norostat_raw_datatable_location_pool` USING (`location`) LEFT JOIN `norostat_raw_datatable_week_pool` USING (`week`) + WHERE (`measurement_type_id`, `location_id`, `week_id`, `value`) NOT IN ( + SELECT `norostat_raw_datatable_previous`.`measurement_type_id`, + `norostat_raw_datatable_previous`.`location_id`, + `norostat_raw_datatable_previous`.`week_id`, + `norostat_raw_datatable_previous`.`value` + FROM `norostat_raw_datatable_previous` + ); + """, + (release_date, parse_time), + ) + cursor.execute( + """ + INSERT INTO `norostat_raw_datatable_diffs` (`measurement_type_id`, `location_id`, `week_id`, `release_date`, `parse_time`, `new_value`) + SELECT `measurement_type_id`, `location_id`, `week_id`, %s, %s, NULL + FROM `norostat_raw_datatable_previous` + WHERE (`measurement_type_id`, `location_id`, `week_id`) NOT IN ( + SELECT `norostat_raw_datatable_measurement_type_pool`.`measurement_type_id`, + 
`norostat_raw_datatable_location_pool`.`location_id`, + `norostat_raw_datatable_week_pool`.`week_id` + FROM `norostat_raw_datatable_parsed` + LEFT JOIN `norostat_raw_datatable_measurement_type_pool` USING (`measurement_type`) + LEFT JOIN `norostat_raw_datatable_location_pool` USING (`location`) + LEFT JOIN `norostat_raw_datatable_week_pool` USING (`week`) + ); + """, + (release_date, parse_time), + ) + # If there is an already-recorded next version, its diff is invalidated by + # the insertion of the newly parsed version; delete the [next version + # "minus" previous version] diff and record the [next version "minus" newly + # parsed] diff: + if len(next_version_if_any) != 0: + cursor.execute( + """ + DELETE FROM `norostat_raw_datatable_diffs` + WHERE `release_date`=%s AND `parse_time`=%s; + """, + next_version_if_any[0], + ) + cursor.execute( + """ + INSERT INTO `norostat_raw_datatable_diffs` (`measurement_type_id`, `location_id`, `week_id`, `release_date`, `parse_time`, `new_value`) + SELECT `measurement_type_id`, `location_id`, `week_id`, %s, %s, `value` + FROM `norostat_raw_datatable_next` + WHERE (`measurement_type_id`, `location_id`, `week_id`, `value`) NOT IN ( + SELECT + `norostat_raw_datatable_measurement_type_pool`.`measurement_type_id`, + `norostat_raw_datatable_location_pool`.`location_id`, + `norostat_raw_datatable_week_pool`.`week_id`, + `norostat_raw_datatable_parsed`.`value` + FROM `norostat_raw_datatable_parsed` + LEFT JOIN `norostat_raw_datatable_measurement_type_pool` USING (`measurement_type`) + LEFT JOIN `norostat_raw_datatable_location_pool` USING (`location`) + LEFT JOIN `norostat_raw_datatable_week_pool` USING (`week`) + ); + """, + next_version_if_any[0], + ) + cursor.execute( + """ + INSERT INTO `norostat_raw_datatable_diffs` (`measurement_type_id`, `location_id`, `week_id`, `release_date`, `parse_time`, `new_value`) + SELECT `measurement_type_id`, `location_id`, `week_id`, %s, %s, NULL + FROM `norostat_raw_datatable_parsed` + LEFT JOIN `norostat_raw_datatable_measurement_type_pool` USING (`measurement_type`) + LEFT JOIN `norostat_raw_datatable_location_pool` USING (`location`) + LEFT JOIN `norostat_raw_datatable_week_pool` USING (`week`) + WHERE (`measurement_type_id`, `location_id`, `week_id`) NOT IN ( + SELECT `norostat_raw_datatable_next`.`measurement_type_id`, + `norostat_raw_datatable_next`.`location_id`, + `norostat_raw_datatable_next`.`week_id` + FROM `norostat_raw_datatable_next` + ); + """, + next_version_if_any[0], + ) + cursor.execute( + """ + CREATE TABLE IF NOT EXISTS `norostat_point_version_list` ( + `release_date` DATE NOT NULL, + `parse_time` DATETIME(6) NOT NULL, + FOREIGN KEY (`release_date`,`parse_time`) REFERENCES `norostat_raw_datatable_version_list` (`release_date`,`parse_time`), + PRIMARY KEY (`release_date`, `parse_time`) ); - ''', next_version_if_any[0]) - cursor.execute(''' - INSERT INTO `norostat_raw_datatable_diffs` (`measurement_type_id`, `location_id`, `week_id`, `release_date`, `parse_time`, `new_value`) - SELECT `measurement_type_id`, `location_id`, `week_id`, %s, %s, NULL - FROM `norostat_raw_datatable_parsed` - LEFT JOIN `norostat_raw_datatable_measurement_type_pool` USING (`measurement_type`) - LEFT JOIN `norostat_raw_datatable_location_pool` USING (`location`) - LEFT JOIN `norostat_raw_datatable_week_pool` USING (`week`) - WHERE (`measurement_type_id`, `location_id`, `week_id`) NOT IN ( - SELECT `norostat_raw_datatable_next`.`measurement_type_id`, - `norostat_raw_datatable_next`.`location_id`, - 
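The two INSERT statements above record the newly parsed version as a diff against the previous one: first cells that are new or changed (with their values), then cells that disappeared (as NULL). The same bookkeeping in plain Python, assuming previous and parsed are dicts keyed by the (measurement_type, location, week) triple; this is an illustrative sketch, not code from the patch:

def cell_diff(previous, parsed):
    # Returns {key: new_value}, with None marking cells removed in `parsed`.
    diff = {}
    for key, value in parsed.items():
        if previous.get(key) != value:  # added or changed cell
            diff[key] = value
    for key in previous:
        if key not in parsed:           # cell no longer present in the data-table
            diff[key] = None
    return diff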
`norostat_raw_datatable_next`.`week_id` - FROM `norostat_raw_datatable_next` + """ + ) + cursor.execute( + """ + CREATE TABLE IF NOT EXISTS `norostat_point_diffs` ( + `release_date` DATE NOT NULL, + `parse_time` datetime(6) NOT NULL, + `location_id` INT NOT NULL, + `epiweek` INT NOT NULL, + `new_value` NVARCHAR(255), -- allow NULL, with meaning "removed" + FOREIGN KEY (`release_date`,`parse_time`) REFERENCES `norostat_point_version_list` (`release_date`,`parse_time`), + FOREIGN KEY (`location_id`) REFERENCES norostat_raw_datatable_location_pool (`location_id`), + UNIQUE KEY (`location_id`, `epiweek`, `release_date`, `parse_time`, `new_value`), + PRIMARY KEY (`release_date`, `parse_time`, `location_id`, `epiweek`) ); - ''', next_version_if_any[0]) - cursor.execute(''' - CREATE TABLE IF NOT EXISTS `norostat_point_version_list` ( - `release_date` DATE NOT NULL, - `parse_time` DATETIME(6) NOT NULL, - FOREIGN KEY (`release_date`,`parse_time`) REFERENCES `norostat_raw_datatable_version_list` (`release_date`,`parse_time`), - PRIMARY KEY (`release_date`, `parse_time`) - ); - ''') - cursor.execute(''' - CREATE TABLE IF NOT EXISTS `norostat_point_diffs` ( - `release_date` DATE NOT NULL, - `parse_time` datetime(6) NOT NULL, - `location_id` INT NOT NULL, - `epiweek` INT NOT NULL, - `new_value` NVARCHAR(255), -- allow NULL, with meaning "removed" - FOREIGN KEY (`release_date`,`parse_time`) REFERENCES `norostat_point_version_list` (`release_date`,`parse_time`), - FOREIGN KEY (`location_id`) REFERENCES norostat_raw_datatable_location_pool (`location_id`), - UNIQUE KEY (`location_id`, `epiweek`, `release_date`, `parse_time`, `new_value`), - PRIMARY KEY (`release_date`, `parse_time`, `location_id`, `epiweek`) - ); - ''') - cnx.commit() # (might do nothing; each statement above takes effect and/or commits immediately) - finally: - cnx.close() + """ + ) + cnx.commit() # (might do nothing; each statement above takes effect and/or commits immediately) + finally: + cnx.close() + def update_point(): - (u, p) = secrets.db.epi - cnx = mysql.connector.connect(user=u, password=p, database='epidata') - try: - cursor = cnx.cursor() - cnx.start_transaction(isolation_level='serializable') - cursor.execute(''' - SELECT `release_date`, `parse_time`, `measurement_type`, `location_id`, `week`, `new_value` - FROM `norostat_raw_datatable_diffs` - LEFT JOIN `norostat_raw_datatable_measurement_type_pool` USING (`measurement_type_id`) - LEFT JOIN `norostat_raw_datatable_week_pool` USING (`week_id`) - WHERE (`release_date`, `parse_time`) NOT IN ( - SELECT `norostat_point_version_list`.`release_date`, - `norostat_point_version_list`.`parse_time` - FROM `norostat_point_version_list` - ); - ''') - raw_datatable_diff_selection = cursor.fetchall() - prog = re.compile(r"[0-9]+-[0-9]+$") - point_diff_insertion = [ - (release_date, parse_time, location_id, - season_db_to_epiweek(measurement_type, week), - int(new_value_str) if new_value_str is not None else None + (u, p) = secrets.db.epi + cnx = mysql.connector.connect(user=u, password=p, database="epidata") + try: + cursor = cnx.cursor() + cnx.start_transaction(isolation_level="serializable") + cursor.execute( + """ + SELECT `release_date`, `parse_time`, `measurement_type`, `location_id`, `week`, `new_value` + FROM `norostat_raw_datatable_diffs` + LEFT JOIN `norostat_raw_datatable_measurement_type_pool` USING (`measurement_type_id`) + LEFT JOIN `norostat_raw_datatable_week_pool` USING (`week_id`) + WHERE (`release_date`, `parse_time`) NOT IN ( + SELECT 
`norostat_point_version_list`.`release_date`, + `norostat_point_version_list`.`parse_time` + FROM `norostat_point_version_list` + ); + """ + ) + raw_datatable_diff_selection = cursor.fetchall() + prog = re.compile(r"[0-9]+-[0-9]+$") + point_diff_insertion = [ + (release_date, parse_time, location_id, season_db_to_epiweek(measurement_type, week), int(new_value_str) if new_value_str is not None else None) + for (release_date, parse_time, measurement_type, location_id, week, new_value_str) in raw_datatable_diff_selection + if prog.match(measurement_type) is not None and new_value_str != "" + ] + cursor.execute( + """ + INSERT INTO `norostat_point_version_list` (`release_date`, `parse_time`) + SELECT DISTINCT `release_date`, `parse_time` + FROM `norostat_raw_datatable_version_list` + WHERE (`release_date`, `parse_time`) NOT IN ( + SELECT `norostat_point_version_list`.`release_date`, + `norostat_point_version_list`.`parse_time` + FROM `norostat_point_version_list` + ); + """ ) - for (release_date, parse_time, measurement_type, location_id, week, new_value_str) - in raw_datatable_diff_selection - if prog.match(measurement_type) is not None and - new_value_str != "" - ] - cursor.execute(''' - INSERT INTO `norostat_point_version_list` (`release_date`, `parse_time`) - SELECT DISTINCT `release_date`, `parse_time` - FROM `norostat_raw_datatable_version_list` - WHERE (`release_date`, `parse_time`) NOT IN ( - SELECT `norostat_point_version_list`.`release_date`, - `norostat_point_version_list`.`parse_time` - FROM `norostat_point_version_list` - ); - ''') - cursor.executemany(''' - INSERT INTO `norostat_point_diffs` (`release_date`, `parse_time`, `location_id`, `epiweek`, `new_value`) - VALUES (%s, %s, %s, %s, %s) - ''', point_diff_insertion) - cnx.commit() - finally: - cnx.close() + cursor.executemany( + """ + INSERT INTO `norostat_point_diffs` (`release_date`, `parse_time`, `location_id`, `epiweek`, `new_value`) + VALUES (%s, %s, %s, %s, %s) + """, + point_diff_insertion, + ) + cnx.commit() + finally: + cnx.close() + # note there are more efficient ways to calculate diffs without forming ..._next table # todo give indices names diff --git a/src/acquisition/norostat/norostat_update.py b/src/acquisition/norostat/norostat_update.py index 4b0021dd5..3d0263ff1 100644 --- a/src/acquisition/norostat/norostat_update.py +++ b/src/acquisition/norostat/norostat_update.py @@ -14,15 +14,16 @@ def main(): - # Download the data: - # content = norostat_raw.load_sample_content() - content = norostat_raw.fetch_content() - # norostat_raw.save_sample_content(content) - wide_raw = norostat_raw.parse_content_to_wide_raw(content) - long_raw = norostat_raw.melt_wide_raw_to_long_raw(wide_raw) - norostat_sql.ensure_tables_exist() - norostat_sql.record_long_raw(long_raw) - norostat_sql.update_point() + # Download the data: + # content = norostat_raw.load_sample_content() + content = norostat_raw.fetch_content() + # norostat_raw.save_sample_content(content) + wide_raw = norostat_raw.parse_content_to_wide_raw(content) + long_raw = norostat_raw.melt_wide_raw_to_long_raw(wide_raw) + norostat_sql.ensure_tables_exist() + norostat_sql.record_long_raw(long_raw) + norostat_sql.update_point() -if __name__ == '__main__': - main() + +if __name__ == "__main__": + main() diff --git a/src/acquisition/norostat/norostat_utils.py b/src/acquisition/norostat/norostat_utils.py index a99a4dc96..1285e7867 100644 --- a/src/acquisition/norostat/norostat_utils.py +++ b/src/acquisition/norostat/norostat_utils.py @@ -7,38 +7,48 @@ # helper funs for 
checking expectations, throwing exceptions on violations: def expect_value_eq(encountered, expected, mismatch_format): - if encountered != expected: - raise Exception([mismatch_format.format(expected), encountered]) + if encountered != expected: + raise Exception([mismatch_format.format(expected), encountered]) + + def expect_result_eq(f, value, expected, mismatch_format): - result = f(value) - if result != expected: - raise Exception([mismatch_format.format(expected), result, value]) + result = f(value) + if result != expected: + raise Exception([mismatch_format.format(expected), result, value]) + + def expect_value_in(encountered, expected_candidates, mismatch_format): - if encountered not in expected_candidates: - raise Exception([mismatch_format.format(expected_candidates), encountered]) + if encountered not in expected_candidates: + raise Exception([mismatch_format.format(expected_candidates), encountered]) + + def expect_result_in(f, value, expected_candidates, mismatch_format): - result = f(value) - if result not in expected_candidates: - raise Exception([mismatch_format.format(expected_candidates), result, value]) + result = f(value) + if result not in expected_candidates: + raise Exception([mismatch_format.format(expected_candidates), result, value]) + + def expect_str_contains(encountered, regex, mismatch_format): - if re.search(regex, encountered) is None: - raise Exception([mismatch_format.format(regex), encountered]) + if re.search(regex, encountered) is None: + raise Exception([mismatch_format.format(regex), encountered]) + # helper fun used with expect_* funs to check value of .dtype.kind: def dtype_kind(numpy_like): - return numpy_like.dtype.kind + return numpy_like.dtype.kind + # helper fun used to convert season string ("YYYY-YY" or "YYYY-YYYY") and # "Week" string (strptime format "%d-%b") to the corresponding epiweek; assumes # by default that dates >= 1-Aug correspond to weeks of the first year: def season_db_to_epiweek(season_str, db_date_str, first_db_date_of_season_str="1-Aug"): - year_strs = season_str.split("-") - first_year = int(year_strs[0]) - second_year = first_year + 1 - # FIXME check/enforce locale - first_date_of_season = datetime.datetime.strptime(first_db_date_of_season_str+"-"+str(first_year), "%d-%b-%Y").date() - date_using_first_year = datetime.datetime.strptime(db_date_str+"-"+str(first_year), "%d-%b-%Y").date() - date_using_second_year = datetime.datetime.strptime(db_date_str+"-"+str(second_year), "%d-%b-%Y").date() - date = date_using_first_year if date_using_first_year >= first_date_of_season else date_using_second_year - epiweek = EpiDate(date.year, date.month, date.day).get_ew() - return epiweek + year_strs = season_str.split("-") + first_year = int(year_strs[0]) + second_year = first_year + 1 + # FIXME check/enforce locale + first_date_of_season = datetime.datetime.strptime(first_db_date_of_season_str + "-" + str(first_year), "%d-%b-%Y").date() + date_using_first_year = datetime.datetime.strptime(db_date_str + "-" + str(first_year), "%d-%b-%Y").date() + date_using_second_year = datetime.datetime.strptime(db_date_str + "-" + str(second_year), "%d-%b-%Y").date() + date = date_using_first_year if date_using_first_year >= first_date_of_season else date_using_second_year + epiweek = EpiDate(date.year, date.month, date.day).get_ew() + return epiweek diff --git a/src/acquisition/paho/paho_db_update.py b/src/acquisition/paho/paho_db_update.py index d07885f79..d463a915c 100644 --- a/src/acquisition/paho/paho_db_update.py +++ 
b/src/acquisition/paho/paho_db_update.py @@ -50,9 +50,8 @@ import csv import datetime import glob -import subprocess -import random from io import StringIO +import tempfile # third party import mysql.connector @@ -64,12 +63,14 @@ from delphi.utils.epiweek import delta_epiweeks, check_epiweek from delphi.utils.epidate import EpiDate + def ensure_tables_exist(): - (u,p) = secrets.db.epi - cnx = mysql.connector.connect(user=u,password=p,database='epidata') + (u, p) = secrets.db.epi + cnx = mysql.connector.connect(user=u, password=p, database="epidata") try: cursor = cnx.cursor() - cursor.execute(''' + cursor.execute( + """ CREATE TABLE IF NOT EXISTS `paho_dengue` ( `id` INT(11) NOT NULL PRIMARY KEY AUTO_INCREMENT, `release_date` DATE NOT NULL, @@ -85,35 +86,40 @@ def ensure_tables_exist(): `num_deaths` INT(11) NOT NULL, UNIQUE KEY (`issue`, `epiweek`, `region`) ); - '''); + """ + ) cnx.commit() finally: cnx.close() + def safe_float(f): try: - return float(f.replace(',','')) + return float(f.replace(",", "")) except: return 0 + def safe_int(i): try: - return int(i.replace(',','')) + return int(i.replace(",", "")) except: return 0 -def get_rows(cnx, table='paho_dengue'): - # Count and return the number of rows in the `fluview` table. - select = cnx.cursor() - select.execute('SELECT count(1) num FROM %s' % table) - for (num,) in select: - pass - select.close() - return num + +def get_rows(cnx, table="paho_dengue"): + # Count and return the number of rows in the `fluview` table. + select = cnx.cursor() + select.execute("SELECT count(1) num FROM %s" % table) + for (num,) in select: + pass + select.close() + return num + def get_paho_row(row): - if row[0] == "\ufeffIncidence Rate (c)" and row != "\ufeffIncidence Rate (c),(SD/D) x100 (e),CFR (f),ID,Country or Subregion,Deaths,EW,Confirmed,Epidemiological Week (a),Pop (no usar),Serotype,Severe Dengue (d),Total of Dengue Cases (b),Year,Population x 1000".split(","): - raise Exception('PAHO header row has changed') + if (row[0] == "\ufeffIncidence Rate (c)" and row != "\ufeffIncidence Rate (c),(SD/D) x100 (e),CFR (f),ID,Country or Subregion,Deaths,EW,Confirmed,Epidemiological Week (a),Pop (no usar),Serotype,Severe Dengue (d),Total of Dengue Cases (b),Year,Population x 1000".split(",")): + raise Exception("PAHO header row has changed") if len(row) == 1 or row[0] == "Incidence Rate (c)": # this is a header row return None @@ -128,23 +134,24 @@ def get_paho_row(row): except: return None try: - check_epiweek(safe_int(row[13])*100 + safe_int(row[8]), safe_int(row[13])*100 + safe_int(row[6])) + check_epiweek(safe_int(row[13]) * 100 + safe_int(row[8]), safe_int(row[13]) * 100 + safe_int(row[6])) except: return None return { - 'issue': safe_int(row[13])*100 + safe_int(row[6]), - 'epiweek': safe_int(row[13])*100 + safe_int(row[8]), - 'region': country, - 'total_pop': safe_int(row[14]), - 'serotype': row[10], - 'num_dengue': safe_int(row[12]), - 'incidence_rate': safe_float(row[0]), - 'num_severe': safe_int(row[11]), - 'num_deaths': safe_int(row[5]), - 'severe_ratio': safe_float(row[1]), - 'cfr': safe_float(row[2]) + "issue": safe_int(row[13]) * 100 + safe_int(row[6]), + "epiweek": safe_int(row[13]) * 100 + safe_int(row[8]), + "region": country, + "total_pop": safe_int(row[14]), + "serotype": row[10], + "num_dengue": safe_int(row[12]), + "incidence_rate": safe_float(row[0]), + "num_severe": safe_int(row[11]), + "num_deaths": safe_int(row[5]), + "severe_ratio": safe_float(row[1]), + "cfr": safe_float(row[2]), } + def update_from_file(issue, date, filename, 
test_mode=False): # Read PAHO data from CSV and insert into (or update) the database. @@ -156,23 +163,23 @@ def update_from_file(issue, date, filename, test_mode=False): # database connection u, p = secrets.db.epi - cnx = mysql.connector.connect(user=u, password=p, database='epidata') - rows1 = get_rows(cnx, 'paho_dengue') - print('rows before: %d' % (rows1)) + cnx = mysql.connector.connect(user=u, password=p, database="epidata") + rows1 = get_rows(cnx, "paho_dengue") + print("rows before: %d" % (rows1)) insert = cnx.cursor() # load the data, ignoring empty rows - print('loading data from %s as issued on %d' % (filename, issue)) - with open(filename,'r',encoding='utf-8') as f: + print("loading data from %s as issued on %d" % (filename, issue)) + with open(filename, "r", encoding="utf-8") as f: c = f.read() rows = [] - for l in csv.reader(StringIO(c), delimiter=','): + for l in csv.reader(StringIO(c), delimiter=","): rows.append(get_paho_row(l)) - print(' loaded %d rows' % len(rows)) + print(" loaded %d rows" % len(rows)) entries = [obj for obj in rows if obj] - print(' found %d entries' % len(entries)) + print(" found %d entries" % len(entries)) - sql = ''' + sql = """ INSERT INTO `paho_dengue` (`release_date`, `issue`, `epiweek`, `region`, `lag`, `total_pop`, `serotype`, `num_dengue`, `incidence_rate`, @@ -187,55 +194,43 @@ def update_from_file(issue, date, filename, test_mode=False): `incidence_rate` = %s, `num_severe` = %s, `num_deaths` = %s - ''' + """ for row in entries: - if row['issue'] > issue: # Issued in a week that hasn't happened yet + if row["issue"] > issue: # Issued in a week that hasn't happened yet continue - lag = delta_epiweeks(row['epiweek'], issue) - data_args = [row['total_pop'], row['serotype'], row['num_dengue'], - row['incidence_rate'], row['num_severe'], row['num_deaths']] + lag = delta_epiweeks(row["epiweek"], issue) + data_args = [row["total_pop"], row["serotype"], row["num_dengue"], row["incidence_rate"], row["num_severe"], row["num_deaths"]] - insert_args = [date,issue,row['epiweek'],row['region'],lag] + data_args + insert_args = [date, issue, row["epiweek"], row["region"], lag] + data_args update_args = [date] + data_args insert.execute(sql % tuple(insert_args + update_args)) # cleanup insert.close() if test_mode: - print('test mode, not committing') + print("test mode, not committing") rows2 = rows1 else: cnx.commit() rows2 = get_rows(cnx) - print('rows after: %d (added %d)' % (rows2,rows2-rows1)) + print("rows after: %d (added %d)" % (rows2, rows2 - rows1)) cnx.close() + def main(): # args and usage parser = argparse.ArgumentParser() - parser.add_argument( - '--test', - action='store_true', - help='do dry run only, do not update the database' - ) - parser.add_argument( - '--file', - type=str, - help='load an existing zip file (otherwise fetch current data)' - ) - parser.add_argument( - '--issue', - type=int, - help='issue of the file (e.g. 201740); used iff --file is given' - ) + parser.add_argument("--test", action="store_true", help="do dry run only, do not update the database") + parser.add_argument("--file", type=str, help="load an existing zip file (otherwise fetch current data)") + parser.add_argument("--issue", type=int, help="issue of the file (e.g. 
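Both `issue` and `epiweek` are packed as YYYYWW integers (year * 100 + week), and `lag` is the number of epiweeks between the week the data describes and the week it was issued. A toy example with invented values; delta_epiweeks is the delphi.utils.epiweek helper imported above:

# Report published in epiweek 40 of 2017, describing epiweek 38 of 2017:
issue = 2017 * 100 + 40    # 201740
epiweek = 2017 * 100 + 38  # 201738
# lag = delta_epiweeks(epiweek, issue) -> 2 (issued two epiweeks after the data week)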
201740); used iff --file is given") args = parser.parse_args() if (args.file is None) != (args.issue is None): - raise Exception('--file and --issue must both be present or absent') + raise Exception("--file and --issue must both be present or absent") - date = datetime.datetime.now().strftime('%Y-%m-%d') - print('assuming release date is today, %s' % date) + date = datetime.datetime.now().strftime("%Y-%m-%d") + print("assuming release date is today, %s" % date) if args.file: update_from_file(args.issue, date, args.file, test_mode=args.test) @@ -247,34 +242,31 @@ def main(): max_tries = 5 while flag < max_tries: flag = flag + 1 - tmp_dir = ''.join(random.choice('0123456789abcdefghijklmnopqrstuvwxyz') for i in range(8)) - tmp_dir = 'downloads_' + tmp_dir - subprocess.call(["mkdir",tmp_dir]) - # Use temporary directory to avoid data from different time - # downloaded to same folder - get_paho_data(dir=tmp_dir) - issue = EpiDate.today().get_ew() - # Check to make sure we downloaded a file for every week - issueset = set() - files = glob.glob('%s/*.csv' % tmp_dir) - for filename in files: - with open(filename,'r') as f: - _ = f.readline() - data = f.readline().split(',') - issueset.add(data[6]) - db_error = False - if len(issueset) >= 53: # Shouldn't be more than 53 + with tempfile.TemporaryDirectory() as tmp_dir: + # Use temporary directory to avoid data from different time + # downloaded to same folder + get_paho_data(dir=tmp_dir) + issue = EpiDate.today().get_ew() + # Check to make sure we downloaded a file for every week + issueset = set() + files = glob.glob(f"{tmp_dir}/*.csv") for filename in files: - try: - update_from_file(issue, date, filename, test_mode=args.test) - subprocess.call(["rm",filename]) - except: - db_error = True - subprocess.call(["rm","-r",tmp_dir]) - if not db_error: - break # Exit loop with success + with open(filename, "r") as f: + _ = f.readline() + data = f.readline().split(",") + issueset.add(data[6]) + db_error = False + if len(issueset) >= 53: # Shouldn't be more than 53 + for filename in files: + try: + update_from_file(issue, date, filename, test_mode=args.test) + except: + db_error = True + if not db_error: + break # Exit loop with success if flag >= max_tries: - print('WARNING: Database `paho_dengue` did not update successfully') + print("WARNING: Database `paho_dengue` did not update successfully") + -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/src/acquisition/paho/paho_download.py b/src/acquisition/paho/paho_download.py index 60dd13ae8..149fdf374 100644 --- a/src/acquisition/paho/paho_download.py +++ b/src/acquisition/paho/paho_download.py @@ -1,4 +1,3 @@ - # IMPORTANT: This code is extremely unstable. # Slight changes to the PAHO website may render this script partially or entirely useless. @@ -15,42 +14,45 @@ headerheight = 0 + def wait_for(browser, css_selector, delay=10): try: WebDriverWait(browser, delay).until(EC.presence_of_element_located((By.CSS_SELECTOR, css_selector))) WebDriverWait(browser, delay).until(EC.element_to_be_clickable((By.CSS_SELECTOR, css_selector))) - print('Success Loading %s' % (css_selector)) + print("Success Loading %s" % (css_selector)) except TimeoutException: print("Loading %s took too much time!" 
% (css_selector)) - + + def find_and_click(browser, element): element.location_once_scrolled_into_view browser.switch_to.default_content() - browser.execute_script("window.scrollBy(0,-%d)"%headerheight) + browser.execute_script("window.scrollBy(0,-%d)" % headerheight) browser.switch_to.frame(browser.find_element_by_tag_name("iframe")) browser.switch_to.frame(browser.find_element_by_tag_name("iframe")) element.click() -def get_paho_data(offset=0, dir='downloads'): + +def get_paho_data(offset=0, dir="downloads"): opts = Options() opts.set_headless() assert opts.headless # Operating in headless mode fp = FirefoxProfile() - fp.set_preference("browser.download.folderList",2) - fp.set_preference("browser.download.manager.showWhenStarting",False) - fp.set_preference("browser.download.dir",os.path.abspath(dir)) - fp.set_preference("browser.helperApps.neverAsk.saveToDisk","text/csv") + fp.set_preference("browser.download.folderList", 2) + fp.set_preference("browser.download.manager.showWhenStarting", False) + fp.set_preference("browser.download.dir", os.path.abspath(dir)) + fp.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/csv") - browser = Firefox(options=opts,firefox_profile=fp) - browser.get('http://www.paho.org/data/index.php/en/mnu-topics/indicadores-dengue-en/dengue-nacional-en/252-dengue-pais-ano-en.html?showall=&start=1') + browser = Firefox(options=opts, firefox_profile=fp) + browser.get("https://www.paho.org/data/index.php/en/mnu-topics/indicadores-dengue-en/dengue-nacional-en/252-dengue-pais-ano-en.html?showall=&start=1") tab1 = browser.window_handles[0] - browser.execute_script('''window.open("","_blank");''') + browser.execute_script("""window.open("","_blank");""") tab2 = browser.window_handles[1] browser.switch_to.window(tab1) - + curr_offset = offset - + wait_for(browser, "div.rt-top-inner", delay=30) header = browser.find_element_by_css_selector("div.rt-top-inner") global headerheight @@ -59,7 +61,7 @@ def get_paho_data(offset=0, dir='downloads'): # The actual content of the data of this webpage is within 2 iframes, so we need to navigate into them first browser.switch_to.frame(browser.find_element_by_tag_name("iframe")) browser.switch_to.frame(browser.find_element_by_tag_name("iframe")) - + # Locate the button that allows to download the table downloadoption = browser.find_elements_by_css_selector("div.tabToolbarButton.tab-widget.download")[0] find_and_click(browser, downloadoption) @@ -78,10 +80,13 @@ def get_paho_data(offset=0, dir='downloads'): # Extract session ID href = downloadbutton.get_attribute("href") startidx = href.index("sessions/") + len("sessions/") - endidx = href.index("/",startidx) + endidx = href.index("/", startidx) sessionid = href[startidx:endidx] - dataurl = "http://phip.paho.org/vizql/w/Casosdedengue_tben/v/ByLastAvailableEpiWeek/viewData/sessions/%s/views/18076444178507886853_9530488980060483892?maxrows=200&viz=%%7B%%22worksheet%%22:%%22W%%20By%%20Last%%20Available%%20EpiWeek%%22,%%22dashboard%%22:%%22By%%20Last%%20Available%%20Epi%%20Week%%22%%7D"%sessionid + dataurl = ( + "https://phip.paho.org/vizql/w/Casosdedengue_tben/v/ByLastAvailableEpiWeek/viewData/sessions/%s/views/18076444178507886853_9530488980060483892?maxrows=200&viz=%%7B%%22worksheet%%22:%%22W%%20By%%20Last%%20Available%%20EpiWeek%%22,%%22dashboard%%22:%%22By%%20Last%%20Available%%20Epi%%20Week%%22%%7D" + % sessionid + ) wait_for(browser, "div[data-tb-test-id='CancelBtn-Button']") @@ -107,27 +112,27 @@ def get_paho_data(offset=0, dir='downloads'): for i in range(offset): gp = 
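For context on the Firefox preferences used in this file: browser.download.folderList=2 tells Firefox to save into the custom browser.download.dir rather than the default download folder, and browser.helperApps.neverAsk.saveToDisk suppresses the save-file dialog for text/csv responses so each export lands on disk unattended. A condensed sketch of that setup (Selenium 3 API, mirroring the calls in this file; the helper name is illustrative):

import os
from selenium.webdriver import Firefox, FirefoxProfile
from selenium.webdriver.firefox.options import Options

def make_csv_download_browser(download_dir):
    opts = Options()
    opts.set_headless()  # run without a visible window
    fp = FirefoxProfile()
    fp.set_preference("browser.download.folderList", 2)  # use the custom dir below
    fp.set_preference("browser.download.manager.showWhenStarting", False)
    fp.set_preference("browser.download.dir", os.path.abspath(download_dir))
    fp.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/csv")
    return Firefox(options=opts, firefox_profile=fp)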
browser.find_element_by_css_selector("div.wcGlassPane") - #print gp.is_enabled() - #print gp.is_selected() - #print gp.is_displayed() + # print gp.is_enabled() + # print gp.is_selected() + # print gp.is_displayed() try: WebDriverWait(browser, 10).until(EC.staleness_of(gp)) - print("Loaded next week % d" % (53-offset)) + print("Loaded next week % d" % (53 - offset)) except TimeoutException: - print("Loading next week %d took too much time!" % (53-offset)) + print("Loading next week %d took too much time!" % (53 - offset)) gp = browser.find_element_by_css_selector("div.wcGlassPane") - #print gp.is_enabled() - #print gp.is_selected() - #print gp.is_displayed() + # print gp.is_enabled() + # print gp.is_selected() + # print gp.is_displayed() x = browser.find_elements_by_css_selector("div.dijitReset.dijitSliderButtonContainer.dijitSliderButtonContainerH.tableauArrowDec")[0] find_and_click(browser, x) # Cycle through all weeks, downloading each week as a separate .csv # Theoretically, need to cycle 53 times, but in practice only 54 works, unsure why - for i in range(54-offset): + for i in range(54 - offset): # If something goes wrong for whatever reason, try from the beginning try: - print('Loading week %d' % (53-i)) + print("Loading week %d" % (53 - i)) # (Re-)load URL browser.switch_to.window(tab2) browser.get(dataurl) @@ -137,7 +142,7 @@ def get_paho_data(offset=0, dir='downloads'): full_data_tab = browser.find_elements_by_css_selector("li[id='tab-view-full-data']")[0] full_data_tab.click() - wait_for(browser, "a.csvLink") # Sometimes this fails but the button is successfully clicked anyway, not sure why + wait_for(browser, "a.csvLink") # Sometimes this fails but the button is successfully clicked anyway, not sure why # Actually download the data as a .csv (Will be downloaded to Firefox's default download destination) data_links = browser.find_elements_by_css_selector("a.csvLink") data_link = None @@ -155,10 +160,11 @@ def get_paho_data(offset=0, dir='downloads'): find_and_click(browser, x) curr_offset += 1 except Exception as e: - print('Got exception %s\nTrying again from week %d' % (e,53-offset)) + print("Got exception %s\nTrying again from week %d" % (e, 53 - offset)) browser.quit() get_paho_data(offset=curr_offset) browser.quit() -if __name__ == '__main__': - get_paho_data(dir='downloads/') + +if __name__ == "__main__": + get_paho_data(dir="downloads/") diff --git a/src/acquisition/quidel/quidel.py b/src/acquisition/quidel/quidel.py index a7c9a2918..f33cb3aef 100644 --- a/src/acquisition/quidel/quidel.py +++ b/src/acquisition/quidel/quidel.py @@ -1,4 +1,4 @@ -''' +""" =============== === Purpose === =============== @@ -15,7 +15,7 @@ * add end date, end week check 2017-12-02: * original version -''' +""" # standard library from collections import defaultdict @@ -35,148 +35,146 @@ import delphi.utils.epidate as ED from delphi.utils.geo.locations import Locations -def word_map(row,terms): - for (k,v) in terms.items(): - row = row.replace(k,v) + +def word_map(row, terms): + for (k, v) in terms.items(): + row = row.replace(k, v) return row -def date_less_than(d1,d2): - y1,m1,d1 = [int(x) for x in d1.split('-')] - y2,m2,d2 = [int(x) for x in d2.split('-')] - if y1*10000+m1*100+d10: shifted to future def date_to_epiweek(date, shift=0): - y,m,d = [int(x) for x in date.split('-')] + y, m, d = [int(x) for x in date.split("-")] - epidate = ED.EpiDate(y,m,d) + epidate = ED.EpiDate(y, m, d) epidate = epidate.add_days(shift) ew = epidate.get_ew() return ew + # convert measurment to time series format # 
startweek and endweek are inclusive -def measurement_to_ts(m,index,startweek=None,endweek=None): +def measurement_to_ts(m, index, startweek=None, endweek=None): if startweek is None: startweek = 0 if endweek is None: endweek = 999999 res = {} - for r,rdict in m.items(): - res[r]={} - for t,vals in rdict.items(): - if index>=len(vals): + for r, rdict in m.items(): + res[r] = {} + for t, vals in rdict.items(): + if index >= len(vals): raise Exception("Index is invalid") - if t>=startweek and t<=endweek: + if t >= startweek and t <= endweek: res[r][t] = vals[index] return res + class QuidelData: def __init__(self, raw_path, load_email=True): self.data_path = raw_path - self.excel_uptodate_path = join(raw_path,'excel/uptodate') - self.excel_history_path = join(raw_path,'excel/history') - self.csv_path = join(raw_path,'csv') - self.xlsx_uptodate_list = [ - f[:-5] for f in listdir(self.excel_uptodate_path) if isfile(join(self.excel_uptodate_path, f)) and f[-5:]=='.xlsx' - ] - self.xlsx_history_list = [ - f[:-5] for f in listdir(self.excel_history_path) if isfile(join(self.excel_history_path, f)) and f[-5:]=='.xlsx' - ] - self.csv_list = [f[:-4] for f in listdir(self.csv_path) if isfile(join(self.csv_path, f)) and f[-4:]=='.csv'] + self.excel_uptodate_path = join(raw_path, "excel/uptodate") + self.excel_history_path = join(raw_path, "excel/history") + self.csv_path = join(raw_path, "csv") + self.xlsx_uptodate_list = [f[:-5] for f in listdir(self.excel_uptodate_path) if isfile(join(self.excel_uptodate_path, f)) and f[-5:] == ".xlsx"] + self.xlsx_history_list = [f[:-5] for f in listdir(self.excel_history_path) if isfile(join(self.excel_history_path, f)) and f[-5:] == ".xlsx"] + self.csv_list = [f[:-4] for f in listdir(self.csv_path) if isfile(join(self.csv_path, f)) and f[-4:] == ".csv"] self.map_terms = { - ' FL 34637"':'FL', + ' FL 34637"': "FL", } # hardcoded parameters self.date_dim = 1 self.state_dim = 4 - self.fields = [ - 'sofia_ser','date','fac_id','city','state','zip','age', - 'fluA','fluB','fluAll','county','fac_type' - ] - self.fields_to_keep = ['fac_id','fluA','fluB','fluAll'] + self.fields = ["sofia_ser", "date", "fac_id", "city", "state", "zip", "age", "fluA", "fluB", "fluAll", "county", "fac_type"] + self.fields_to_keep = ["fac_id", "fluA", "fluB", "fluAll"] self.dims_to_keep = [self.fields.index(x) for x in self.fields_to_keep] if load_email: self.retrieve_excels() self.prepare_csv() def retrieve_excels(self): - detach_dir = self.excel_uptodate_path # directory where to save attachments (default: current) + detach_dir = self.excel_uptodate_path # directory where to save attachments (default: current) # connecting to the gmail imap server m = imaplib.IMAP4_SSL("imap.gmail.com") - m.login(secrets.quidel.email_addr,secrets.quidel.email_pwd) - m.select("INBOX") # here you a can choose a mail box like INBOX instead + m.login(secrets.quidel.email_addr, secrets.quidel.email_pwd) + m.select("INBOX") # here you a can choose a mail box like INBOX instead # use m.list() to get all the mailboxes - _, items = m.search(None, "ALL") # you could filter using the IMAP rules here (check http://www.example-code.com/csharp/imap-search-critera.asp) - items = items[0].split() # getting the mails id + _, items = m.search(None, "ALL") # you could filter using the IMAP rules here (check https://www.example-code.com/csharp/imap-search-critera.asp) + items = items[0].split() # getting the mails id # The emailids are ordered from past to now for emailid in items: - _, data = m.fetch(emailid, "(RFC822)") # 
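measurement_to_ts selects one of the eight weekly aggregates produced by prepare_measurements (by index) and restricts the result to an inclusive epiweek range. A toy call with invented numbers:

m = {
    "hhs1": {
        201740: [3.0, 1.0, 4.0, 10.0, 0.75, 0.25, 1.0, 2.5],
        201741: [2.0, 0.0, 2.0, 8.0, 0.50, 0.00, 0.5, 2.0],
    }
}
# Keep aggregate index 7 (the per-device record count; the index that
# quidel_update.py passes) for weeks 201740 through 201741:
ts = measurement_to_ts(m, 7, startweek=201740, endweek=201741)
# ts == {"hhs1": {201740: 2.5, 201741: 2.0}}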
fetching the mail, "`(RFC822)`" means "get the whole stuff", but you can ask for headers only, etc - email_body = data[0][1].decode('utf-8') # getting the mail content - mail = email.message_from_string(email_body) # parsing the mail content to get a mail object + _, data = m.fetch(emailid, "(RFC822)") # fetching the mail, "`(RFC822)`" means "get the whole stuff", but you can ask for headers only, etc + email_body = data[0][1].decode("utf-8") # getting the mail content + mail = email.message_from_string(email_body) # parsing the mail content to get a mail object - #Check if any attachments at all - if mail.get_content_maintype() != 'multipart': + # Check if any attachments at all + if mail.get_content_maintype() != "multipart": continue # we use walk to create a generator so we can iterate on the parts and forget about the recursive headach for part in mail.walk(): # multipart are just containers, so we skip them - if part.get_content_maintype() == 'multipart': + if part.get_content_maintype() == "multipart": continue # is this part an attachment ? - if part.get('Content-Disposition') is None: + if part.get("Content-Disposition") is None: continue filename = part.get_filename() # check duplicates - if filename[-5:]!='.xlsx' or filename[:-5] in self.xlsx_uptodate_list+self.xlsx_history_list: + if filename[-5:] != ".xlsx" or filename[:-5] in self.xlsx_uptodate_list + self.xlsx_history_list: continue self.xlsx_uptodate_list.append(filename[:-5]) att_path = os.path.join(detach_dir, filename) - #Check if its already there - if not os.path.isfile(att_path) : + # Check if its already there + if not os.path.isfile(att_path): # finally write the stuff - fp = open(att_path, 'wb') + fp = open(att_path, "wb") fp.write(part.get_payload(decode=True)) fp.close() def prepare_csv(self): - need_update=False + need_update = False for f in self.xlsx_uptodate_list: if f in self.csv_list: continue else: - need_update=True + need_update = True - date_regex = '\d{2}-\d{2}-\d{4}' - date_items = re.findall(date_regex,f) + date_regex = "\d{2}-\d{2}-\d{4}" + date_items = re.findall(date_regex, f) if date_items: - end_date = '-'.join(date_items[-1].split('-')[x] for x in [2,0,1]) + end_date = "-".join(date_items[-1].split("-")[x] for x in [2, 0, 1]) else: - print("End date not found in file name:"+f) + print("End date not found in file name:" + f) end_date = None - df_dict = pd.read_excel(join(self.excel_uptodate_path, f+'.xlsx'), sheet_name=None) - for (_,df) in df_dict.items(): - df = df.dropna(axis=0, how='all') - df['TestDate'] = df['TestDate'].apply(lambda x: x.strftime('%Y-%m-%d')) - df_filtered = df[df['TestDate']!=''] + df_dict = pd.read_excel(join(self.excel_uptodate_path, f + ".xlsx"), sheet_name=None) + for (_, df) in df_dict.items(): + df = df.dropna(axis=0, how="all") + df["TestDate"] = df["TestDate"].apply(lambda x: x.strftime("%Y-%m-%d")) + df_filtered = df[df["TestDate"] != ""] if end_date is not None: - df_filtered = df_filtered[df.apply(lambda x: date_less_than(end_date,x['TestDate'])!=1, axis=1)] - df_filtered.to_csv(join(self.csv_path, f+'.csv'), index=False, encoding='utf-8') - self.csv_list = [f[:-4] for f in listdir(self.csv_path) if isfile(join(self.csv_path, f)) and f[-4:]=='.csv'] + df_filtered = df_filtered[df.apply(lambda x: date_less_than(end_date, x["TestDate"]) != 1, axis=1)] + df_filtered.to_csv(join(self.csv_path, f + ".csv"), index=False, encoding="utf-8") + self.csv_list = [f[:-4] for f in listdir(self.csv_path) if isfile(join(self.csv_path, f)) and f[-4:] == ".csv"] self.need_update 
= need_update def load_csv(self, dims=None): @@ -186,12 +184,12 @@ def load_csv(self, dims=None): for f in self.csv_list: if f in self.xlsx_history_list: continue - rf = open(join(self.csv_path,f+'.csv')) + rf = open(join(self.csv_path, f + ".csv")) lines = rf.readlines() for l in lines[1:]: - l = word_map(l,self.map_terms) - row = l.strip().split(',') + l = word_map(l, self.map_terms) + row = l.strip().split(",") date = row[self.date_dim] state = row[self.state_dim] if state not in parsed_dict[date]: @@ -202,7 +200,7 @@ def load_csv(self, dims=None): # hardcoded aggregation function # output: [#unique_device,fluA,fluB,fluAll,total] - def prepare_measurements(self,data_dict,use_hhs=True,start_weekday=6): + def prepare_measurements(self, data_dict, use_hhs=True, start_weekday=6): buffer_dict = {} if use_hhs: region_list = Locations.hhs_list @@ -210,34 +208,33 @@ def prepare_measurements(self,data_dict,use_hhs=True,start_weekday=6): region_list = Locations.atom_list def get_hhs_region(atom): - for region in Locations.hhs_list: - if atom.lower() in Locations.hhs_map[region]: - return region - if atom.lower() == 'ny': - return 'hhs2' - return atom + for region in Locations.hhs_list: + if atom.lower() in Locations.hhs_map[region]: + return region + if atom.lower() == "ny": + return "hhs2" + return atom day_shift = 6 - start_weekday - time_map = lambda x:date_to_epiweek(x,shift=day_shift) - region_map = lambda x:get_hhs_region(x) \ - if use_hhs and x not in Locations.hhs_list else x # a bit hacky + time_map = lambda x: date_to_epiweek(x, shift=day_shift) + region_map = lambda x: get_hhs_region(x) if use_hhs and x not in Locations.hhs_list else x # a bit hacky end_date = sorted(data_dict.keys())[-1] # count the latest week in only if Thurs data is included - end_epiweek = date_to_epiweek(end_date,shift=-4) + end_epiweek = date_to_epiweek(end_date, shift=-4) # first pass: prepare device_id set device_dict = {} - for (date,daily_dict) in data_dict.items(): + for (date, daily_dict) in data_dict.items(): if not date: continue ew = time_map(date) - if ew == -1 or ew>end_epiweek: + if ew == -1 or ew > end_epiweek: continue if ew not in device_dict: - device_dict[ew]={} + device_dict[ew] = {} for r in region_list: device_dict[ew][r] = set() - for (state,rec_list) in daily_dict.items(): + for (state, rec_list) in daily_dict.items(): region = region_map(state) # get rid of non-US regions if region not in region_list: @@ -247,38 +244,40 @@ def get_hhs_region(atom): device_dict[ew][region].add(fac) # second pass: prepare all measurements - for (date,daily_dict) in data_dict.items(): + for (date, daily_dict) in data_dict.items(): ew = time_map(date) - if ew == -1 or ew>end_epiweek: + if ew == -1 or ew > end_epiweek: continue if ew not in buffer_dict: - buffer_dict[ew]={} + buffer_dict[ew] = {} for r in region_list: - buffer_dict[ew][r] = [0.0]*8 + buffer_dict[ew][r] = [0.0] * 8 - for (state,rec_list) in daily_dict.items(): + for (state, rec_list) in daily_dict.items(): region = region_map(state) # get rid of non-US regions if region not in region_list: continue for rec in rec_list: fac_num = float(len(device_dict[ew][region])) - buffer_dict[ew][region]= np.add( - buffer_dict[ew][region],[ - rec[1]=='positive', - rec[2]=='positive', - rec[3]=='positive', + buffer_dict[ew][region] = np.add( + buffer_dict[ew][region], + [ + rec[1] == "positive", + rec[2] == "positive", + rec[3] == "positive", 1.0, - float(rec[1]=='positive')/fac_num, - float(rec[2]=='positive')/fac_num, - float(rec[3]=='positive')/fac_num, - 
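Each record contributes an eight-element vector to buffer_dict[ew][region]: raw positives for fluA, fluB, and fluAll, a record count of 1, and the same four quantities divided by the number of devices reporting in that region and week. A plain-Python restatement of that contribution (illustrative sketch; rec follows the fields_to_keep layout fac_id, fluA, fluB, fluAll):

def record_contribution(rec, device_count):
    flu_a = float(rec[1] == "positive")
    flu_b = float(rec[2] == "positive")
    flu_all = float(rec[3] == "positive")
    return [
        flu_a, flu_b, flu_all, 1.0,                  # raw counts
        flu_a / device_count, flu_b / device_count,  # device-normalized
        flu_all / device_count, 1.0 / device_count,
    ]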
1.0/fac_num, - ]).tolist() + float(rec[1] == "positive") / fac_num, + float(rec[2] == "positive") / fac_num, + float(rec[3] == "positive") / fac_num, + 1.0 / fac_num, + ], + ).tolist() # switch two dims of dict result_dict = {} for r in region_list: - result_dict[r]={} - for (k,v) in buffer_dict.items(): - result_dict[r][k]=v[r] + result_dict[r] = {} + for (k, v) in buffer_dict.items(): + result_dict[r][k] = v[r] return result_dict diff --git a/src/acquisition/quidel/quidel_update.py b/src/acquisition/quidel/quidel_update.py index b6303533c..d8a93cc36 100644 --- a/src/acquisition/quidel/quidel_update.py +++ b/src/acquisition/quidel/quidel_update.py @@ -1,4 +1,4 @@ -''' +""" =============== === Purpose === =============== @@ -33,7 +33,7 @@ 2017-12-02: * original version -''' +""" # standard library import argparse @@ -49,106 +49,107 @@ from delphi.utils.geo.locations import Locations LOCATIONS = Locations.hhs_list -DATAPATH = '/home/automation/quidel_data' +DATAPATH = "/home/automation/quidel_data" + def update(locations, first=None, last=None, force_update=False, load_email=True): - # download and prepare data first - qd = quidel.QuidelData(DATAPATH,load_email) - if not qd.need_update and not force_update: - print('Data not updated, nothing needs change.') - return - - qd_data = qd.load_csv() - qd_measurements = qd.prepare_measurements(qd_data,start_weekday=4) - qd_ts = quidel.measurement_to_ts(qd_measurements,7,startweek=first,endweek=last) - # connect to the database - u, p = secrets.db.epi - cnx = mysql.connector.connect(user=u, password=p, database='epidata') - cur = cnx.cursor() - - def get_num_rows(): - cur.execute('SELECT count(1) `num` FROM `quidel`') - for (num,) in cur: - pass - return num - - # check from 4 weeks preceeding the last week with data through this week - cur.execute('SELECT max(`epiweek`) `ew0`, yearweek(now(), 6) `ew1` FROM `quidel`') - for (ew0, ew1) in cur: - ew0 = 200401 if ew0 is None else flu.add_epiweeks(ew0, -4) - ew0 = ew0 if first is None else first - ew1 = ew1 if last is None else last - print('Checking epiweeks between %d and %d...' % (ew0, ew1)) - - # keep track of how many rows were added - rows_before = get_num_rows() - - # check Quidel for new and/or revised data - sql = ''' + # download and prepare data first + qd = quidel.QuidelData(DATAPATH, load_email) + if not qd.need_update and not force_update: + print("Data not updated, nothing needs change.") + return + + qd_data = qd.load_csv() + qd_measurements = qd.prepare_measurements(qd_data, start_weekday=4) + qd_ts = quidel.measurement_to_ts(qd_measurements, 7, startweek=first, endweek=last) + # connect to the database + u, p = secrets.db.epi + cnx = mysql.connector.connect(user=u, password=p, database="epidata") + cur = cnx.cursor() + + def get_num_rows(): + cur.execute("SELECT count(1) `num` FROM `quidel`") + for (num,) in cur: + pass + return num + + # check from 4 weeks preceeding the last week with data through this week + cur.execute("SELECT max(`epiweek`) `ew0`, yearweek(now(), 6) `ew1` FROM `quidel`") + for (ew0, ew1) in cur: + ew0 = 200401 if ew0 is None else flu.add_epiweeks(ew0, -4) + ew0 = ew0 if first is None else first + ew1 = ew1 if last is None else last + print("Checking epiweeks between %d and %d..." 
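The scan window in update() starts four epiweeks before the most recent epiweek already stored (or at 200401 for an empty table) and runs through the current epiweek, so recently revised weeks are re-checked on every run. Invented example; flu is the delphi.utils epiweek module used above:

# If the newest stored epiweek is 201740, scanning starts at
# flu.add_epiweeks(201740, -4) == 201736.
# Note the upsert binds the value twice, once for the INSERT and once for
# the ON DUPLICATE KEY UPDATE clause:
#   sql_data = (location, ew, v, v)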
% (ew0, ew1)) + + # keep track of how many rows were added + rows_before = get_num_rows() + + # check Quidel for new and/or revised data + sql = """ INSERT INTO `quidel` (`location`, `epiweek`, `value`) VALUES (%s, %s, %s) ON DUPLICATE KEY UPDATE `value` = %s - ''' - - total_rows = 0 - - for location in locations: - if location not in qd_ts: - continue - ews = sorted(qd_ts[location].keys()) - num_missing = 0 - for ew in ews: - v = qd_ts[location][ew] - sql_data = (location, ew, v, v) - cur.execute(sql, sql_data) - total_rows += 1 - if v == 0: - num_missing += 1 - if num_missing > 0: - print(' [%s] missing %d/%d value(s)' % (location, num_missing, len(ews))) - - # keep track of how many rows were added - rows_after = get_num_rows() - print('Inserted %d/%d row(s)'%(rows_after - rows_before, total_rows)) - - # cleanup - cur.close() - cnx.commit() - cnx.close() + """ + + total_rows = 0 + + for location in locations: + if location not in qd_ts: + continue + ews = sorted(qd_ts[location].keys()) + num_missing = 0 + for ew in ews: + v = qd_ts[location][ew] + sql_data = (location, ew, v, v) + cur.execute(sql, sql_data) + total_rows += 1 + if v == 0: + num_missing += 1 + if num_missing > 0: + print(" [%s] missing %d/%d value(s)" % (location, num_missing, len(ews))) + + # keep track of how many rows were added + rows_after = get_num_rows() + print("Inserted %d/%d row(s)" % (rows_after - rows_before, total_rows)) + + # cleanup + cur.close() + cnx.commit() + cnx.close() def main(): - # args and usage - parser = argparse.ArgumentParser() - parser.add_argument('--location', action='store', type=str, default=None, help='location(s) (ex: all; any of hhs1-10)') - parser.add_argument('--first', '-f', default=None, type=int, help='first epiweek override') - parser.add_argument('--last', '-l', default=None, type=int, help='last epiweek override') - parser.add_argument('--force_update', '-u', action='store_true', help='force update db values') - parser.add_argument('--skip_email', '-s', action='store_true', help='skip email downloading step') - args = parser.parse_args() - - # sanity check - first, last, force_update, skip_email = args.first, args.last, args.force_update, args.skip_email - load_email = not skip_email - if first is not None: - flu.check_epiweek(first) - if last is not None: - flu.check_epiweek(last) - if first is not None and last is not None and first > last: - raise Exception('epiweeks in the wrong order') - - # decide what to update - if args.location.lower() == 'all': - locations = LOCATIONS - else: - locations = args.location.lower().split(',') - - # run the update - update(locations, first, last, force_update, load_email) - - -if __name__ == '__main__': - main() + # args and usage + parser = argparse.ArgumentParser() + parser.add_argument("--location", action="store", type=str, default=None, help="location(s) (ex: all; any of hhs1-10)") + parser.add_argument("--first", "-f", default=None, type=int, help="first epiweek override") + parser.add_argument("--last", "-l", default=None, type=int, help="last epiweek override") + parser.add_argument("--force_update", "-u", action="store_true", help="force update db values") + parser.add_argument("--skip_email", "-s", action="store_true", help="skip email downloading step") + args = parser.parse_args() + + # sanity check + first, last, force_update, skip_email = args.first, args.last, args.force_update, args.skip_email + load_email = not skip_email + if first is not None: + flu.check_epiweek(first) + if last is not None: + flu.check_epiweek(last) + 
if first is not None and last is not None and first > last: + raise Exception("epiweeks in the wrong order") + + # decide what to update + if args.location.lower() == "all": + locations = LOCATIONS + else: + locations = args.location.lower().split(",") + + # run the update + update(locations, first, last, force_update, load_email) + + +if __name__ == "__main__": + main() diff --git a/src/acquisition/twtr/healthtweets.py b/src/acquisition/twtr/healthtweets.py index 78eb2b3ec..763298cee 100644 --- a/src/acquisition/twtr/healthtweets.py +++ b/src/acquisition/twtr/healthtweets.py @@ -1,4 +1,4 @@ -''' +""" =============== === Purpose === =============== @@ -20,7 +20,7 @@ * Fetching daily values instead of weekly values 2015-03-?? * Original version -''' +""" # standard library import argparse @@ -36,132 +36,190 @@ class HealthTweets: - # mapping from state abbreviations to location codes used by healthtweets.org - STATE_CODES = {'AL': 3024, 'AK': 3025, 'AZ': 3026, 'AR': 3027, 'CA': 440, 'CO': 3029, 'CT': 3030, 'DE': 3031, 'DC': 3032, 'FL': 3033, 'GA': 3034, 'HI': 3035, 'ID': 3036, 'IL': 3037, 'IN': 3038, 'IA': 3039, 'KS': 3040, 'KY': 3041, 'LA': 2183, 'ME': 3043, 'MD': 3044, 'MA': 450, 'MI': 3046, 'MN': 3047, 'MS': 3048, 'MO': 3049, 'MT': 3050, 'NE': 3051, 'NV': 3052, 'NH': 3053, 'NJ': 478, 'NM': 2225, 'NY': 631, 'NC': 3057, 'ND': 3058, 'OH': 3059, 'OK': 3060, 'OR': 281, 'PA': 3062, 'RI': 3063, 'SC': 3064, 'SD': 3065, 'TN': 3066, 'TX': 3067, 'UT': 2272, 'VT': 3069, 'VA': 3070, 'WA': 3071, 'WV': 3072, 'WI': 3073, 'WY': 3074} - - def __init__(self, username, password, debug=False): - self.debug = debug - self.session = requests.Session() - # spoof a web browser - self.session.headers.update({ - 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36', - }) - # get the login token - response = self._go('http://www.healthtweets.org/accounts/login') - token = self._get_token(response.text) - if self.debug: - print('token=%s'%(token)) - data = { - 'csrfmiddlewaretoken': token, - 'username': username, - 'password': password, - 'next': '/', + # mapping from state abbreviations to location codes used by healthtweets.org + STATE_CODES = { + "AL": 3024, + "AK": 3025, + "AZ": 3026, + "AR": 3027, + "CA": 440, + "CO": 3029, + "CT": 3030, + "DE": 3031, + "DC": 3032, + "FL": 3033, + "GA": 3034, + "HI": 3035, + "ID": 3036, + "IL": 3037, + "IN": 3038, + "IA": 3039, + "KS": 3040, + "KY": 3041, + "LA": 2183, + "ME": 3043, + "MD": 3044, + "MA": 450, + "MI": 3046, + "MN": 3047, + "MS": 3048, + "MO": 3049, + "MT": 3050, + "NE": 3051, + "NV": 3052, + "NH": 3053, + "NJ": 478, + "NM": 2225, + "NY": 631, + "NC": 3057, + "ND": 3058, + "OH": 3059, + "OK": 3060, + "OR": 281, + "PA": 3062, + "RI": 3063, + "SC": 3064, + "SD": 3065, + "TN": 3066, + "TX": 3067, + "UT": 2272, + "VT": 3069, + "VA": 3070, + "WA": 3071, + "WV": 3072, + "WI": 3073, + "WY": 3074, } - # login to the site - response = self._go('http://www.healthtweets.org/accounts/login', data=data) - if response.status_code != 200 or 'Your username and password' in response.text: - raise Exception('login failed') - - def get_values(self, state, date1, date2): - ''' - state: two-letter state abbreviation (see STATE_CODES) - date1: the first date in the range, inclusive (format: YYYY-MM-DD) - date2: the last date in the range, inclusive (format: YYYY-MM-DD) - returns a dictionary (by date) of number of flu tweets (num) and total tweets (total) - ''' - # get raw values (number of flu tweets) and normalized 
values (flu tweets as a percent of total tweets) - raw_values = self._get_values(state, date1, date2, False) - normalized_values = self._get_values(state, date1, date2, True) - values = {} - # save the raw number and calculate the total - for date in raw_values.keys(): - if normalized_values[date] == 0: - continue - values[date] = { - 'num': round(raw_values[date]), - 'total': round(100 * raw_values[date] / normalized_values[date]), - } - print(date, raw_values[date], normalized_values[date]) - return values - - def _get_values(self, state, date1, date2, normalized): - if state not in HealthTweets.STATE_CODES: - raise Exception('invalid state') - state_code = HealthTweets.STATE_CODES[state] - d1, d2 = datetime.strptime(date1, '%Y-%m-%d'), datetime.strptime(date2, '%Y-%m-%d') - s1, s2 = d1.strftime('%m%%2F%d%%2F%Y'), d2.strftime('%m%%2F%d%%2F%Y') - count_type = 'normalized' if normalized else 'raw' - url = 'http://www.healthtweets.org/trends/plot?resolution=Day&count_type=%s&dayNum=%d&from=%s&to=%s&plot1_disease=65&location_plot1=%d'%(count_type, (d2 - d1).days, s1, s2, state_code) - response = self._go('http://www.healthtweets.org/trends/plot?resolution=Day&count_type=%s&dayNum=%d&from=%s&to=%s&plot1_disease=65&location_plot1=%d'%(count_type, (d2 - d1).days, s1, s2, state_code)) - #print(state, date1, date2, normalized) - #print(url) - #print(response.status_code) - if response.status_code != 200: - raise Exception('plot status is ' + str(response.status_code) + ' (when was data last updated?)') - lines = [line.strip() for line in response.text.split('\n')] - data_line = [line for line in lines if line[:16] == 'var chartData = '] - if len(data_line) != 1: - raise Exception('lookup failed') - values = json.loads(data_line[0][16:-1]) - return dict([(datetime.strptime(v[0], '%m/%d/%Y').strftime('%Y-%m-%d'), float(v[1])) for v in values]) - - def check_state(self, state): - ''' - Sanity checks state code mapping. 
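Because the normalized series reports flu tweets as a percent of all tweets, get_values recovers the daily total as 100 * raw / normalized. A worked example with invented numbers:

raw = 18.0          # flu tweets counted that day
normalized = 0.045  # flu tweets as a percent of all tweets, i.e. 0.045%
total = round(100 * raw / normalized)  # 40000 tweets in total
num = round(raw)                       # 18 flu tweets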
- state: two-letter state abbreviation (see STATE_CODES) - returns the full state name associated with the state abbreviation - ''' - if state not in HealthTweets.STATE_CODES: - raise Exception('invalid state') - state_code = HealthTweets.STATE_CODES[state] - response = self._go('http://www.healthtweets.org/trends/plot?resolution=Day&count_type=normalized&dayNum=7&from=01%%2F01%%2F2015&to=01%%2F07%%2F2015&plot1_disease=65&location_plot1=%d'%(state_code)) - lines = [line.strip() for line in response.text.split('\n')] - data_line = [line for line in lines if line[:29] == 'var plotNames = ["Influenza ('] - if len(data_line) == 0: - raise Exception('check failed') - name = data_line[0][29:] - name = name.split('(')[0] - return name.strip() - - def _get_token(self, html): - page = PageParser.parse(html) - hidden = PageParser.filter_all(page, [('html',), ('body',), ('div',), ('div',), ('div',), ('form',), ('input',)]) - return hidden['attrs']['value'] - - def _go(self, url, method=None, referer=None, data=None): - if self.debug: - print('%s'%(url)) - if method is None: - if data is None: - method = self.session.get - else: - method = self.session.post - response = method(url, headers={'referer': referer}, data=data) - html = response.text - if self.debug: - for item in response.history: - print(' [%d to %s]'%(item.status_code, item.headers['Location'])) - print(' %d (%d bytes)'%(response.status_code, len(html))) - return response + + def __init__(self, username, password, debug=False): + self.debug = debug + self.session = requests.Session() + # spoof a web browser + self.session.headers.update( + { + "User-Agent": "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36", + } + ) + # get the login token + response = self._go("https://www.healthtweets.org/accounts/login") + token = self._get_token(response.text) + if self.debug: + print("token=%s" % (token)) + data = { + "csrfmiddlewaretoken": token, + "username": username, + "password": password, + "next": "/", + } + # login to the site + response = self._go("https://www.healthtweets.org/accounts/login", data=data) + if response.status_code != 200 or "Your username and password" in response.text: + raise Exception("login failed") + + def get_values(self, state, date1, date2): + """ + state: two-letter state abbreviation (see STATE_CODES) + date1: the first date in the range, inclusive (format: YYYY-MM-DD) + date2: the last date in the range, inclusive (format: YYYY-MM-DD) + returns a dictionary (by date) of number of flu tweets (num) and total tweets (total) + """ + # get raw values (number of flu tweets) and normalized values (flu tweets as a percent of total tweets) + raw_values = self._get_values(state, date1, date2, False) + normalized_values = self._get_values(state, date1, date2, True) + values = {} + # save the raw number and calculate the total + for date in raw_values.keys(): + if normalized_values[date] == 0: + continue + values[date] = { + "num": round(raw_values[date]), + "total": round(100 * raw_values[date] / normalized_values[date]), + } + print(date, raw_values[date], normalized_values[date]) + return values + + def _get_values(self, state, date1, date2, normalized): + if state not in HealthTweets.STATE_CODES: + raise Exception("invalid state") + state_code = HealthTweets.STATE_CODES[state] + d1, d2 = datetime.strptime(date1, "%Y-%m-%d"), datetime.strptime(date2, "%Y-%m-%d") + s1, s2 = d1.strftime("%m%%2F%d%%2F%Y"), d2.strftime("%m%%2F%d%%2F%Y") + count_type = "normalized" if 
normalized else "raw" + url = "https://www.healthtweets.org/trends/plot?resolution=Day&count_type=%s&dayNum=%d&from=%s&to=%s&plot1_disease=65&location_plot1=%d" % (count_type, (d2 - d1).days, s1, s2, state_code) + response = self._go( + "https://www.healthtweets.org/trends/plot?resolution=Day&count_type=%s&dayNum=%d&from=%s&to=%s&plot1_disease=65&location_plot1=%d" % (count_type, (d2 - d1).days, s1, s2, state_code) + ) + # print(state, date1, date2, normalized) + # print(url) + # print(response.status_code) + if response.status_code != 200: + raise Exception("plot status is " + str(response.status_code) + " (when was data last updated?)") + lines = [line.strip() for line in response.text.split("\n")] + data_line = [line for line in lines if line[:16] == "var chartData = "] + if len(data_line) != 1: + raise Exception("lookup failed") + values = json.loads(data_line[0][16:-1]) + return dict([(datetime.strptime(v[0], "%m/%d/%Y").strftime("%Y-%m-%d"), float(v[1])) for v in values]) + + def check_state(self, state): + """ + Sanity checks state code mapping. + state: two-letter state abbreviation (see STATE_CODES) + returns the full state name associated with the state abbreviation + """ + if state not in HealthTweets.STATE_CODES: + raise Exception("invalid state") + state_code = HealthTweets.STATE_CODES[state] + response = self._go( + "https://www.healthtweets.org/trends/plot?resolution=Day&count_type=normalized&dayNum=7&from=01%%2F01%%2F2015&to=01%%2F07%%2F2015&plot1_disease=65&location_plot1=%d" % (state_code) + ) + lines = [line.strip() for line in response.text.split("\n")] + data_line = [line for line in lines if line[:29] == 'var plotNames = ["Influenza ('] + if len(data_line) == 0: + raise Exception("check failed") + name = data_line[0][29:] + name = name.split("(")[0] + return name.strip() + + def _get_token(self, html): + page = PageParser.parse(html) + hidden = PageParser.filter_all(page, [("html",), ("body",), ("div",), ("div",), ("div",), ("form",), ("input",)]) + return hidden["attrs"]["value"] + + def _go(self, url, method=None, referer=None, data=None): + if self.debug: + print("%s" % (url)) + if method is None: + if data is None: + method = self.session.get + else: + method = self.session.post + response = method(url, headers={"referer": referer}, data=data) + html = response.text + if self.debug: + for item in response.history: + print(" [%d to %s]" % (item.status_code, item.headers["Location"])) + print(" %d (%d bytes)" % (response.status_code, len(html))) + return response def main(): - # args and usage - parser = argparse.ArgumentParser() - parser.add_argument('username', action='store', type=str, help='healthtweets.org username') - parser.add_argument('password', action='store', type=str, help='healthtweets.org password') - parser.add_argument('state', action='store', type=str, choices=list(HealthTweets.STATE_CODES.keys()), help='U.S. 
state (ex: TX)') - parser.add_argument('date1', action='store', type=str, help='first date, inclusive (ex: 2015-01-01)') - parser.add_argument('date2', action='store', type=str, help='last date, inclusive (ex: 2015-01-01)') - parser.add_argument('-d', '--debug', action='store_const', const=True, default=False, help='enable debug mode') - args = parser.parse_args() - - ht = HealthTweets(args.username, args.password, debug=args.debug) - values = ht.get_values(args.state, args.date1, args.date2) - print('Daily counts in %s from %s to %s:'%(ht.check_state(args.state), args.date1, args.date2)) - for date in sorted(list(values.keys())): - print('%s: num=%-4d total=%-5d (%.3f%%)'%(date, values[date]['num'], values[date]['total'], 100 * values[date]['num'] / values[date]['total'])) - - -if __name__ == '__main__': - main() + # args and usage + parser = argparse.ArgumentParser() + parser.add_argument("username", action="store", type=str, help="healthtweets.org username") + parser.add_argument("password", action="store", type=str, help="healthtweets.org password") + parser.add_argument("state", action="store", type=str, choices=list(HealthTweets.STATE_CODES.keys()), help="U.S. state (ex: TX)") + parser.add_argument("date1", action="store", type=str, help="first date, inclusive (ex: 2015-01-01)") + parser.add_argument("date2", action="store", type=str, help="last date, inclusive (ex: 2015-01-01)") + parser.add_argument("-d", "--debug", action="store_const", const=True, default=False, help="enable debug mode") + args = parser.parse_args() + + ht = HealthTweets(args.username, args.password, debug=args.debug) + values = ht.get_values(args.state, args.date1, args.date2) + print("Daily counts in %s from %s to %s:" % (ht.check_state(args.state), args.date1, args.date2)) + for date in sorted(list(values.keys())): + print("%s: num=%-4d total=%-5d (%.3f%%)" % (date, values[date]["num"], values[date]["total"], 100 * values[date]["num"] / values[date]["total"])) + + +if __name__ == "__main__": + main() diff --git a/src/acquisition/twtr/pageparser.py b/src/acquisition/twtr/pageparser.py index 5e9aaaea1..2b2183c89 100644 --- a/src/acquisition/twtr/pageparser.py +++ b/src/acquisition/twtr/pageparser.py @@ -5,74 +5,73 @@ class PageParser(HTMLParser): - ''' - This is an HTML parser! All of the hard work is done by the superclass - (which is a Python built-in). This class puts the HTML into a hierarchy - that's (hopefully) easier to work with than raw string parsing. - ''' + """ + This is an HTML parser! All of the hard work is done by the superclass + (which is a Python built-in). This class puts the HTML into a hierarchy + that's (hopefully) easier to work with than raw string parsing. 
+ """ - @staticmethod - def parse(html): - parser = PageParser() - parser.feed(html) - return parser.get_root_node() + @staticmethod + def parse(html): + parser = PageParser() + parser.feed(html) + return parser.get_root_node() - @staticmethod - def banlist(): - '''Commonly unclosed tags''' - return ('br', 'img', 'meta') + @staticmethod + def banlist(): + """Commonly unclosed tags""" + return ("br", "img", "meta") - @staticmethod - def new_node(type): - '''An empty node of the HTML tree''' - return {'type': type, 'attrs': {}, 'nodes': [], 'data': ''} + @staticmethod + def new_node(type): + """An empty node of the HTML tree""" + return {"type": type, "attrs": {}, "nodes": [], "data": ""} - @staticmethod - def filter_all(node, filters): - '''Applies all filters''' - for f in filters: - node = PageParser.filter(node, *f) - return node + @staticmethod + def filter_all(node, filters): + """Applies all filters""" + for f in filters: + node = PageParser.filter(node, *f) + return node - @staticmethod - def filter(node, type, index=0): - '''Finds a sub-node of the given type, specified by index''' - i = 0 - for node in node['nodes']: - if node['type'] == type: - if i == index: - return node - i += 1 - return None + @staticmethod + def filter(node, type, index=0): + """Finds a sub-node of the given type, specified by index""" + i = 0 + for node in node["nodes"]: + if node["type"] == type: + if i == index: + return node + i += 1 + return None - def __init__(self): - HTMLParser.__init__(self) - self.root = PageParser.new_node(None) - self.stack = [self.root] - self.indent = 0 + def __init__(self): + HTMLParser.__init__(self) + self.root = PageParser.new_node(None) + self.stack = [self.root] + self.indent = 0 - def get_root_node(self): - '''After parsing, returns the abstract root node (which contains the html node)''' - return self.root + def get_root_node(self): + """After parsing, returns the abstract root node (which contains the html node)""" + return self.root - def handle_starttag(self, tag, attrs): - '''Inherited - called when a start tag is found''' - if tag in PageParser.banlist(): - return - element = PageParser.new_node(tag) - for (k, v) in attrs: - element['attrs'][k] = v - self.stack[-1]['nodes'].append(element) - self.stack.append(element) + def handle_starttag(self, tag, attrs): + """Inherited - called when a start tag is found""" + if tag in PageParser.banlist(): + return + element = PageParser.new_node(tag) + for (k, v) in attrs: + element["attrs"][k] = v + self.stack[-1]["nodes"].append(element) + self.stack.append(element) - def handle_endtag(self, tag): - '''Inherited - called when an end tag is found''' - if tag in PageParser.banlist(): - return - self.stack.pop() + def handle_endtag(self, tag): + """Inherited - called when an end tag is found""" + if tag in PageParser.banlist(): + return + self.stack.pop() - - def handle_data(self, data): - '''Inherited - called when a data string is found''' - element = self.stack[-1] - element['data'] += data + def handle_data(self, data): + """Inherited - called when a data string is found""" + element = self.stack[-1] + element["data"] += data diff --git a/src/acquisition/twtr/twitter_update.py b/src/acquisition/twtr/twitter_update.py index 5c1f3f45b..b2e270c97 100644 --- a/src/acquisition/twtr/twitter_update.py +++ b/src/acquisition/twtr/twitter_update.py @@ -1,4 +1,4 @@ -''' +""" =============== === Purpose === =============== @@ -49,7 +49,7 @@ * Small documentation update 2015-05-22 * Original version -''' +""" # third party import 
mysql.connector @@ -60,46 +60,46 @@ def run(): - # connect to the database - u, p = secrets.db.epi - cnx = mysql.connector.connect(user=u, password=p, database='epidata') - cur = cnx.cursor() - - def get_num_rows(): - cur.execute('SELECT count(1) `num` FROM `twitter`') - for (num,) in cur: - pass - return num - - # check from 7 days preceeding the last date with data through yesterday (healthtweets.org 404's if today's date is part of the range) - cur.execute('SELECT date_sub(max(`date`), INTERVAL 7 DAY) `date1`, date_sub(date(now()), INTERVAL 1 DAY) `date2` FROM `twitter`') - for (date1, date2) in cur: - date1, date2 = date1.strftime('%Y-%m-%d'), date2.strftime('%Y-%m-%d') - print('Checking dates between %s and %s...'%(date1, date2)) - - # keep track of how many rows were added - rows_before = get_num_rows() - - # check healthtweets.org for new and/or revised data - ht = HealthTweets(*secrets.healthtweets.login) - sql = 'INSERT INTO `twitter` (`date`, `state`, `num`, `total`) VALUES (%s, %s, %s, %s) ON DUPLICATE KEY UPDATE `num` = %s, `total` = %s' - total_rows = 0 - for state in sorted(HealthTweets.STATE_CODES.keys()): - values = ht.get_values(state, date1, date2) - for date in sorted(list(values.keys())): - sql_data = (date, state, values[date]['num'], values[date]['total'], values[date]['num'], values[date]['total']) - cur.execute(sql, sql_data) - total_rows += 1 - - # keep track of how many rows were added - rows_after = get_num_rows() - print('Inserted %d/%d row(s)'%(rows_after - rows_before, total_rows)) - - # cleanup - cur.close() - cnx.commit() - cnx.close() - - -if __name__ == '__main__': - run() + # connect to the database + u, p = secrets.db.epi + cnx = mysql.connector.connect(user=u, password=p, database="epidata") + cur = cnx.cursor() + + def get_num_rows(): + cur.execute("SELECT count(1) `num` FROM `twitter`") + for (num,) in cur: + pass + return num + + # check from 7 days preceeding the last date with data through yesterday (healthtweets.org 404's if today's date is part of the range) + cur.execute("SELECT date_sub(max(`date`), INTERVAL 7 DAY) `date1`, date_sub(date(now()), INTERVAL 1 DAY) `date2` FROM `twitter`") + for (date1, date2) in cur: + date1, date2 = date1.strftime("%Y-%m-%d"), date2.strftime("%Y-%m-%d") + print("Checking dates between %s and %s..." % (date1, date2)) + + # keep track of how many rows were added + rows_before = get_num_rows() + + # check healthtweets.org for new and/or revised data + ht = HealthTweets(*secrets.healthtweets.login) + sql = "INSERT INTO `twitter` (`date`, `state`, `num`, `total`) VALUES (%s, %s, %s, %s) ON DUPLICATE KEY UPDATE `num` = %s, `total` = %s" + total_rows = 0 + for state in sorted(HealthTweets.STATE_CODES.keys()): + values = ht.get_values(state, date1, date2) + for date in sorted(list(values.keys())): + sql_data = (date, state, values[date]["num"], values[date]["total"], values[date]["num"], values[date]["total"]) + cur.execute(sql, sql_data) + total_rows += 1 + + # keep track of how many rows were added + rows_after = get_num_rows() + print("Inserted %d/%d row(s)" % (rows_after - rows_before, total_rows)) + + # cleanup + cur.close() + cnx.commit() + cnx.close() + + +if __name__ == "__main__": + run() diff --git a/src/acquisition/wiki/wiki.py b/src/acquisition/wiki/wiki.py index 602e21102..c57582918 100644 --- a/src/acquisition/wiki/wiki.py +++ b/src/acquisition/wiki/wiki.py @@ -1,112 +1,112 @@ """ -=============== -=== Purpose === -=============== - -Wrapper for the entire wiki data collection process: - 1. 
Uses wiki_update.py to fetch metadata for new access logs - 2. Uses wiki_download.py to download the access logs - 3. Uses wiki_extract.py to store article access counts - +=============== +=== Purpose === +=============== + +Wrapper for the entire wiki data collection process: + 1. Uses wiki_update.py to fetch metadata for new access logs + 2. Uses wiki_download.py to download the access logs + 3. Uses wiki_extract.py to store article access counts + See also: master.php - - -======================= -=== Data Dictionary === -======================= - -`wiki_raw` is a staging table where extracted access log data is stored for -further processing. When wiki_update.py finds a new log, it saves the name and -hash to this table, with a status of 0. This table is read by master.php, which -then hands out "jobs" (independently and in parallel) to wiki_download.py. -After wiki_download.py downloads the log and extracts the counts, it submits -the data (as JSON) to master.php, which then stores the "raw" JSON counts in -this table. -+----------+---------------+------+-----+---------+----------------+ -| Field | Type | Null | Key | Default | Extra | -+----------+---------------+------+-----+---------+----------------+ -| id | int(11) | NO | PRI | NULL | auto_increment | -| name | varchar(64) | NO | UNI | NULL | | -| hash | char(32) | NO | | NULL | | -| status | int(11) | NO | MUL | 0 | | -| size | int(11) | YES | | NULL | | -| datetime | datetime | YES | | NULL | | -| worker | varchar(256) | YES | | NULL | | -| elapsed | float | YES | | NULL | | -| data | varchar(2048) | YES | | NULL | | -+----------+---------------+------+-----+---------+----------------+ -id: unique identifier for each record -name: name of the access log -hash: md5 hash of the file, as reported by the dumps site (all zeroes if no - hash is provided) -status: the status of the job, using the following values: - 0: queued for download - 1: download in progress - 2: queued for extraction - 3: extracted to `wiki` table - (any negative value indicates failure) -size: the size, in bytes, of the downloaded file -datetime: the timestamp of the most recent status update -worker: name (user@hostname) of the machine working on the job -elapsed: time, in seconds, taken to complete the job -data: a JSON string containing counts for selected articles in the access log - -`wiki` is the table where access counts are stored (parsed from wiki_raw). The -"raw" JSON counts are parsed by wiki_extract.py and stored directly in this -table. -+----------+-------------+------+-----+---------+----------------+ -| Field | Type | Null | Key | Default | Extra | -+----------+-------------+------+-----+---------+----------------+ -| id | int(11) | NO | PRI | NULL | auto_increment | -| datetime | datetime | NO | MUL | NULL | | -| article | varchar(64) | NO | MUL | NULL | | -| count | int(11) | NO | | NULL | | -+----------+-------------+------+-----+---------+----------------+ -id: unique identifier for each record -datetime: UTC timestamp (rounded to the nearest hour) of article access -article: name of the article -count: number of times the article was accessed in the hour - -`wiki_meta` is a metadata table for this dataset. It contains pre-calculated -date and epiweeks fields, and more importantly, the total number of English -article hits (denominator) for each `datetime` in the `wiki` table. This table -is populated in parallel with `wiki` by the wiki_extract.py script. 
-+----------+----------+------+-----+---------+----------------+ -| Field | Type | Null | Key | Default | Extra | -+----------+----------+------+-----+---------+----------------+ -| id | int(11) | NO | PRI | NULL | auto_increment | -| datetime | datetime | NO | UNI | NULL | | -| date | date | NO | | NULL | | -| epiweek | int(11) | NO | | NULL | | -| total | int(11) | NO | | NULL | | -+----------+----------+------+-----+---------+----------------+ -id: unique identifier for each record -datetime: UTC timestamp (rounded to the nearest hour) of article access -date: the date portion of `datetime` -epiweek: the year and week containing `datetime` -total: total number of English article hits in the hour - - -================= -=== Changelog === -================= - + + +======================= +=== Data Dictionary === +======================= + +`wiki_raw` is a staging table where extracted access log data is stored for +further processing. When wiki_update.py finds a new log, it saves the name and +hash to this table, with a status of 0. This table is read by master.php, which +then hands out "jobs" (independently and in parallel) to wiki_download.py. +After wiki_download.py downloads the log and extracts the counts, it submits +the data (as JSON) to master.php, which then stores the "raw" JSON counts in +this table. ++----------+---------------+------+-----+---------+----------------+ +| Field | Type | Null | Key | Default | Extra | ++----------+---------------+------+-----+---------+----------------+ +| id | int(11) | NO | PRI | NULL | auto_increment | +| name | varchar(64) | NO | UNI | NULL | | +| hash | char(32) | NO | | NULL | | +| status | int(11) | NO | MUL | 0 | | +| size | int(11) | YES | | NULL | | +| datetime | datetime | YES | | NULL | | +| worker | varchar(256) | YES | | NULL | | +| elapsed | float | YES | | NULL | | +| data | varchar(2048) | YES | | NULL | | ++----------+---------------+------+-----+---------+----------------+ +id: unique identifier for each record +name: name of the access log +hash: md5 hash of the file, as reported by the dumps site (all zeroes if no + hash is provided) +status: the status of the job, using the following values: + 0: queued for download + 1: download in progress + 2: queued for extraction + 3: extracted to `wiki` table + (any negative value indicates failure) +size: the size, in bytes, of the downloaded file +datetime: the timestamp of the most recent status update +worker: name (user@hostname) of the machine working on the job +elapsed: time, in seconds, taken to complete the job +data: a JSON string containing counts for selected articles in the access log + +`wiki` is the table where access counts are stored (parsed from wiki_raw). The +"raw" JSON counts are parsed by wiki_extract.py and stored directly in this +table. ++----------+-------------+------+-----+---------+----------------+ +| Field | Type | Null | Key | Default | Extra | ++----------+-------------+------+-----+---------+----------------+ +| id | int(11) | NO | PRI | NULL | auto_increment | +| datetime | datetime | NO | MUL | NULL | | +| article | varchar(64) | NO | MUL | NULL | | +| count | int(11) | NO | | NULL | | ++----------+-------------+------+-----+---------+----------------+ +id: unique identifier for each record +datetime: UTC timestamp (rounded to the nearest hour) of article access +article: name of the article +count: number of times the article was accessed in the hour + +`wiki_meta` is a metadata table for this dataset. 
It contains pre-calculated +date and epiweeks fields, and more importantly, the total number of English +article hits (denominator) for each `datetime` in the `wiki` table. This table +is populated in parallel with `wiki` by the wiki_extract.py script. ++----------+----------+------+-----+---------+----------------+ +| Field | Type | Null | Key | Default | Extra | ++----------+----------+------+-----+---------+----------------+ +| id | int(11) | NO | PRI | NULL | auto_increment | +| datetime | datetime | NO | UNI | NULL | | +| date | date | NO | | NULL | | +| epiweek | int(11) | NO | | NULL | | +| total | int(11) | NO | | NULL | | ++----------+----------+------+-----+---------+----------------+ +id: unique identifier for each record +datetime: UTC timestamp (rounded to the nearest hour) of article access +date: the date portion of `datetime` +epiweek: the year and week containing `datetime` +total: total number of English article hits in the hour + + +================= +=== Changelog === +================= + 2017-02-24 * secrets and small improvements 2016-08-14 * Increased job limit (6 -> 12) (pageviews files are ~2x smaller) -2015-08-26 +2015-08-26 * Reduced job limit (8 -> 6) -2015-08-14 +2015-08-14 * Reduced job limit (10 -> 8) -2015-08-11 +2015-08-11 + New table `wiki_meta` -2015-05-22 +2015-05-22 * Updated status codes for `wiki_raw` table -2015-05-21 +2015-05-21 * Original version """ - + # first party from . import wiki_update from . import wiki_download @@ -115,31 +115,27 @@ def main(): - # step 1: find new access logs (aka "jobs") - print('looking for new jobs...') - try: - wiki_update.run() - except: - print('wiki_update failed') - - # step 2: run a few jobs - print('running jobs...') - try: - wiki_download.run( - secrets.wiki.hmac, - download_limit=1024 * 1024 * 1024, - job_limit=12 - ) - except: - print('wiki_download failed') - - # step 3: extract counts from the staging data - print('extracting counts...') - try: - wiki_extract.run(job_limit=100) - except: - print('wiki_extract failed') - - -if __name__ == '__main__': - main() + # step 1: find new access logs (aka "jobs") + print("looking for new jobs...") + try: + wiki_update.run() + except: + print("wiki_update failed") + + # step 2: run a few jobs + print("running jobs...") + try: + wiki_download.run(secrets.wiki.hmac, download_limit=1024 * 1024 * 1024, job_limit=12) + except: + print("wiki_download failed") + + # step 3: extract counts from the staging data + print("extracting counts...") + try: + wiki_extract.run(job_limit=100) + except: + print("wiki_extract failed") + + +if __name__ == "__main__": + main() diff --git a/src/acquisition/wiki/wiki_download.py b/src/acquisition/wiki/wiki_download.py index 1a01b7f8e..0737df6de 100644 --- a/src/acquisition/wiki/wiki_download.py +++ b/src/acquisition/wiki/wiki_download.py @@ -29,14 +29,15 @@ # python 2 and 3 from __future__ import print_function import sys + if sys.version_info.major == 2: - # python 2 libraries - from urllib import urlencode - from urllib2 import urlopen + # python 2 libraries + from urllib import urlencode + from urllib2 import urlopen else: - # python 3 libraries - from urllib.parse import urlencode - from urllib.request import urlopen + # python 3 libraries + from urllib.parse import urlencode + from urllib.request import urlopen # common libraries import argparse @@ -53,234 +54,233 @@ VERSION = 10 -MASTER_URL = 'https://delphi.cmu.edu/~automation/public/wiki/master.php' +MASTER_URL = "https://delphi.cmu.edu/~automation/public/wiki/master.php" + def 
text(data_string): - return str(data_string.decode('utf-8')) + return str(data_string.decode("utf-8")) def data(text_string): - if sys.version_info.major == 2: - return text_string - else: - return bytes(text_string, 'utf-8') + if sys.version_info.major == 2: + return text_string + else: + return bytes(text_string, "utf-8") def get_hmac_sha256(key, msg): - key_bytes, msg_bytes = key.encode('utf-8'), msg.encode('utf-8') - return hmac.new(key_bytes, msg_bytes, hashlib.sha256).hexdigest() + key_bytes, msg_bytes = key.encode("utf-8"), msg.encode("utf-8") + return hmac.new(key_bytes, msg_bytes, hashlib.sha256).hexdigest() def extract_article_counts(filename, language, articles, debug_mode): - """ - Support multiple languages ('en' | 'es' | 'pt') - Running time optimized to O(M), which means only need to scan the whole file once - :param filename: - :param language: Different languages such as 'en', 'es', and 'pt' - :param articles: - :param debug_mode: - :return: - """ - counts = {} - articles_set = set(map(lambda x: x.lower(), articles)) - total = 0 - with open(filename, "r", encoding="utf8") as f: - for line in f: - content = line.strip().split() - if len(content) != 4: - print('unexpected article format: {0}'.format(line)) - continue - article_title = content[1].lower() - article_count = int(content[2]) - if content[0] == language: - total += article_count - if content[0] == language and article_title in articles_set: - if debug_mode: - print("Find article {0}: {1}".format(article_title, line)) - counts[article_title] = article_count - if debug_mode: - print("Total number of counts for language {0} is {1}".format(language, total)) - counts['total'] = total - return counts + """ + Support multiple languages ('en' | 'es' | 'pt') + Running time optimized to O(M), which means only need to scan the whole file once + :param filename: + :param language: Different languages such as 'en', 'es', and 'pt' + :param articles: + :param debug_mode: + :return: + """ + counts = {} + articles_set = set(map(lambda x: x.lower(), articles)) + total = 0 + with open(filename, "r", encoding="utf8") as f: + for line in f: + content = line.strip().split() + if len(content) != 4: + print("unexpected article format: {0}".format(line)) + continue + article_title = content[1].lower() + article_count = int(content[2]) + if content[0] == language: + total += article_count + if content[0] == language and article_title in articles_set: + if debug_mode: + print("Find article {0}: {1}".format(article_title, line)) + counts[article_title] = article_count + if debug_mode: + print("Total number of counts for language {0} is {1}".format(language, total)) + counts["total"] = total + return counts def extract_article_counts_orig(articles, debug_mode): - """ - The original method which extracts article counts by shell command grep (only support en articles). - As it is difficult to deal with other languages (utf-8 encoding), we choose to use python read files. 
- Another things is that it is slower to go over the whole file once and once again, the time complexity is O(NM), - where N is the number of articles and M is the lines in the file - In our new implementation extract_article_counts(), the time complexity is O(M), and it can cope with utf8 encoding - :param articles: - :param debug_mode: - :return: - """ - counts = {} - for article in articles: - if debug_mode: - print(' %s' % (article)) - out = text( - subprocess.check_output('LC_ALL=C grep -a -i "^en %s " raw2 | cat' % (article.lower()), shell=True)).strip() - count = 0 - if len(out) > 0: - for line in out.split('\n'): - fields = line.split() - if len(fields) != 4: - print('unexpected article format: [%s]' % (line)) - else: - count += int(fields[2]) - # print ' %4d %s'%(count, article) - counts[article.lower()] = count + """ + The original method which extracts article counts by shell command grep (only support en articles). + As it is difficult to deal with other languages (utf-8 encoding), we choose to use python read files. + Another things is that it is slower to go over the whole file once and once again, the time complexity is O(NM), + where N is the number of articles and M is the lines in the file + In our new implementation extract_article_counts(), the time complexity is O(M), and it can cope with utf8 encoding + :param articles: + :param debug_mode: + :return: + """ + counts = {} + for article in articles: + if debug_mode: + print(" %s" % (article)) + out = text(subprocess.check_output('LC_ALL=C grep -a -i "^en %s " raw2 | cat' % (article.lower()), shell=True)).strip() + count = 0 + if len(out) > 0: + for line in out.split("\n"): + fields = line.split() + if len(fields) != 4: + print("unexpected article format: [%s]" % (line)) + else: + count += int(fields[2]) + # print ' %4d %s'%(count, article) + counts[article.lower()] = count + if debug_mode: + print(" %d" % (count)) + print("getting total count...") + out = text(subprocess.check_output('cat raw2 | LC_ALL=C grep -a -i "^en " | cut -d" " -f 3 | awk \'{s+=$1} END {printf "%.0f", s}\'', shell=True)) + total = int(out) if debug_mode: - print(' %d' % (count)) - print('getting total count...') - out = text(subprocess.check_output( - 'cat raw2 | LC_ALL=C grep -a -i "^en " | cut -d" " -f 3 | awk \'{s+=$1} END {printf "%.0f", s}\'', shell=True)) - total = int(out) - if debug_mode: - print(total) - counts['total'] = total - return counts + print(total) + counts["total"] = total + return counts def run(secret, download_limit=None, job_limit=None, sleep_time=1, job_type=0, debug_mode=False): - worker = text(subprocess.check_output("echo `whoami`@`hostname`", shell=True)).strip() - print('this is [%s]'%(worker)) - if debug_mode: - print('*** running in debug mode ***') - - total_download = 0 - passed_jobs = 0 - failed_jobs = 0 - while (download_limit is None or total_download < download_limit) and (job_limit is None or (passed_jobs + failed_jobs) < job_limit): - try: - time_start = datetime.datetime.now() - req = urlopen(MASTER_URL + '?get=x&type=%s'%(job_type)) - code = req.getcode() - if code != 200: - if code == 201: - print('no jobs available') - if download_limit is None and job_limit is None: - time.sleep(60) - continue - else: - print('nothing to do, exiting') - return - else: - raise Exception('server response code (get) was %d'%(code)) - # Make the code compatible with mac os system - if platform == "darwin": - job_content = text(req.readlines()[1]) - else: - job_content = text(req.readlines()[0]) - if job_content == 'no 
jobs': - print('no jobs available') - if download_limit is None and job_limit is None: - time.sleep(60) - continue - else: - print('nothing to do, exiting') - return - job = json.loads(job_content) - print('received job [%d|%s]'%(job['id'], job['name'])) - # updated parsing for pageviews - maybe use a regex in the future - #year, month = int(job['name'][11:15]), int(job['name'][15:17]) - year, month = int(job['name'][10:14]), int(job['name'][14:16]) - #print 'year=%d | month=%d'%(year, month) - url = 'https://dumps.wikimedia.org/other/pageviews/%d/%d-%02d/%s'%(year, year, month, job['name']) - print('downloading file [%s]...'%(url)) - subprocess.check_call('curl -s %s > raw.gz'%(url), shell=True) - print('checking file size...') - # Make the code cross-platfrom, so use python to get the size of the file - # size = int(text(subprocess.check_output('ls -l raw.gz | cut -d" " -f 5', shell=True))) - size = os.stat("raw.gz").st_size - if debug_mode: - print(size) - total_download += size - if job['hash'] != '00000000000000000000000000000000': - print('checking hash...') - out = text(subprocess.check_output('md5sum raw.gz', shell=True)) - result = out[0:32] - if result != job['hash']: - raise Exception('wrong hash [expected %s, got %s]'%(job['hash'], result)) - if debug_mode: - print(result) - print('decompressing...') - subprocess.check_call('gunzip -f raw.gz', shell=True) - #print 'converting case...' - #subprocess.check_call('cat raw | tr "[:upper:]" "[:lower:]" > raw2', shell=True) - #subprocess.check_call('rm raw', shell=True) - subprocess.check_call('mv raw raw2', shell=True) - print('extracting article counts...') - - # Use python to read the file and extract counts, if you want to use the original shell method, please use - counts = {} - for language in wiki_util.Articles.available_languages: - lang2articles = {'en': wiki_util.Articles.en_articles, 'es': wiki_util.Articles.es_articles, 'pt': wiki_util.Articles.pt_articles} - articles = lang2articles[language] - articles = sorted(articles) - if debug_mode: - print("Language is {0} and target articles are {1}".format(language, articles)) - temp_counts = extract_article_counts("raw2", language, articles, debug_mode) - counts[language] = temp_counts - - if not debug_mode: - print('deleting files...') - subprocess.check_call('rm raw2', shell=True) - print('saving results...') - time_stop = datetime.datetime.now() - result = { - 'id': job['id'], - 'size': size, - 'data': json.dumps(counts), - 'worker': worker, - 'elapsed': (time_stop - time_start).total_seconds(), - } - payload = json.dumps(result) - hmac_str = get_hmac_sha256(secret, payload) - if debug_mode: - print(' hmac: %s' % hmac_str) - post_data = urlencode({'put': payload, 'hmac': hmac_str}) - req = urlopen(MASTER_URL, data=data(post_data)) - code = req.getcode() - if code != 200: - raise Exception('server response code (put) was %d'%(code)) - print('done! 
(dl=%d)'%(total_download)) - passed_jobs += 1 - except Exception as ex: - print('***** Caught Exception: %s *****'%(str(ex))) - failed_jobs += 1 - time.sleep(30) - print('passed=%d | failed=%d | total=%d'%(passed_jobs, failed_jobs, passed_jobs + failed_jobs)) - time.sleep(sleep_time) - - if download_limit is not None and total_download >= download_limit: - print('download limit has been reached [%d >= %d]'%(total_download, download_limit)) - if job_limit is not None and (passed_jobs + failed_jobs) >= job_limit: - print('job limit has been reached [%d >= %d]'%(passed_jobs + failed_jobs, job_limit)) + worker = text(subprocess.check_output("echo `whoami`@`hostname`", shell=True)).strip() + print("this is [%s]" % (worker)) + if debug_mode: + print("*** running in debug mode ***") + + total_download = 0 + passed_jobs = 0 + failed_jobs = 0 + while (download_limit is None or total_download < download_limit) and (job_limit is None or (passed_jobs + failed_jobs) < job_limit): + try: + time_start = datetime.datetime.now() + req = urlopen(MASTER_URL + "?get=x&type=%s" % (job_type)) + code = req.getcode() + if code != 200: + if code == 201: + print("no jobs available") + if download_limit is None and job_limit is None: + time.sleep(60) + continue + else: + print("nothing to do, exiting") + return + else: + raise Exception("server response code (get) was %d" % (code)) + # Make the code compatible with mac os system + if platform == "darwin": + job_content = text(req.readlines()[1]) + else: + job_content = text(req.readlines()[0]) + if job_content == "no jobs": + print("no jobs available") + if download_limit is None and job_limit is None: + time.sleep(60) + continue + else: + print("nothing to do, exiting") + return + job = json.loads(job_content) + print("received job [%d|%s]" % (job["id"], job["name"])) + # updated parsing for pageviews - maybe use a regex in the future + # year, month = int(job['name'][11:15]), int(job['name'][15:17]) + year, month = int(job["name"][10:14]), int(job["name"][14:16]) + # print 'year=%d | month=%d'%(year, month) + url = "https://dumps.wikimedia.org/other/pageviews/%d/%d-%02d/%s" % (year, year, month, job["name"]) + print("downloading file [%s]..." % (url)) + subprocess.check_call("curl -s %s > raw.gz" % (url), shell=True) + print("checking file size...") + # Make the code cross-platfrom, so use python to get the size of the file + # size = int(text(subprocess.check_output('ls -l raw.gz | cut -d" " -f 5', shell=True))) + size = os.stat("raw.gz").st_size + if debug_mode: + print(size) + total_download += size + if job["hash"] != "00000000000000000000000000000000": + print("checking hash...") + out = text(subprocess.check_output("md5sum raw.gz", shell=True)) + result = out[0:32] + if result != job["hash"]: + raise Exception("wrong hash [expected %s, got %s]" % (job["hash"], result)) + if debug_mode: + print(result) + print("decompressing...") + subprocess.check_call("gunzip -f raw.gz", shell=True) + # print 'converting case...' 
+ # subprocess.check_call('cat raw | tr "[:upper:]" "[:lower:]" > raw2', shell=True) + # subprocess.check_call('rm raw', shell=True) + subprocess.check_call("mv raw raw2", shell=True) + print("extracting article counts...") + + # Use python to read the file and extract counts, if you want to use the original shell method, please use + counts = {} + for language in wiki_util.Articles.available_languages: + lang2articles = {"en": wiki_util.Articles.en_articles, "es": wiki_util.Articles.es_articles, "pt": wiki_util.Articles.pt_articles} + articles = lang2articles[language] + articles = sorted(articles) + if debug_mode: + print("Language is {0} and target articles are {1}".format(language, articles)) + temp_counts = extract_article_counts("raw2", language, articles, debug_mode) + counts[language] = temp_counts + + if not debug_mode: + print("deleting files...") + subprocess.check_call("rm raw2", shell=True) + print("saving results...") + time_stop = datetime.datetime.now() + result = { + "id": job["id"], + "size": size, + "data": json.dumps(counts), + "worker": worker, + "elapsed": (time_stop - time_start).total_seconds(), + } + payload = json.dumps(result) + hmac_str = get_hmac_sha256(secret, payload) + if debug_mode: + print(" hmac: %s" % hmac_str) + post_data = urlencode({"put": payload, "hmac": hmac_str}) + req = urlopen(MASTER_URL, data=data(post_data)) + code = req.getcode() + if code != 200: + raise Exception("server response code (put) was %d" % (code)) + print("done! (dl=%d)" % (total_download)) + passed_jobs += 1 + except Exception as ex: + print("***** Caught Exception: %s *****" % (str(ex))) + failed_jobs += 1 + time.sleep(30) + print("passed=%d | failed=%d | total=%d" % (passed_jobs, failed_jobs, passed_jobs + failed_jobs)) + time.sleep(sleep_time) + + if download_limit is not None and total_download >= download_limit: + print("download limit has been reached [%d >= %d]" % (total_download, download_limit)) + if job_limit is not None and (passed_jobs + failed_jobs) >= job_limit: + print("job limit has been reached [%d >= %d]" % (passed_jobs + failed_jobs, job_limit)) def main(): - # version info - print('version', VERSION) + # version info + print("version", VERSION) - # args and usage - parser = argparse.ArgumentParser() - parser.add_argument('secret', type=str, help='hmac secret key') - parser.add_argument('-b', '--blimit', action='store', type=int, default=None, help='download limit, in bytes') - parser.add_argument('-j', '--jlimit', action='store', type=int, default=None, help='job limit') - parser.add_argument('-s', '--sleep', action='store', type=int, default=1, help='seconds to sleep between each job') - parser.add_argument('-t', '--type', action='store', type=int, default=0, help='type of job') - parser.add_argument('-d', '--debug', action='store_const', const=True, default=False, help='enable debug mode') - args = parser.parse_args() + # args and usage + parser = argparse.ArgumentParser() + parser.add_argument("secret", type=str, help="hmac secret key") + parser.add_argument("-b", "--blimit", action="store", type=int, default=None, help="download limit, in bytes") + parser.add_argument("-j", "--jlimit", action="store", type=int, default=None, help="job limit") + parser.add_argument("-s", "--sleep", action="store", type=int, default=1, help="seconds to sleep between each job") + parser.add_argument("-t", "--type", action="store", type=int, default=0, help="type of job") + parser.add_argument("-d", "--debug", action="store_const", const=True, default=False, help="enable 
debug mode") + args = parser.parse_args() - # runtime options - secret, download_limit, job_limit, sleep_time, job_type, debug_mode = args.secret, args.blimit, args.jlimit, args.sleep, args.type, args.debug + # runtime options + secret, download_limit, job_limit, sleep_time, job_type, debug_mode = args.secret, args.blimit, args.jlimit, args.sleep, args.type, args.debug - # run - run(secret, download_limit, job_limit, sleep_time, job_type, debug_mode) + # run + run(secret, download_limit, job_limit, sleep_time, job_type, debug_mode) -if __name__ == '__main__': - main() +if __name__ == "__main__": + main() diff --git a/src/acquisition/wiki/wiki_extract.py b/src/acquisition/wiki/wiki_extract.py index 839d7d6dc..cdcc440a6 100644 --- a/src/acquisition/wiki/wiki_extract.py +++ b/src/acquisition/wiki/wiki_extract.py @@ -35,74 +35,80 @@ def floor_timestamp(timestamp): - return datetime(timestamp.year, timestamp.month, timestamp.day, timestamp.hour) + return datetime(timestamp.year, timestamp.month, timestamp.day, timestamp.hour) def ceil_timestamp(timestamp): - return floor_timestamp(timestamp) + timedelta(hours=1) + return floor_timestamp(timestamp) + timedelta(hours=1) def round_timestamp(timestamp): - before = floor_timestamp(timestamp) - after = ceil_timestamp(timestamp) - if (timestamp - before) < (after - timestamp): - return before - else: - return after + before = floor_timestamp(timestamp) + after = ceil_timestamp(timestamp) + if (timestamp - before) < (after - timestamp): + return before + else: + return after def get_timestamp(name): - # new parsing for pageviews compared to pagecounts - maybe switch to regex in the future - #return datetime(int(name[11:15]), int(name[15:17]), int(name[17:19]), int(name[20:22]), int(name[22:24]), int(name[24:26])) - return datetime(int(name[10:14]), int(name[14:16]), int(name[16:18]), int(name[19:21]), int(name[21:23]), int(name[23:25])) + # new parsing for pageviews compared to pagecounts - maybe switch to regex in the future + # return datetime(int(name[11:15]), int(name[15:17]), int(name[17:19]), int(name[20:22]), int(name[22:24]), int(name[24:26])) + return datetime(int(name[10:14]), int(name[14:16]), int(name[16:18]), int(name[19:21]), int(name[21:23]), int(name[23:25])) def run(job_limit=100): - # connect to the database - u, p = secrets.db.epi - cnx = mysql.connector.connect(user=u, password=p, database='epidata') - cur = cnx.cursor() - - # # Some preparation for utf-8, and it is a temporary trick solution. 
The real solution should change those char set and collation encoding to utf8 permanently - # cur.execute("SET NAMES utf8;") - # cur.execute("SET CHARACTER SET utf8;") - # # I print SHOW SESSION VARIABLES LIKE 'character\_set\_%'; and SHOW SESSION VARIABLES LIKE 'collation\_%'; on my local computer - # cur.execute("SET character_set_client=utf8mb4;") - # cur.execute("SET character_set_connection=utf8mb4;") - # cur.execute("SET character_set_database=utf8;") - # cur.execute("SET character_set_results=utf8mb4;") - # cur.execute("SET character_set_server=utf8;") - # cur.execute("SET collation_connection=utf8mb4_general_ci;") - # cur.execute("SET collation_database=utf8_general_ci;") - # cur.execute("SET collation_server=utf8_general_ci;") - - # find jobs that are queued for extraction - cur.execute('SELECT `id`, `name`, `data` FROM `wiki_raw` WHERE `status` = 2 ORDER BY `name` ASC LIMIT %s', (job_limit,)) - jobs = [] - for (id, name, data_str) in cur: - jobs.append((id, name, json.loads(data_str))) - print('Processing data from %d jobs'%(len(jobs))) - - # get the counts from the json object and insert into (or update) the database - # Notice that data_collect contains data with different languages - for (id, name, data_collect) in jobs: - print('processing job [%d|%s]...'%(id, name)) - timestamp = round_timestamp(get_timestamp(name)) - for language in data_collect.keys(): - data = data_collect[language] - for article in sorted(data.keys()): - count = data[article] - cur.execute('INSERT INTO `wiki` (`datetime`, `article`, `count`, `language`) VALUES (%s, %s, %s, %s) ON DUPLICATE KEY UPDATE `count` = `count` + %s', (str(timestamp), article.encode('utf-8').decode('latin-1'), count, language, count)) - if article == 'total': - cur.execute('INSERT INTO `wiki_meta` (`datetime`, `date`, `epiweek`, `total`, `language`) VALUES (%s, date(%s), yearweek(%s, 6), %s, %s) ON DUPLICATE KEY UPDATE `total` = `total` + %s', (str(timestamp), str(timestamp), str(timestamp), count, language, count)) - # update the job - cur.execute('UPDATE `wiki_raw` SET `status` = 3 WHERE `id` = %s', (id,)) - - # cleanup - cur.close() - cnx.commit() - cnx.close() - - -if __name__ == '__main__': - run() + # connect to the database + u, p = secrets.db.epi + cnx = mysql.connector.connect(user=u, password=p, database="epidata") + cur = cnx.cursor() + + # # Some preparation for utf-8, and it is a temporary trick solution. 
The real solution should change those char set and collation encoding to utf8 permanently + # cur.execute("SET NAMES utf8;") + # cur.execute("SET CHARACTER SET utf8;") + # # I print SHOW SESSION VARIABLES LIKE 'character\_set\_%'; and SHOW SESSION VARIABLES LIKE 'collation\_%'; on my local computer + # cur.execute("SET character_set_client=utf8mb4;") + # cur.execute("SET character_set_connection=utf8mb4;") + # cur.execute("SET character_set_database=utf8;") + # cur.execute("SET character_set_results=utf8mb4;") + # cur.execute("SET character_set_server=utf8;") + # cur.execute("SET collation_connection=utf8mb4_general_ci;") + # cur.execute("SET collation_database=utf8_general_ci;") + # cur.execute("SET collation_server=utf8_general_ci;") + + # find jobs that are queued for extraction + cur.execute("SELECT `id`, `name`, `data` FROM `wiki_raw` WHERE `status` = 2 ORDER BY `name` ASC LIMIT %s", (job_limit,)) + jobs = [] + for (id, name, data_str) in cur: + jobs.append((id, name, json.loads(data_str))) + print("Processing data from %d jobs" % (len(jobs))) + + # get the counts from the json object and insert into (or update) the database + # Notice that data_collect contains data with different languages + for (id, name, data_collect) in jobs: + print("processing job [%d|%s]..." % (id, name)) + timestamp = round_timestamp(get_timestamp(name)) + for language in data_collect.keys(): + data = data_collect[language] + for article in sorted(data.keys()): + count = data[article] + cur.execute( + "INSERT INTO `wiki` (`datetime`, `article`, `count`, `language`) VALUES (%s, %s, %s, %s) ON DUPLICATE KEY UPDATE `count` = `count` + %s", + (str(timestamp), article.encode("utf-8").decode("latin-1"), count, language, count), + ) + if article == "total": + cur.execute( + "INSERT INTO `wiki_meta` (`datetime`, `date`, `epiweek`, `total`, `language`) VALUES (%s, date(%s), yearweek(%s, 6), %s, %s) ON DUPLICATE KEY UPDATE `total` = `total` + %s", + (str(timestamp), str(timestamp), str(timestamp), count, language, count), + ) + # update the job + cur.execute("UPDATE `wiki_raw` SET `status` = 3 WHERE `id` = %s", (id,)) + + # cleanup + cur.close() + cnx.commit() + cnx.close() + + +if __name__ == "__main__": + run() diff --git a/src/acquisition/wiki/wiki_update.py b/src/acquisition/wiki/wiki_update.py index 411544810..773b9351d 100644 --- a/src/acquisition/wiki/wiki_update.py +++ b/src/acquisition/wiki/wiki_update.py @@ -32,87 +32,87 @@ def floor_timestamp(timestamp): - return datetime(timestamp.year, timestamp.month, timestamp.day, timestamp.hour) + return datetime(timestamp.year, timestamp.month, timestamp.day, timestamp.hour) def ceil_timestamp(timestamp): - return floor_timestamp(timestamp) + timedelta(hours=1) + return floor_timestamp(timestamp) + timedelta(hours=1) def round_timestamp(timestamp): - before = floor_timestamp(timestamp) - after = ceil_timestamp(timestamp) - if (timestamp - before) < (after - timestamp): - return before - else: - return after + before = floor_timestamp(timestamp) + after = ceil_timestamp(timestamp) + if (timestamp - before) < (after - timestamp): + return before + else: + return after def get_timestamp(name): - # If the program is cold start (there are no previous names in the table, and the name will be None) - if name is None: - curr = datetime.now() - return datetime(curr.year, curr.month, curr.day, curr.hour, curr.minute, curr.second) - # new parsing for pageviews compared to pagecounts - maybe switch to regex in the future - #return datetime(int(name[11:15]), int(name[15:17]), 
int(name[17:19]), int(name[20:22]), int(name[22:24]), int(name[24:26])) - return datetime(int(name[10:14]), int(name[14:16]), int(name[16:18]), int(name[19:21]), int(name[21:23]), int(name[23:25])) + # If the program is cold start (there are no previous names in the table, and the name will be None) + if name is None: + curr = datetime.now() + return datetime(curr.year, curr.month, curr.day, curr.hour, curr.minute, curr.second) + # new parsing for pageviews compared to pagecounts - maybe switch to regex in the future + # return datetime(int(name[11:15]), int(name[15:17]), int(name[17:19]), int(name[20:22]), int(name[22:24]), int(name[24:26])) + return datetime(int(name[10:14]), int(name[14:16]), int(name[16:18]), int(name[19:21]), int(name[21:23]), int(name[23:25])) def get_manifest(year, month, optional=False): - # unlike pagecounts-raw, pageviews doesn't provide hashes - #url = 'https://dumps.wikimedia.org/other/pagecounts-raw/%d/%d-%02d/md5sums.txt'%(year, year, month) - url = 'https://dumps.wikimedia.org/other/pageviews/%d/%d-%02d/' % (year, year, month) - print('Checking manifest at %s...'%(url)) - response = requests.get(url) - if response.status_code == 200: - #manifest = [line.strip().split() for line in response.text.split('\n') if 'pagecounts' in line] - manifest = [('00000000000000000000000000000000', line[9:37]) for line in response.text.split('\n') if ' max_name: - new_logs[name] = hash - print(' New job: %s [%s]'%(name, hash)) - print('Found %d new job(s)'%(len(new_logs))) - - # store metadata for new jobs - for name in sorted(new_logs.keys()): - cur.execute('INSERT INTO `wiki_raw` (`name`, `hash`) VALUES (%s, %s)', (name, new_logs[name])) - - # cleanup - cur.close() - cnx.commit() - cnx.close() - - -if __name__ == '__main__': - run() + # connect to the database + u, p = secrets.db.epi + cnx = mysql.connector.connect(user=u, password=p, database="epidata") + cur = cnx.cursor() + + # get the most recent job in wiki_raw + # luckily, "pageviews" is lexicographically greater than "pagecounts-raw" + cur.execute("SELECT max(`name`) FROM `wiki_raw`") + for (max_name,) in cur: + pass + print("Last known file: %s" % (max_name)) + timestamp = get_timestamp(max_name) + + # crawl dumps.wikimedia.org to find more recent access logs + t1, t2 = floor_timestamp(timestamp), ceil_timestamp(timestamp) + manifest = get_manifest(t1.year, t1.month, optional=False) + if t2.month != t1.month: + manifest += get_manifest(t2.year, t2.month, optional=True) + + # find access logs newer than the most recent job + new_logs = {} + for (hash, name) in manifest: + if max_name is None or name > max_name: + new_logs[name] = hash + print(" New job: %s [%s]" % (name, hash)) + print("Found %d new job(s)" % (len(new_logs))) + + # store metadata for new jobs + for name in sorted(new_logs.keys()): + cur.execute("INSERT INTO `wiki_raw` (`name`, `hash`) VALUES (%s, %s)", (name, new_logs[name])) + + # cleanup + cur.close() + cnx.commit() + cnx.close() + + +if __name__ == "__main__": + run() diff --git a/src/acquisition/wiki/wiki_util.py b/src/acquisition/wiki/wiki_util.py index ed3c743bc..55bf3e2ca 100644 --- a/src/acquisition/wiki/wiki_util.py +++ b/src/acquisition/wiki/wiki_util.py @@ -1,159 +1,156 @@ - - - class Articles: # Notice that all languages must be two chars, because that `language` column in table `wiki` is CHAR(2) - available_languages = ['en', 'es', 'pt'] + available_languages = ["en", "es", "pt"] en_articles_flu = [ - 'Influenza_B_virus', - 'Influenza_A_virus', - 'Human_flu', - 'Influenzavirus_C', - 
'Oseltamivir', - 'Influenza', - 'Influenzavirus_A', - 'Influenza_A_virus_subtype_H1N1', - 'Zanamivir', - 'Influenza-like_illness', - 'Common_cold', - 'Sore_throat', - 'Flu_season', - 'Chills', - 'Fever', - 'Influenza_A_virus_subtype_H2N2', - 'Swine_influenza', - 'Shivering', - 'Canine_influenza', - 'Influenza_A_virus_subtype_H3N2', - 'Neuraminidase_inhibitor', - 'Influenza_pandemic', - 'Viral_pneumonia', - 'Influenza_prevention', - 'Influenza_A_virus_subtype_H1N2', - 'Rhinorrhea', - 'Orthomyxoviridae', - 'Nasal_congestion', - 'Gastroenteritis', - 'Rimantadine', - 'Paracetamol', - 'Amantadine', - 'Viral_neuraminidase', - 'Headache', - 'Influenza_vaccine', - 'Vomiting', - 'Cough', - 'Influenza_A_virus_subtype_H5N1', - 'Nausea', - 'Avian_influenza', - 'Influenza_A_virus_subtype_H7N9', - 'Influenza_A_virus_subtype_H10N7', - 'Influenza_A_virus_subtype_H9N2', - 'Hemagglutinin_(influenza)', - 'Influenza_A_virus_subtype_H7N7', - 'Fatigue_(medical)', - 'Myalgia', - 'Influenza_A_virus_subtype_H7N3', - 'Malaise', - 'Equine_influenza', - 'Cat_flu', - 'Influenza_A_virus_subtype_H3N8', - 'Antiviral_drugs', - 'Influenza_A_virus_subtype_H7N2', + "Influenza_B_virus", + "Influenza_A_virus", + "Human_flu", + "Influenzavirus_C", + "Oseltamivir", + "Influenza", + "Influenzavirus_A", + "Influenza_A_virus_subtype_H1N1", + "Zanamivir", + "Influenza-like_illness", + "Common_cold", + "Sore_throat", + "Flu_season", + "Chills", + "Fever", + "Influenza_A_virus_subtype_H2N2", + "Swine_influenza", + "Shivering", + "Canine_influenza", + "Influenza_A_virus_subtype_H3N2", + "Neuraminidase_inhibitor", + "Influenza_pandemic", + "Viral_pneumonia", + "Influenza_prevention", + "Influenza_A_virus_subtype_H1N2", + "Rhinorrhea", + "Orthomyxoviridae", + "Nasal_congestion", + "Gastroenteritis", + "Rimantadine", + "Paracetamol", + "Amantadine", + "Viral_neuraminidase", + "Headache", + "Influenza_vaccine", + "Vomiting", + "Cough", + "Influenza_A_virus_subtype_H5N1", + "Nausea", + "Avian_influenza", + "Influenza_A_virus_subtype_H7N9", + "Influenza_A_virus_subtype_H10N7", + "Influenza_A_virus_subtype_H9N2", + "Hemagglutinin_(influenza)", + "Influenza_A_virus_subtype_H7N7", + "Fatigue_(medical)", + "Myalgia", + "Influenza_A_virus_subtype_H7N3", + "Malaise", + "Equine_influenza", + "Cat_flu", + "Influenza_A_virus_subtype_H3N8", + "Antiviral_drugs", + "Influenza_A_virus_subtype_H7N2", ] en_articles_noro = [ - 'Norovirus', - 'Diarrhea', - 'Dehydration', - 'Gastroenteritis', - 'Vomiting', - 'Abdominal_pain', - 'Nausea', - 'Foodborne_illness', - 'Rotavirus', - 'Fecal–oral_route', - 'Intravenous_therapy', - 'Oral_rehydration_therapy', - 'Shellfish', - 'Caliciviridae', - 'Leaky_scanning', + "Norovirus", + "Diarrhea", + "Dehydration", + "Gastroenteritis", + "Vomiting", + "Abdominal_pain", + "Nausea", + "Foodborne_illness", + "Rotavirus", + "Fecal–oral_route", + "Intravenous_therapy", + "Oral_rehydration_therapy", + "Shellfish", + "Caliciviridae", + "Leaky_scanning", ] en_articles_dengue = [ - 'Dengue_fever', - 'Dengue_virus', - 'Aedes', - 'Aedes_aegypti', - 'Dengue_vaccine', - 'Mosquito', - 'Mosquito-borne_disease', - 'Blood_transfusion', - 'Paracetamol', - 'Fever', - 'Headache', - 'Rhinitis', - 'Flavivirus', - 'Exanthem', - 'Myalgia', - 'Arthralgia', - 'Thrombocytopenia', - 'Hematuria', - 'Nosebleed', - 'Petechia', - 'Nausea', - 'Vomiting', - 'Diarrhea', + "Dengue_fever", + "Dengue_virus", + "Aedes", + "Aedes_aegypti", + "Dengue_vaccine", + "Mosquito", + "Mosquito-borne_disease", + "Blood_transfusion", + "Paracetamol", + "Fever", + "Headache", 
+ "Rhinitis", + "Flavivirus", + "Exanthem", + "Myalgia", + "Arthralgia", + "Thrombocytopenia", + "Hematuria", + "Nosebleed", + "Petechia", + "Nausea", + "Vomiting", + "Diarrhea", ] en_articles = list(set(en_articles_flu + en_articles_noro + en_articles_dengue)) es_articles = [ - 'Dengue', - 'Virus_dengue', - 'Aedes', - 'Aedes_aegypti', - 'Culicidae', - 'Transfusión_de_sangre', - 'Paracetamol', - 'Fiebre', - 'Cefalea', - 'Coriza', - 'Flavivirus', - 'Exantema', - 'Mosquito', - 'Mialgia', - 'Artralgia', - 'Trombocitopenia', - 'Hematuria', - 'Epistaxis', - 'Petequia', - 'Náusea', - 'Vómito', - 'Diarrea', + "Dengue", + "Virus_dengue", + "Aedes", + "Aedes_aegypti", + "Culicidae", + "Transfusión_de_sangre", + "Paracetamol", + "Fiebre", + "Cefalea", + "Coriza", + "Flavivirus", + "Exantema", + "Mosquito", + "Mialgia", + "Artralgia", + "Trombocitopenia", + "Hematuria", + "Epistaxis", + "Petequia", + "Náusea", + "Vómito", + "Diarrea", ] pt_articles = [ - 'Dengue', - 'Vírus_da_dengue', - 'Aedes', - 'Aedes_aegypti', - 'Culicidae', - 'Transfusão_de_sangue', - 'Paracetamol', - 'Febre', - 'Cefaleia', - 'Coriza', - 'Flavivírus', - 'Exantema', - 'Mialgia', - 'Artralgia', - 'Trombocitopenia', - 'Hematúria', - 'Epistaxe', - 'Petéquia', - 'Náusea', - 'Vômito', - 'Diarreia', + "Dengue", + "Vírus_da_dengue", + "Aedes", + "Aedes_aegypti", + "Culicidae", + "Transfusão_de_sangue", + "Paracetamol", + "Febre", + "Cefaleia", + "Coriza", + "Flavivírus", + "Exantema", + "Mialgia", + "Artralgia", + "Trombocitopenia", + "Hematúria", + "Epistaxe", + "Petéquia", + "Náusea", + "Vômito", + "Diarreia", ]