From bad0b4b92181b2e8b0d18713b73dc72c35148b6e Mon Sep 17 00:00:00 2001 From: Rostyslav Zatserkovnyi Date: Fri, 2 Dec 2022 14:57:14 +0200 Subject: [PATCH 1/6] Acquisition --- src/acquisition/afhsb/afhsb_csv.py | 102 +++++++++++------- src/acquisition/afhsb/afhsb_sql.py | 2 +- src/acquisition/afhsb/afhsb_update.py | 1 + src/acquisition/cdcp/cdc_dropbox_receiver.py | 5 - src/acquisition/cdcp/cdc_extract.py | 10 +- src/acquisition/cdcp/cdc_upload.py | 2 - src/acquisition/covid_hosp/common/network.py | 1 - .../covid_hosp/common/test_utils.py | 2 - .../covid_hosp/state_daily/database.py | 2 - .../covid_hosp/state_daily/network.py | 4 - .../covid_hosp/state_daily/update.py | 7 +- src/acquisition/covidcast/csv_importer.py | 5 +- src/acquisition/covidcast/csv_to_database.py | 4 +- src/acquisition/covidcast/database.py | 9 +- src/acquisition/covidcast/logger.py | 2 +- .../covidcast/migrate_epidata_to_v4.py | 11 +- .../covidcast/signal_dash_data_generator.py | 5 +- .../covidcast_nowcast/load_sensors.py | 4 +- src/acquisition/ecdc/ecdc_db_update.py | 15 +-- src/acquisition/ecdc/ecdc_ili.py | 17 +-- src/acquisition/flusurv/flusurv.py | 2 +- src/acquisition/fluview/fluview_update.py | 62 ++++++----- .../fluview/impute_missing_values.py | 1 - src/acquisition/ght/ght_update.py | 1 - src/acquisition/kcdc/kcdc_update.py | 10 +- src/acquisition/nidss/taiwan_nidss.py | 8 +- src/acquisition/norostat/norostat_sql.py | 11 +- src/acquisition/norostat/norostat_update.py | 14 --- src/acquisition/norostat/norostat_utils.py | 3 - src/acquisition/paho/paho_db_update.py | 19 ++-- src/acquisition/paho/paho_download.py | 6 +- src/acquisition/quidel/quidel.py | 16 ++- src/acquisition/twtr/pageparser.py | 17 ++- src/acquisition/twtr/twitter_update.py | 38 +++---- src/acquisition/wiki/wiki_download.py | 2 +- 35 files changed, 186 insertions(+), 234 deletions(-) diff --git a/src/acquisition/afhsb/afhsb_csv.py b/src/acquisition/afhsb/afhsb_csv.py index a4a14ad00..b839c4053 100644 --- a/src/acquisition/afhsb/afhsb_csv.py +++ b/src/acquisition/afhsb/afhsb_csv.py @@ -14,60 +14,73 @@ import csv import os -import sas7bdat import pickle +import sas7bdat import epiweeks as epi DATAPATH = '/home/automation/afhsb_data' -SOURCE_DIR = DATAPATH -TARGET_DIR = DATAPATH +SOURCE_DIR = DATAPATH +TARGET_DIR = DATAPATH INVALID_DMISIDS = set() def get_flu_cat(dx): # flu1 (influenza) - if (len(dx) == 0): return None + if len(dx) == 0: + return None dx = dx.capitalize() - if (dx.isnumeric()): + if dx.isnumeric(): for prefix in ["487", "488"]: - if (dx.startswith(prefix)): return 1 + if dx.startswith(prefix): + return 1 for i in range(0, 7): prefix = str(480 + i) - if (dx.startswith(prefix)): return 2 + if dx.startswith(prefix): + return 2 for i in range(0, 7): prefix = str(460 + i) - if (dx.startswith(prefix)): return 3 + if dx.startswith(prefix): + return 3 for prefix in ["07999", "3829", "7806", "7862"]: - if (dx.startswith(prefix)): return 3 + if dx.startswith(prefix): + return 3 elif (dx[0].isalpha() and dx[1:].isnumeric()): for prefix in ["J09", "J10", "J11"]: - if (dx.startswith(prefix)): return 1 + if dx.startswith(prefix): + return 1 for i in range(12, 19): prefix = "J{}".format(i) - if (dx.startswith(prefix)): return 2 + if dx.startswith(prefix): + return 2 for i in range(0, 7): prefix = "J0{}".format(i) - if (dx.startswith(prefix)): return 3 + if dx.startswith(prefix): + return 3 for i in range(20, 23): prefix = "J{}".format(i) - if (dx.startswith(prefix)): return 3 + if dx.startswith(prefix): + return 3 for prefix in ["J40", "R05", "H669", "R509", "B9789"]: - if (dx.startswith(prefix)): return 3 + if dx.startswith(prefix): + return 3 else: return None def aggregate_data(sourcefile, targetfile): reader = sas7bdat.SAS7BDAT(os.path.join(SOURCE_DIR, sourcefile), skip_header=True) # map column names to column indices - COL2IDX = {column.name.decode('utf-8'): column.col_id for column in reader.columns} - def get_field(row, column): return row[COL2IDX[column]] + col_2_idx = {column.name.decode('utf-8'): column.col_id for column in reader.columns} + + def get_field(row, column): + return row[col_2_idx[column]] def row2flu(row): for i in range(1, 9): dx = get_field(row, "dx{}".format(i)) flu_cat = get_flu_cat(dx) - if (flu_cat != None): return flu_cat + if flu_cat is not None: + return flu_cat return 0 def row2epiweek(row): @@ -77,10 +90,11 @@ def row2epiweek(row): year, week_num = week_tuple[0], week_tuple[1] return year, week_num - results_dict = dict() - for r, row in enumerate(reader): + results_dict = {} + for _, row in enumerate(reader): # if (r >= 1000000): break - if (get_field(row, 'type') != "Outpt"): continue + if get_field(row, 'type') != "Outpt": + continue year, week_num = row2epiweek(row) dmisid = get_field(row, 'DMISID') flu_cat = row2flu(row) @@ -88,17 +102,18 @@ def row2epiweek(row): key_list = [year, week_num, dmisid, flu_cat] curr_dict = results_dict for i, key in enumerate(key_list): - if (i == len(key_list) - 1): - if (not key in curr_dict): curr_dict[key] = 0 + if i == len(key_list) - 1: + if key not in curr_dict: + curr_dict[key] = 0 curr_dict[key] += 1 else: - if (not key in curr_dict): curr_dict[key] = dict() + if key not in curr_dict: + curr_dict[key] = {} curr_dict = curr_dict[key] results_path = os.path.join(TARGET_DIR, targetfile) with open(results_path, 'wb') as f: pickle.dump(results_dict, f, pickle.HIGHEST_PROTOCOL) - return ################# Functions for geographical information #################### @@ -122,7 +137,7 @@ def format_dmisid_csv(filename, target_name): src_csv = open(src_path, "r", encoding='utf-8-sig') reader = csv.DictReader(src_csv) - + dst_csv = open(dst_path, "w") fieldnames = ['dmisid', 'country', 'state', 'zip5'] writer = csv.DictWriter(dst_csv, fieldnames=fieldnames) @@ -132,9 +147,11 @@ def format_dmisid_csv(filename, target_name): for row in reader: country2 = row['Facility ISO Country Code'] - if (country2 == ""): country3 = "" - elif (not country2 in country_mapping): - for key in row.keys(): print(key, row[key]) + if country2 == "": + country3 = "" + elif country2 not in country_mapping: + for key in row.keys(): + print(key, row[key]) continue else: country3 = country_mapping[country2] @@ -149,6 +166,7 @@ def dmisid(): target_name = "simple_DMISID_FY2018.csv" format_dmisid_csv(filename, target_name) + cen2states = {'cen1': {'CT', 'ME', 'MA', 'NH', 'RI', 'VT'}, 'cen2': {'NJ', 'NY', 'PA'}, 'cen3': {'IL', 'IN', 'MI', 'OH', 'WI'}, @@ -175,7 +193,7 @@ def state2region(D): for region in D.keys(): states = D[region] for state in states: - assert(not state in results) + assert state not in results results[state] = region return results @@ -204,7 +222,7 @@ def write_afhsb_csv(period): with open(os.path.join(TARGET_DIR, "{}.csv".format(period)), 'w') as csvfile: writer = csv.DictWriter(csvfile, fieldnames=fieldnames) writer.writeheader() - + i = 0 for year in sorted(results_dict.keys()): year_dict = results_dict[year] @@ -217,11 +235,12 @@ def write_afhsb_csv(period): i += 1 epiweek = int("{}{:02d}".format(year, week)) flu_type = flu_mapping[flu] - + row = {"epiweek": epiweek, "dmisid": None if (not dmisid.isnumeric()) else dmisid, "flu_type": flu_type, "visit_sum": visit_sum, "id": i} writer.writerow(row) - if (i % 100000 == 0): print(row) + if i % 100000 == 0: + print(row) def dmisid_start_time_from_file(filename): starttime_record = dict() @@ -230,7 +249,7 @@ def dmisid_start_time_from_file(filename): for row in reader: dmisid = row['dmisid'] epiweek = int(row['epiweek']) - if (not dmisid in starttime_record): + if dmisid not in starttime_record: starttime_record[dmisid] = epiweek else: starttime_record[dmisid] = min(epiweek, starttime_record[dmisid]) @@ -241,7 +260,7 @@ def dmisid_start_time(): record2 = dmisid_start_time_from_file(os.path.join(TARGET_DIR, "13to17.csv")) record = record1 for dmisid, epiweek in record2.items(): - if (dmisid in record): + if dmisid in record: record[dmisid] = min(record[dmisid], epiweek) else: record[dmisid] = epiweek @@ -261,10 +280,10 @@ def fillin_zero_to_csv(period, dmisid_start_record): dmisid = row['dmisid'] flu_type = row['flu_type'] visit_sum = row['visit_sum'] - if (not epiweek in results_dict): + if epiweek not in results_dict: results_dict[epiweek] = dict() week_dict = results_dict[epiweek] - if (not dmisid in week_dict): + if dmisid not in week_dict: week_dict[dmisid] = dict() dmisid_dict = week_dict[dmisid] dmisid_dict[flu_type] = visit_sum @@ -277,14 +296,15 @@ def fillin_zero_to_csv(period, dmisid_start_record): week_dict = results_dict[epiweek] for dmisid in dmisid_group: start_week = dmisid_start_record[dmisid] - if (start_week > epiweek): continue + if start_week > epiweek: + continue - if (not dmisid in week_dict): + if dmisid not in week_dict: week_dict[dmisid] = dict() dmisid_dict = week_dict[dmisid] for flutype in flutype_group: - if (not flutype in dmisid_dict): + if flutype not in dmisid_dict: dmisid_dict[flutype] = 0 # Write to csv files @@ -301,7 +321,7 @@ def fillin_zero_to_csv(period, dmisid_start_record): row = {"id": i, "epiweek": epiweek, "dmisid": dmisid, "flu_type": flutype, "visit_sum": visit_sum} writer.writerow(row) - if (i % 100000 == 0): + if i % 100000 == 0: print(row) i += 1 print("Wrote {} rows".format(i)) @@ -328,4 +348,4 @@ def main(): if __name__ == '__main__': - main() \ No newline at end of file + main() diff --git a/src/acquisition/afhsb/afhsb_sql.py b/src/acquisition/afhsb/afhsb_sql.py index da81520d6..278f3fc38 100644 --- a/src/acquisition/afhsb/afhsb_sql.py +++ b/src/acquisition/afhsb/afhsb_sql.py @@ -159,7 +159,7 @@ def init_all_tables(datapath): raw_table_name = 'afhsb_{}_raw'.format(period) state_table_name = 'afhsb_{}_state'.format(period) region_table_name = 'afhsb_{}_region'.format(period) - + init_raw_data(raw_table_name, os.path.join(datapath, "filled_{}.csv".format(period))) agg_by_state(raw_table_name, state_table_name) agg_by_region(state_table_name, region_table_name) diff --git a/src/acquisition/afhsb/afhsb_update.py b/src/acquisition/afhsb/afhsb_update.py index 91bf31910..c5a8635c8 100644 --- a/src/acquisition/afhsb/afhsb_update.py +++ b/src/acquisition/afhsb/afhsb_update.py @@ -34,5 +34,6 @@ def main(): afhsb_sql.init_all_tables(tmp_datapath) # (Temporary parent directory should be deleted automatically.) + if __name__ == '__main__': main() diff --git a/src/acquisition/cdcp/cdc_dropbox_receiver.py b/src/acquisition/cdcp/cdc_dropbox_receiver.py index bfcf94979..eb0d97f2a 100644 --- a/src/acquisition/cdcp/cdc_dropbox_receiver.py +++ b/src/acquisition/cdcp/cdc_dropbox_receiver.py @@ -17,7 +17,6 @@ """ # standard library -import argparse import datetime from zipfile import ZIP_DEFLATED, ZipFile @@ -149,10 +148,6 @@ def fetch_data(): def main(): - # args and usage - parser = argparse.ArgumentParser() - args = parser.parse_args() - # fetch new data fetch_data() diff --git a/src/acquisition/cdcp/cdc_extract.py b/src/acquisition/cdcp/cdc_extract.py index 63297c369..83ed08d5b 100644 --- a/src/acquisition/cdcp/cdc_extract.py +++ b/src/acquisition/cdcp/cdc_extract.py @@ -67,13 +67,11 @@ # third party import mysql.connector -import numpy as np # first party -from . import cdc_upload import delphi.operations.secrets as secrets -from delphi.utils.epidate import EpiDate import delphi.utils.epiweek as flu +from . import cdc_upload def get_num_hits(cur, epiweek, state, page): @@ -95,8 +93,7 @@ def get_num_hits(cur, epiweek, state, page): pass if num is None: return 0 - else: - return num + return num def get_total_hits(cur, epiweek, state): @@ -114,8 +111,7 @@ def get_total_hits(cur, epiweek, state): pass if total is None: raise Exception('missing data for %d-%s' % (epiweek, state)) - else: - return total + return total def store_result(cur, epiweek, state, num1, num2, num3, num4, num5, num6, num7, num8, total): diff --git a/src/acquisition/cdcp/cdc_upload.py b/src/acquisition/cdcp/cdc_upload.py index 3178ff895..c9c206dfa 100644 --- a/src/acquisition/cdcp/cdc_upload.py +++ b/src/acquisition/cdcp/cdc_upload.py @@ -77,7 +77,6 @@ import io import os import shutil -import sys from zipfile import ZipFile # third party @@ -85,7 +84,6 @@ # first party import delphi.operations.secrets as secrets -import delphi.utils.epiweek as flu STATES = { diff --git a/src/acquisition/covid_hosp/common/network.py b/src/acquisition/covid_hosp/common/network.py index 1015620d2..ba0cca281 100644 --- a/src/acquisition/covid_hosp/common/network.py +++ b/src/acquisition/covid_hosp/common/network.py @@ -1,6 +1,5 @@ # third party import pandas -import requests class Network: diff --git a/src/acquisition/covid_hosp/common/test_utils.py b/src/acquisition/covid_hosp/common/test_utils.py index 0779d179a..2a737b383 100644 --- a/src/acquisition/covid_hosp/common/test_utils.py +++ b/src/acquisition/covid_hosp/common/test_utils.py @@ -9,9 +9,7 @@ """ # standard library -import json from pathlib import Path -from unittest.mock import Mock # third party import pandas diff --git a/src/acquisition/covid_hosp/state_daily/database.py b/src/acquisition/covid_hosp/state_daily/database.py index 1563161a5..58eaf8190 100644 --- a/src/acquisition/covid_hosp/state_daily/database.py +++ b/src/acquisition/covid_hosp/state_daily/database.py @@ -3,8 +3,6 @@ from delphi.epidata.acquisition.covid_hosp.common.database import Columndef from delphi.epidata.acquisition.covid_hosp.common.utils import Utils -import pandas as pd - class Database(BaseDatabase): diff --git a/src/acquisition/covid_hosp/state_daily/network.py b/src/acquisition/covid_hosp/state_daily/network.py index 1879af9b3..f4678cc9b 100644 --- a/src/acquisition/covid_hosp/state_daily/network.py +++ b/src/acquisition/covid_hosp/state_daily/network.py @@ -1,7 +1,3 @@ -# third party - -import requests - # first party from delphi.epidata.acquisition.covid_hosp.common.network import Network as BaseNetwork diff --git a/src/acquisition/covid_hosp/state_daily/update.py b/src/acquisition/covid_hosp/state_daily/update.py index a2f905601..12a51e6c3 100644 --- a/src/acquisition/covid_hosp/state_daily/update.py +++ b/src/acquisition/covid_hosp/state_daily/update.py @@ -3,12 +3,6 @@ dataset provided by the US Department of Health & Human Services via healthdata.gov. """ -# standard library -import json - -# third party -import pandas as pd - # first party from delphi.epidata.acquisition.covid_hosp.common.utils import Utils from delphi.epidata.acquisition.covid_hosp.state_daily.database import Database @@ -29,5 +23,6 @@ def run(network=Network): return Utils.update_dataset(Database, network) + # main entry point Utils.launch_if_main(Update.run, __name__) diff --git a/src/acquisition/covidcast/csv_importer.py b/src/acquisition/covidcast/csv_importer.py index cce25cd34..7b88ba00f 100644 --- a/src/acquisition/covidcast/csv_importer.py +++ b/src/acquisition/covidcast/csv_importer.py @@ -3,7 +3,6 @@ # standard library from datetime import date import glob -import math import os import re @@ -216,7 +215,7 @@ def validate_quantity(row, attr_quantity): try: quantity = CsvImporter.maybe_apply(float, getattr(row, attr_quantity)) return quantity - except (ValueError, AttributeError) as e: + except (ValueError, AttributeError): # val was a string or another data return "Error" @@ -265,7 +264,7 @@ def extract_and_check_row(row, geo_type, filepath=None): # use consistent capitalization (e.g. for states) try: geo_id = row.geo_id.lower() - except AttributeError as e: + except AttributeError: # geo_id was `None` return (None, 'geo_id') diff --git a/src/acquisition/covidcast/csv_to_database.py b/src/acquisition/covidcast/csv_to_database.py index 34cbad663..bbc722373 100644 --- a/src/acquisition/covidcast/csv_to_database.py +++ b/src/acquisition/covidcast/csv_to_database.py @@ -42,10 +42,10 @@ def make_handlers(data_dir, specific_issue_date, file_archiver_impl=FileArchiver if specific_issue_date: # issue-specific uploads are always one-offs, so we can leave all # files in place without worrying about cleaning up - def handle_failed(path_src, filename, source, logger): + def handle_failed(_, filename, source, logger): logger.info(event='leaving failed file alone', dest=source, file=filename) - def handle_successful(path_src, filename, source, logger): + def handle_successful(path_src, filename, _, logger): logger.info(event='archiving as successful',file=filename) file_archiver_impl.archive_inplace(path_src, filename) else: diff --git a/src/acquisition/covidcast/database.py b/src/acquisition/covidcast/database.py index 58631145a..d21a27c35 100644 --- a/src/acquisition/covidcast/database.py +++ b/src/acquisition/covidcast/database.py @@ -131,16 +131,16 @@ def _reset_load_table_ai_counter(self): This is also destructive to any data in the load table. """ - self._cursor.execute(f'DELETE FROM epimetric_load') + self._cursor.execute('DELETE FROM epimetric_load') # NOTE: 'ones' are used as filler here for the (required) NOT NULL columns. - self._cursor.execute(f""" + self._cursor.execute(""" INSERT INTO epimetric_load (epimetric_id, source, `signal`, geo_type, geo_value, time_type, time_value, issue, `lag`, value_updated_timestamp) VALUES ((SELECT 1+MAX(epimetric_id) FROM epimetric_full), '1', '1', '1', '1', '1', 1, 1, 1, 1);""") - self._cursor.execute(f'DELETE FROM epimetric_load') + self._cursor.execute('DELETE FROM epimetric_load') def do_analyze(self): """performs and stores key distribution analyses, used for join order and index selection""" @@ -201,7 +201,6 @@ def insert_or_update_batch(self, cc_rows, batch_size=2**20, commit_partial=False for batch_num in range(num_batches): start = batch_num * batch_size end = min(num_rows, start + batch_size) - length = end - start args = [( row.source, @@ -377,8 +376,6 @@ def delete_batch(self, cc_deletions): # composite keys: short_comp_key = "`source`, `signal`, `time_type`, `geo_type`, `time_value`, `geo_value`" long_comp_key = short_comp_key + ", `issue`" - short_comp_ref_key = "`signal_key_id`, `geo_key_id`, `time_type`, `time_value`" - long_comp_ref_key = short_comp_ref_key + ", `issue`" create_tmp_table_sql = f''' CREATE TABLE {tmp_table_name} LIKE {self.load_table}; diff --git a/src/acquisition/covidcast/logger.py b/src/acquisition/covidcast/logger.py index 1db86ec57..ad3b3679f 100644 --- a/src/acquisition/covidcast/logger.py +++ b/src/acquisition/covidcast/logger.py @@ -56,7 +56,7 @@ def get_structured_logger(name=__name__, handlers=handlers ) - def add_pid(logger, method_name, event_dict): + def add_pid(_logger, _method_name, event_dict): """ Add current PID to the event dict. """ diff --git a/src/acquisition/covidcast/migrate_epidata_to_v4.py b/src/acquisition/covidcast/migrate_epidata_to_v4.py index a4afafc11..f5522337e 100644 --- a/src/acquisition/covidcast/migrate_epidata_to_v4.py +++ b/src/acquisition/covidcast/migrate_epidata_to_v4.py @@ -1,3 +1,8 @@ +import argparse +import sys +import time +from delphi.epidata.acquisition.covidcast.database import Database + # run as: # python3 -u -m delphi.epidata.acquisition.covidcast.migrate_epidata_to_v4 # ("-u" allows unbuffered print statements so we can watch timing in closer-to-real-time) @@ -55,11 +60,6 @@ ''' - -from delphi.epidata.acquisition.covidcast.database import Database -import time -import argparse - def start_tx(cursor): cursor.execute('SET SESSION TRANSACTION ISOLATION LEVEL READ UNCOMMITTED;') cursor.execute('SET autocommit=0;') # starts a transaction as suggested in https://dev.mysql.com/doc/refman/8.0/en/lock-tables.html @@ -142,7 +142,6 @@ def main(destination_schema, batch_size, start_id, upper_lim_override): print("this WILL truncate any existing v4 tables") print() if input("type 'yes' to continue: ") != 'yes': - import sys sys.exit('operation cancelled!') print(f"starting run at: {time.strftime('%c')}") diff --git a/src/acquisition/covidcast/signal_dash_data_generator.py b/src/acquisition/covidcast/signal_dash_data_generator.py index 81d95728a..2e7467487 100644 --- a/src/acquisition/covidcast/signal_dash_data_generator.py +++ b/src/acquisition/covidcast/signal_dash_data_generator.py @@ -195,8 +195,7 @@ def get_latest_time_value_from_metadata(dashboard_signal, metadata): return df_for_source["max_time"].max().date() -def get_coverage(dashboard_signal: DashboardSignal, - metadata) -> List[DashboardSignalCoverage]: +def get_coverage(dashboard_signal: DashboardSignal) -> List[DashboardSignalCoverage]: """Get the most recent coverage for the signal.""" count_by_geo_type_df = pd.read_csv( COVERAGE_URL.format(source=dashboard_signal.source, @@ -252,7 +251,7 @@ def main(args): metadata) latest_time_value = get_latest_time_value_from_metadata( dashboard_signal, metadata) - latest_coverage = get_coverage(dashboard_signal, metadata) + latest_coverage = get_coverage(dashboard_signal) signal_status_list.append( DashboardSignalStatus( diff --git a/src/acquisition/covidcast_nowcast/load_sensors.py b/src/acquisition/covidcast_nowcast/load_sensors.py index 079b2f27c..f443bbd48 100644 --- a/src/acquisition/covidcast_nowcast/load_sensors.py +++ b/src/acquisition/covidcast_nowcast/load_sensors.py @@ -1,11 +1,11 @@ from shutil import move -from datetime import datetime import os import time -import delphi.operations.secrets as secrets import pandas as pd import sqlalchemy + +import delphi.operations.secrets as secrets from delphi.epidata.acquisition.covidcast.csv_importer import CsvImporter SENSOR_CSV_PATH = "/common/covidcast_nowcast/receiving/" diff --git a/src/acquisition/ecdc/ecdc_db_update.py b/src/acquisition/ecdc/ecdc_db_update.py index 857cd49c7..63689c1d5 100644 --- a/src/acquisition/ecdc/ecdc_db_update.py +++ b/src/acquisition/ecdc/ecdc_db_update.py @@ -31,23 +31,19 @@ """ import argparse -import csv import datetime import glob -import sys import subprocess import random -from io import StringIO import os # third party import mysql.connector -#import pycountry # first party import delphi.operations.secrets as secrets from delphi.epidata.acquisition.ecdc.ecdc_ili import download_ecdc_data -from delphi.utils.epiweek import delta_epiweeks, join_epiweek, check_epiweek +from delphi.utils.epiweek import delta_epiweeks from delphi.utils.epidate import EpiDate def ensure_tables_exist(): @@ -74,13 +70,13 @@ def ensure_tables_exist(): def safe_float(f): try: return float(f.replace(',','')) - except Exception: + except: return 0 def safe_int(i): try: return int(i.replace(',','')) - except Exception: + except: return 0 def get_rows(cnx, table='ecdc_ili'): @@ -106,7 +102,6 @@ def update_from_file(issue, date, dir, test_mode=False): rows = [] for filename in files: with open(filename,'r') as f: - header = map(lambda s: s.strip(),f.readline().split(',')) for l in f: data = list(map(lambda s: s.strip().replace('"',''),l.split(','))) row = {} @@ -137,7 +132,7 @@ def update_from_file(issue, date, dir, test_mode=False): update_args = [date] + data_args try: insert.execute(sql % tuple(insert_args + update_args)) - except Exception: + except: pass # cleanup @@ -203,7 +198,7 @@ def main(): try: update_from_file(issue, date, filename, test_mode=args.test) subprocess.call(["rm",filename]) - except Exception: + except: db_error = True subprocess.call(["rm","-r",tmp_dir]) if not db_error: diff --git a/src/acquisition/ecdc/ecdc_ili.py b/src/acquisition/ecdc/ecdc_ili.py index c0b0cda68..1dd0505d1 100644 --- a/src/acquisition/ecdc/ecdc_ili.py +++ b/src/acquisition/ecdc/ecdc_ili.py @@ -3,20 +3,21 @@ @author: jingjingtang and rumackaaron """ -import requests +import os import re +import requests +import time + from bs4 import BeautifulSoup from selenium import webdriver from selenium.webdriver.support.ui import Select from selenium.webdriver.support.ui import WebDriverWait from selenium.webdriver.common.by import By from selenium.webdriver.support import expected_conditions as EC -import time -import os + def download_ecdc_data(download_dir = "downloads"): url = 'https://flunewseurope.org/PrimaryCareData' - header = {'User-Agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36"} resp = requests.get(url) soup = BeautifulSoup(resp.content, 'lxml') mydivs = soup.findAll('div') @@ -39,7 +40,7 @@ def download_ecdc_data(download_dir = "downloads"): for i in range(2, 54): # select country try: - elt = WebDriverWait(driver,30).until(EC.element_to_be_clickable((By.ID,'fluNewsReportViewer_ctl04_ctl03_ddValue'))) + WebDriverWait(driver,30).until(EC.element_to_be_clickable((By.ID,'fluNewsReportViewer_ctl04_ctl03_ddValue'))) Select(driver.find_element_by_tag_name('select')).select_by_value(str(i)) time.sleep(3) soup = BeautifulSoup(driver.page_source, 'html.parser') @@ -52,11 +53,11 @@ def download_ecdc_data(download_dir = "downloads"): break if type(ind) == str: # select clinical tyle - elt = WebDriverWait(driver,30).until(EC.element_to_be_clickable((By.ID,'fluNewsReportViewer_ctl04_ctl05_ddValue'))) + WebDriverWait(driver,30).until(EC.element_to_be_clickable((By.ID,'fluNewsReportViewer_ctl04_ctl05_ddValue'))) Select(driver.find_element_by_id('fluNewsReportViewer_ctl04_ctl05_ddValue')).select_by_value(ind) - elt = WebDriverWait(driver,30).until(EC.element_to_be_clickable((By.ID,'btnSelectExportType'))) + WebDriverWait(driver,30).until(EC.element_to_be_clickable((By.ID,'btnSelectExportType'))) driver.find_element_by_id('btnSelectExportType').click() - elt = WebDriverWait(driver,30).until(EC.element_to_be_clickable((By.ID,'btnExportToCsv'))) + WebDriverWait(driver,30).until(EC.element_to_be_clickable((By.ID,'btnExportToCsv'))) driver.find_element_by_id('btnExportToCsv').click() time.sleep(3) except: diff --git a/src/acquisition/flusurv/flusurv.py b/src/acquisition/flusurv/flusurv.py index 1e3ea6a11..6b8d247ae 100644 --- a/src/acquisition/flusurv/flusurv.py +++ b/src/acquisition/flusurv/flusurv.py @@ -158,7 +158,7 @@ def extract_from_object(data_in): continue age_index = obj['age'] - 1 # iterage over weeks - for mmwrid, week, overall, rate in obj['data']: + for mmwrid, _, _, rate in obj['data']: epiweek = mmwrid_to_epiweek(mmwrid) if epiweek not in data_out: # weekly rate of each age group diff --git a/src/acquisition/fluview/fluview_update.py b/src/acquisition/fluview/fluview_update.py index dd9942bd3..65bec7a40 100644 --- a/src/acquisition/fluview/fluview_update.py +++ b/src/acquisition/fluview/fluview_update.py @@ -104,12 +104,12 @@ similar to fluview table. data taken right from the WHO_NREVSS dataset. NOTE: for state-wise data, public health labs do not report by epiweek, but - by season (e.g. season 2016/17). calculating the lag is not very - meaningful with this. in addition, the epiweek field will be set to + by season (e.g. season 2016/17). calculating the lag is not very + meaningful with this. in addition, the epiweek field will be set to 201640 for season 2016/17, and so on. Changelog: -- 10/05/18: add/modify functions to also process clinical lab and public +- 10/05/18: add/modify functions to also process clinical lab and public health lab data. """ @@ -118,18 +118,16 @@ import csv import datetime import io -import os -import re import zipfile # third party import mysql.connector # first party -from . import fluview -from . import fluview_locations import delphi.operations.secrets as secrets from delphi.utils.epiweek import delta_epiweeks, join_epiweek +from . import fluview +from . import fluview_locations # sheet names ILINET_SHEET = 'ILINet.csv' @@ -191,15 +189,15 @@ def get_ilinet_data(row): def get_clinical_data(row): if row[0] == 'REGION TYPE' and row != [ - 'REGION TYPE', - 'REGION', - 'YEAR', - 'WEEK', - 'TOTAL SPECIMENS', + 'REGION TYPE', + 'REGION', + 'YEAR', + 'WEEK', + 'TOTAL SPECIMENS', 'TOTAL A', - 'TOTAL B', - 'PERCENT POSITIVE', - 'PERCENT A', + 'TOTAL B', + 'PERCENT POSITIVE', + 'PERCENT A', 'PERCENT B' ]: raise Exception('header row has changed for clinical lab data.') @@ -223,30 +221,30 @@ def get_clinical_data(row): def get_public_data(row): hrow1 = [ - 'REGION TYPE', - 'REGION', - 'SEASON_DESCRIPTION', + 'REGION TYPE', + 'REGION', + 'SEASON_DESCRIPTION', 'TOTAL SPECIMENS', - 'A (2009 H1N1)', - 'A (H3)', - 'A (Subtyping not Performed)', - 'B', + 'A (2009 H1N1)', + 'A (H3)', + 'A (Subtyping not Performed)', + 'B', 'BVic', - 'BYam', + 'BYam', 'H3N2v' ] hrow2 = [ - 'REGION TYPE', - 'REGION', + 'REGION TYPE', + 'REGION', 'YEAR', 'WEEK', 'TOTAL SPECIMENS', - 'A (2009 H1N1)', - 'A (H3)', - 'A (Subtyping not Performed)', - 'B', + 'A (2009 H1N1)', + 'A (H3)', + 'A (Subtyping not Performed)', + 'B', 'BVic', - 'BYam', + 'BYam', 'H3N2v' ] if row[0] == 'REGION TYPE' and row != hrow1 and row != hrow2: @@ -256,7 +254,7 @@ def get_public_data(row): return None if row[3] == 'X': # data is not reported, ignore this row - return None + return None # handle case where data is reported by season, not by epiweek is_weekly = len(row) == len(hrow2) # set epiweek @@ -402,7 +400,7 @@ def update_from_file_public(issue, date, filename, test_mode=False): lag = delta_epiweeks(row['epiweek'], issue) args = [ row['total_specimens'], row['total_a_h1n1'], row['total_a_h3'], - row['total_a_h3n2v'], row['total_a_no_sub'], row['total_b'], + row['total_a_h3n2v'], row['total_a_no_sub'], row['total_b'], row['total_b_vic'], row['total_b_yam'] ] ins_args = [date, issue, row['epiweek'], row['location'], lag] + args diff --git a/src/acquisition/fluview/impute_missing_values.py b/src/acquisition/fluview/impute_missing_values.py index c97c7d7e7..7f9a23231 100644 --- a/src/acquisition/fluview/impute_missing_values.py +++ b/src/acquisition/fluview/impute_missing_values.py @@ -53,7 +53,6 @@ import numpy as np # first party -from delphi.epidata.acquisition.fluview import fluview_locations import delphi.operations.secrets as secrets from delphi.utils.epiweek import delta_epiweeks from delphi.utils.geo.locations import Locations diff --git a/src/acquisition/ght/ght_update.py b/src/acquisition/ght/ght_update.py index 6475a3a54..c1e9b8d94 100644 --- a/src/acquisition/ght/ght_update.py +++ b/src/acquisition/ght/ght_update.py @@ -77,7 +77,6 @@ from .google_health_trends import GHT from .google_health_trends import NO_LOCATION_STR import delphi.operations.secrets as secrets -from delphi.utils.epidate import EpiDate import delphi.utils.epiweek as flu diff --git a/src/acquisition/kcdc/kcdc_update.py b/src/acquisition/kcdc/kcdc_update.py index c765ddc47..70c167738 100644 --- a/src/acquisition/kcdc/kcdc_update.py +++ b/src/acquisition/kcdc/kcdc_update.py @@ -30,16 +30,16 @@ ili: num_ili / 1000 outpatient visits """ -import requests import argparse import datetime +import requests # third party import mysql.connector # first party import delphi.operations.secrets as secrets -from delphi.utils.epiweek import delta_epiweeks, range_epiweeks, join_epiweek, check_epiweek, add_epiweeks +from delphi.utils.epiweek import delta_epiweeks, range_epiweeks, add_epiweeks from delphi.utils.epidate import EpiDate def ensure_tables_exist(): @@ -66,13 +66,13 @@ def ensure_tables_exist(): def safe_float(f): try: return float(f.replace(',','')) - except Exception: + except: return 0 def safe_int(i): try: return int(i.replace(',','')) - except Exception: + except: return 0 def get_rows(cnx, table='kcdc_ili'): @@ -173,7 +173,7 @@ def main(): ews,ilis = get_kcdc_data() update_from_data(ews, ilis, date, issue, test_mode=args.test) - + if __name__ == '__main__': main() diff --git a/src/acquisition/nidss/taiwan_nidss.py b/src/acquisition/nidss/taiwan_nidss.py index 4a6c7b01c..a4f5cb3e3 100644 --- a/src/acquisition/nidss/taiwan_nidss.py +++ b/src/acquisition/nidss/taiwan_nidss.py @@ -28,13 +28,12 @@ import argparse import base64 import re -import urllib.parse # third party import requests # first party -from delphi.utils.epiweek import range_epiweeks, add_epiweeks, delta_epiweeks, check_epiweek +from delphi.utils.epiweek import range_epiweeks, add_epiweeks, check_epiweek class NIDSS: @@ -204,11 +203,6 @@ def get_dengue_data(first_week, last_week): fields = line.split(',') location_b64 = base64.b64encode(fields[3].encode('utf-8')) location = NIDSS._TRANSLATED[location_b64] - region = NIDSS.LOCATION_TO_REGION[location] - imported_b64 = base64.b64encode(fields[6].encode('utf-8')) - imported = imported_b64 == b'5piv' - sex = fields[5] - age = fields[7] count = int(fields[8]) year = int(fields[1]) week = int(fields[2]) diff --git a/src/acquisition/norostat/norostat_sql.py b/src/acquisition/norostat/norostat_sql.py index 79f683912..168e275eb 100644 --- a/src/acquisition/norostat/norostat_sql.py +++ b/src/acquisition/norostat/norostat_sql.py @@ -6,7 +6,6 @@ # first party from .norostat_utils import * -from delphi.utils.epidate import EpiDate import delphi.operations.secrets as secrets # Column names: @@ -73,25 +72,25 @@ def ensure_tables_exist(): `parse_time` DATETIME(6) NOT NULL, PRIMARY KEY (`release_date`, `parse_time`) ); - '''); + ''') cursor.execute(''' CREATE TABLE IF NOT EXISTS `norostat_raw_datatable_measurement_type_pool` ( `measurement_type_id` INT NOT NULL PRIMARY KEY AUTO_INCREMENT, `measurement_type` NVARCHAR(255) NOT NULL UNIQUE KEY ); - '''); + ''') cursor.execute(''' CREATE TABLE IF NOT EXISTS `norostat_raw_datatable_location_pool` ( `location_id` INT NOT NULL PRIMARY KEY AUTO_INCREMENT, `location` NVARCHAR(255) NOT NULL UNIQUE KEY ); - '''); + ''') cursor.execute(''' CREATE TABLE IF NOT EXISTS `norostat_raw_datatable_week_pool` ( `week_id` INT NOT NULL PRIMARY KEY AUTO_INCREMENT, `week` NVARCHAR(255) NOT NULL UNIQUE KEY ); - '''); + ''') cursor.execute(''' CREATE TABLE IF NOT EXISTS `norostat_raw_datatable_diffs` ( `release_date` DATE NOT NULL, @@ -355,7 +354,7 @@ def record_long_raw(long_raw): FOREIGN KEY (`release_date`,`parse_time`) REFERENCES `norostat_raw_datatable_version_list` (`release_date`,`parse_time`), PRIMARY KEY (`release_date`, `parse_time`) ); - '''); + ''') cursor.execute(''' CREATE TABLE IF NOT EXISTS `norostat_point_diffs` ( `release_date` DATE NOT NULL, diff --git a/src/acquisition/norostat/norostat_update.py b/src/acquisition/norostat/norostat_update.py index dfb6786de..4b0021dd5 100644 --- a/src/acquisition/norostat/norostat_update.py +++ b/src/acquisition/norostat/norostat_update.py @@ -8,23 +8,9 @@ process and record it in the appropriate databases. """ -# standard library -import datetime -import re -import os -import time -import collections - -# third party -import pandas as pd -import mysql.connector - # first party from . import norostat_sql from . import norostat_raw -from delphi.utils.epidate import EpiDate -import delphi.operations.secrets as secrets - def main(): diff --git a/src/acquisition/norostat/norostat_utils.py b/src/acquisition/norostat/norostat_utils.py index 2e5b53b8e..a99a4dc96 100644 --- a/src/acquisition/norostat/norostat_utils.py +++ b/src/acquisition/norostat/norostat_utils.py @@ -2,9 +2,6 @@ import re import datetime -# third party -import pandas as pd - # first party from delphi.utils.epidate import EpiDate diff --git a/src/acquisition/paho/paho_db_update.py b/src/acquisition/paho/paho_db_update.py index ec1601716..d07885f79 100644 --- a/src/acquisition/paho/paho_db_update.py +++ b/src/acquisition/paho/paho_db_update.py @@ -50,7 +50,6 @@ import csv import datetime import glob -import sys import subprocess import random from io import StringIO @@ -62,7 +61,7 @@ # first party import delphi.operations.secrets as secrets from delphi.epidata.acquisition.paho.paho_download import get_paho_data -from delphi.utils.epiweek import delta_epiweeks, join_epiweek, check_epiweek +from delphi.utils.epiweek import delta_epiweeks, check_epiweek from delphi.utils.epidate import EpiDate def ensure_tables_exist(): @@ -94,13 +93,13 @@ def ensure_tables_exist(): def safe_float(f): try: return float(f.replace(',','')) - except Exception: + except: return 0 def safe_int(i): try: return int(i.replace(',','')) - except Exception: + except: return 0 def get_rows(cnx, table='paho_dengue'): @@ -120,17 +119,17 @@ def get_paho_row(row): return None try: country = pycountry.countries.get(name=row[4]).alpha_2 - except Exception: + except: try: country = pycountry.countries.get(common_name=row[4]).alpha_2 - except Exception: + except: try: country = pycountry.countries.get(official_name=row[4]).alpha_2 - except Exception: + except: return None try: check_epiweek(safe_int(row[13])*100 + safe_int(row[8]), safe_int(row[13])*100 + safe_int(row[6])) - except Exception: + except: return None return { 'issue': safe_int(row[13])*100 + safe_int(row[6]), @@ -196,7 +195,7 @@ def update_from_file(issue, date, filename, test_mode=False): lag = delta_epiweeks(row['epiweek'], issue) data_args = [row['total_pop'], row['serotype'], row['num_dengue'], row['incidence_rate'], row['num_severe'], row['num_deaths']] - + insert_args = [date,issue,row['epiweek'],row['region'],lag] + data_args update_args = [date] + data_args insert.execute(sql % tuple(insert_args + update_args)) @@ -269,7 +268,7 @@ def main(): try: update_from_file(issue, date, filename, test_mode=args.test) subprocess.call(["rm",filename]) - except Exception: + except: db_error = True subprocess.call(["rm","-r",tmp_dir]) if not db_error: diff --git a/src/acquisition/paho/paho_download.py b/src/acquisition/paho/paho_download.py index bea5f457f..60dd13ae8 100644 --- a/src/acquisition/paho/paho_download.py +++ b/src/acquisition/paho/paho_download.py @@ -17,8 +17,8 @@ def wait_for(browser, css_selector, delay=10): try: - myElem = WebDriverWait(browser, delay).until(EC.presence_of_element_located((By.CSS_SELECTOR, css_selector))) - myElem = WebDriverWait(browser, delay).until(EC.element_to_be_clickable((By.CSS_SELECTOR, css_selector))) + WebDriverWait(browser, delay).until(EC.presence_of_element_located((By.CSS_SELECTOR, css_selector))) + WebDriverWait(browser, delay).until(EC.element_to_be_clickable((By.CSS_SELECTOR, css_selector))) print('Success Loading %s' % (css_selector)) except TimeoutException: print("Loading %s took too much time!" % (css_selector)) @@ -104,7 +104,7 @@ def get_paho_data(offset=0, dir='downloads'): if i.text == "(All)": y = i find_and_click(browser, y) - + for i in range(offset): gp = browser.find_element_by_css_selector("div.wcGlassPane") #print gp.is_enabled() diff --git a/src/acquisition/quidel/quidel.py b/src/acquisition/quidel/quidel.py index 8ddebef22..a7c9a2918 100644 --- a/src/acquisition/quidel/quidel.py +++ b/src/acquisition/quidel/quidel.py @@ -19,14 +19,11 @@ # standard library from collections import defaultdict -import datetime import email -import getpass import imaplib import os from os import listdir from os.path import isfile, join -import math import re # third party @@ -36,7 +33,6 @@ # first party import delphi.operations.secrets as secrets import delphi.utils.epidate as ED -import delphi.utils.epiweek as EW from delphi.utils.geo.locations import Locations def word_map(row,terms): @@ -44,7 +40,7 @@ def word_map(row,terms): row = row.replace(k,v) return row -def date_less_than(d1,d2,delimiter='-'): +def date_less_than(d1,d2): y1,m1,d1 = [int(x) for x in d1.split('-')] y2,m2,d2 = [int(x) for x in d2.split('-')] @@ -56,7 +52,7 @@ def date_less_than(d1,d2,delimiter='-'): return -1 # shift>0: shifted to future -def date_to_epiweek(date, delimiter='-', shift=0): +def date_to_epiweek(date, shift=0): y,m,d = [int(x) for x in date.split('-')] epidate = ED.EpiDate(y,m,d) @@ -118,12 +114,12 @@ def retrieve_excels(self): m.login(secrets.quidel.email_addr,secrets.quidel.email_pwd) m.select("INBOX") # here you a can choose a mail box like INBOX instead # use m.list() to get all the mailboxes - resp, items = m.search(None, "ALL") # you could filter using the IMAP rules here (check http://www.example-code.com/csharp/imap-search-critera.asp) + _, items = m.search(None, "ALL") # you could filter using the IMAP rules here (check http://www.example-code.com/csharp/imap-search-critera.asp) items = items[0].split() # getting the mails id # The emailids are ordered from past to now for emailid in items: - resp, data = m.fetch(emailid, "(RFC822)") # fetching the mail, "`(RFC822)`" means "get the whole stuff", but you can ask for headers only, etc + _, data = m.fetch(emailid, "(RFC822)") # fetching the mail, "`(RFC822)`" means "get the whole stuff", but you can ask for headers only, etc email_body = data[0][1].decode('utf-8') # getting the mail content mail = email.message_from_string(email_body) # parsing the mail content to get a mail object @@ -222,13 +218,13 @@ def get_hhs_region(atom): return atom day_shift = 6 - start_weekday - time_map = lambda x:date_to_epiweek(x,'-',shift=day_shift) + time_map = lambda x:date_to_epiweek(x,shift=day_shift) region_map = lambda x:get_hhs_region(x) \ if use_hhs and x not in Locations.hhs_list else x # a bit hacky end_date = sorted(data_dict.keys())[-1] # count the latest week in only if Thurs data is included - end_epiweek = date_to_epiweek(end_date,'-',shift=-4) + end_epiweek = date_to_epiweek(end_date,shift=-4) # first pass: prepare device_id set device_dict = {} for (date,daily_dict) in data_dict.items(): diff --git a/src/acquisition/twtr/pageparser.py b/src/acquisition/twtr/pageparser.py index 23a05e062..5e9aaaea1 100644 --- a/src/acquisition/twtr/pageparser.py +++ b/src/acquisition/twtr/pageparser.py @@ -1,9 +1,9 @@ -"""A small library for parsing HTML.""" - -# standard library -from html.parser import HTMLParser - - +"""A small library for parsing HTML.""" + +# standard library +from html.parser import HTMLParser + + class PageParser(HTMLParser): ''' This is an HTML parser! All of the hard work is done by the superclass @@ -69,9 +69,8 @@ def handle_endtag(self, tag): '''Inherited - called when an end tag is found''' if tag in PageParser.banlist(): return - x = self.stack.pop() - #if x['type'] != tag: - # print('Unclosed tag! Parent/Child:', x['type'], tag) + self.stack.pop() + def handle_data(self, data): '''Inherited - called when a data string is found''' diff --git a/src/acquisition/twtr/twitter_update.py b/src/acquisition/twtr/twitter_update.py index 518a7a1ab..5c1f3f45b 100644 --- a/src/acquisition/twtr/twitter_update.py +++ b/src/acquisition/twtr/twitter_update.py @@ -43,26 +43,26 @@ === Changelog === ================= -2017-02-16 - * Use secrets -2015-11-27 - * Small documentation update +2017-02-16 + * Use secrets +2015-11-27 + * Small documentation update 2015-05-22 * Original version ''' -# third party -import mysql.connector - -# first party -from .healthtweets import HealthTweets -import delphi.operations.secrets as secrets - - +# third party +import mysql.connector + +# first party +from .healthtweets import HealthTweets +import delphi.operations.secrets as secrets + + def run(): - # connect to the database - u, p = secrets.db.epi - cnx = mysql.connector.connect(user=u, password=p, database='epidata') + # connect to the database + u, p = secrets.db.epi + cnx = mysql.connector.connect(user=u, password=p, database='epidata') cur = cnx.cursor() def get_num_rows(): @@ -80,8 +80,8 @@ def get_num_rows(): # keep track of how many rows were added rows_before = get_num_rows() - # check healthtweets.org for new and/or revised data - ht = HealthTweets(*secrets.healthtweets.login) + # check healthtweets.org for new and/or revised data + ht = HealthTweets(*secrets.healthtweets.login) sql = 'INSERT INTO `twitter` (`date`, `state`, `num`, `total`) VALUES (%s, %s, %s, %s) ON DUPLICATE KEY UPDATE `num` = %s, `total` = %s' total_rows = 0 for state in sorted(HealthTweets.STATE_CODES.keys()): @@ -99,7 +99,7 @@ def get_num_rows(): cur.close() cnx.commit() cnx.close() - - + + if __name__ == '__main__': run() diff --git a/src/acquisition/wiki/wiki_download.py b/src/acquisition/wiki/wiki_download.py index a5140c775..1a01b7f8e 100644 --- a/src/acquisition/wiki/wiki_download.py +++ b/src/acquisition/wiki/wiki_download.py @@ -95,7 +95,7 @@ def extract_article_counts(filename, language, articles, debug_mode): if content[0] == language: total += article_count if content[0] == language and article_title in articles_set: - if(debug_mode): + if debug_mode: print("Find article {0}: {1}".format(article_title, line)) counts[article_title] = article_count if debug_mode: From f54687ad744bc6129dfaf109500873a45fc8ff66 Mon Sep 17 00:00:00 2001 From: Rostyslav Zatserkovnyi Date: Fri, 2 Dec 2022 15:10:29 +0200 Subject: [PATCH 2/6] Server --- src/server/_config.py | 3 +-- src/server/_exceptions.py | 2 +- src/server/_params.py | 10 +++++----- src/server/_printer.py | 2 +- src/server/_query.py | 11 ++++------- src/server/_validate.py | 4 ++-- src/server/endpoints/afhsb.py | 1 - src/server/endpoints/covidcast.py | 2 +- src/server/endpoints/covidcast_meta.py | 2 +- src/server/endpoints/covidcast_nowcast.py | 1 - src/server/endpoints/covidcast_utils/model.py | 2 +- src/server/endpoints/fluview.py | 2 +- src/server/endpoints/sensors.py | 2 +- src/server/utils/dates.py | 6 ++++-- 14 files changed, 23 insertions(+), 27 deletions(-) diff --git a/src/server/_config.py b/src/server/_config.py index 560fdd796..c8386f191 100644 --- a/src/server/_config.py +++ b/src/server/_config.py @@ -1,7 +1,6 @@ +import json import os from dotenv import load_dotenv -from flask import Flask -import json load_dotenv() diff --git a/src/server/_exceptions.py b/src/server/_exceptions.py index 5512b86fc..835bfc118 100644 --- a/src/server/_exceptions.py +++ b/src/server/_exceptions.py @@ -1,6 +1,6 @@ +from typing import Iterable, Optional from flask import make_response, request from flask.json import dumps -from typing import Iterable, Optional from werkzeug.exceptions import HTTPException diff --git a/src/server/_params.py b/src/server/_params.py index 9088a8902..2cef9725b 100644 --- a/src/server/_params.py +++ b/src/server/_params.py @@ -53,7 +53,7 @@ class GeoPair: geo_values: Union[bool, Sequence[str]] def matches(self, geo_type: str, geo_value: str) -> bool: - return self.geo_type == geo_type and (self.geo_values == True or (not isinstance(self.geo_values, bool) and geo_value in self.geo_values)) + return self.geo_type == geo_type and (self.geo_values is True or (not isinstance(self.geo_values, bool) and geo_value in self.geo_values)) def count(self) -> float: """ @@ -82,7 +82,7 @@ class SourceSignalPair: signal: Union[bool, Sequence[str]] def matches(self, source: str, signal: str) -> bool: - return self.source == source and (self.signal == True or (not isinstance(self.signal, bool) and signal in self.signal)) + return self.source == source and (self.signal is True or (not isinstance(self.signal, bool) and signal in self.signal)) def count(self) -> float: """ @@ -223,16 +223,16 @@ def parse_time_arg(key: str = "time") -> Optional[TimePair]: return None if len(time_pairs) == 1: return time_pairs[0] - + # make sure 'day' and 'week' aren't mixed time_types = set(time_pair.time_type for time_pair in time_pairs) if len(time_types) >= 2: raise ValidationFailedException(f'{key}: {time_pairs} mixes "day" and "week" time types') # merge all time pairs into one - merged = [] + merged = [] for time_pair in time_pairs: - if time_pair.time_values == True: + if time_pair.time_values is True: return time_pair else: merged.extend(time_pair.time_values) diff --git a/src/server/_printer.py b/src/server/_printer.py index 04196c71d..bbe3ee10e 100644 --- a/src/server/_printer.py +++ b/src/server/_printer.py @@ -58,7 +58,7 @@ def gen(): r = self._print_row(row) if r is not None: yield r - except Exception as e: + except: get_structured_logger('server_error').error("Exception while executing printer", exception=e) self.result = -1 yield self._error(e) diff --git a/src/server/_query.py b/src/server/_query.py index 47d91a2df..69607255f 100644 --- a/src/server/_query.py +++ b/src/server/_query.py @@ -9,15 +9,13 @@ Sequence, Tuple, Union, - cast, - Mapping, + cast ) from sqlalchemy import text from sqlalchemy.engine import Row -from ._common import db, app -from ._db import metadata +from ._common import db from ._printer import create_printer, APrinter from ._exceptions import DatabaseErrorException from ._validate import extract_strings @@ -283,7 +281,6 @@ def execute_queries( def dummy_gen(): if 3 > 4: yield {} - pass if not query_list or p.remaining_rows <= 0: return p(dummy_gen) @@ -479,9 +476,9 @@ def set_order(self, *args: str, **kwargs: Union[str, bool]) -> "QueryBuilder": """ def to_asc(v: Union[str, bool]) -> str: - if v == True: + if v is True: return "ASC" - elif v == False: + elif v is False: return "DESC" return cast(str, v) diff --git a/src/server/_validate.py b/src/server/_validate.py index e24644721..59e5aa7d0 100644 --- a/src/server/_validate.py +++ b/src/server/_validate.py @@ -1,4 +1,4 @@ -from typing import Any, Dict, Iterable, List, Optional, Sequence, Tuple, Union +from typing import List, Optional, Sequence, Tuple, Union from flask import request @@ -94,7 +94,7 @@ def extract_integer(key: Union[str, Sequence[str]]) -> Optional[int]: return None try: return int(s) - except ValueError as e: + except ValueError: raise ValidationFailedException(f"{key}: not a number: {s}") diff --git a/src/server/endpoints/afhsb.py b/src/server/endpoints/afhsb.py index 2028ac51b..9f05eac9d 100644 --- a/src/server/endpoints/afhsb.py +++ b/src/server/endpoints/afhsb.py @@ -1,4 +1,3 @@ -import re from typing import Dict, List from flask import Blueprint diff --git a/src/server/endpoints/covidcast.py b/src/server/endpoints/covidcast.py index 9211a0d68..0c22e4573 100644 --- a/src/server/endpoints/covidcast.py +++ b/src/server/endpoints/covidcast.py @@ -1,4 +1,4 @@ -from typing import List, Optional, Union, Tuple, Dict, Any +from typing import List, Optional, Tuple, Dict, Any from itertools import groupby from datetime import date, timedelta from epiweeks import Week diff --git a/src/server/endpoints/covidcast_meta.py b/src/server/endpoints/covidcast_meta.py index 87476d271..2181f9722 100644 --- a/src/server/endpoints/covidcast_meta.py +++ b/src/server/endpoints/covidcast_meta.py @@ -48,7 +48,7 @@ def fetch_data( age = row["age"] if age > max_age and row["epidata"]: get_structured_logger('server_api').warning("covidcast_meta cache is stale", cache_age=age) - pass + return epidata = loads(row["epidata"]) diff --git a/src/server/endpoints/covidcast_nowcast.py b/src/server/endpoints/covidcast_nowcast.py index 9a773f572..ae47259f8 100644 --- a/src/server/endpoints/covidcast_nowcast.py +++ b/src/server/endpoints/covidcast_nowcast.py @@ -5,7 +5,6 @@ extract_date, extract_dates, extract_integer, - extract_integers, extract_strings, require_all, require_any, diff --git a/src/server/endpoints/covidcast_utils/model.py b/src/server/endpoints/covidcast_utils/model.py index 28b398580..e554feb11 100644 --- a/src/server/endpoints/covidcast_utils/model.py +++ b/src/server/endpoints/covidcast_utils/model.py @@ -272,7 +272,7 @@ def create_source_signal_alias_mapper(source_signals: List[SourceSignalPair]) -> continue # uses an alias alias_to_data_sources.setdefault(source.db_source, []).append(source) - if pair.signal == True: + if pair.signal: # list all signals of this source (*) so resolve to a plain list of all in this alias transformed_pairs.append(SourceSignalPair(source.db_source, [s.signal for s in source.signals])) else: diff --git a/src/server/endpoints/fluview.py b/src/server/endpoints/fluview.py index e92dddde6..8b92fa052 100644 --- a/src/server/endpoints/fluview.py +++ b/src/server/endpoints/fluview.py @@ -1,4 +1,4 @@ -from typing import Any, Dict, List +from typing import Any, Dict, List, Tuple from flask import Blueprint diff --git a/src/server/endpoints/sensors.py b/src/server/endpoints/sensors.py index a957c6ae7..68199e2b1 100644 --- a/src/server/endpoints/sensors.py +++ b/src/server/endpoints/sensors.py @@ -1,4 +1,4 @@ -from flask import request, Blueprint +from flask import Blueprint from .._config import AUTH, GRANULAR_SENSOR_AUTH_TOKENS, OPEN_SENSORS from .._validate import ( diff --git a/src/server/utils/dates.py b/src/server/utils/dates.py index eb293c744..b85465bb8 100644 --- a/src/server/utils/dates.py +++ b/src/server/utils/dates.py @@ -1,3 +1,4 @@ +from datetime import date, timedelta from typing import ( Callable, Optional, @@ -5,11 +6,12 @@ Tuple, Union ) -from .logger import get_structured_logger -from datetime import date, timedelta + from epiweeks import Week, Year from typing_extensions import TypeAlias +from .logger import get_structured_logger + # Alias for a sequence of date ranges (int, int) or date integers TimeValues: TypeAlias = Sequence[Union[Tuple[int, int], int]] From a8be0d9d027687c9f25e55103b76a80144495378 Mon Sep 17 00:00:00 2001 From: Rostyslav Zatserkovnyi Date: Fri, 2 Dec 2022 15:15:49 +0200 Subject: [PATCH 3/6] fix test --- tests/acquisition/covidcast/test_signal_dash_data_generator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/acquisition/covidcast/test_signal_dash_data_generator.py b/tests/acquisition/covidcast/test_signal_dash_data_generator.py index ff1797b35..5d0c81326 100644 --- a/tests/acquisition/covidcast/test_signal_dash_data_generator.py +++ b/tests/acquisition/covidcast/test_signal_dash_data_generator.py @@ -212,7 +212,7 @@ def test_get_coverage(self, mock_signal): mock_signal.return_value = epidata_df - coverage = get_coverage(signal, metadata) + coverage = get_coverage(signal) expected_coverage = [ DashboardSignalCoverage( From afe29cac53daeaf56eae1194163d9b11d907abe7 Mon Sep 17 00:00:00 2001 From: Rostyslav Zatserkovnyi Date: Fri, 2 Dec 2022 15:21:47 +0200 Subject: [PATCH 4/6] no return --- src/server/endpoints/covidcast_meta.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/server/endpoints/covidcast_meta.py b/src/server/endpoints/covidcast_meta.py index 2181f9722..08e919d24 100644 --- a/src/server/endpoints/covidcast_meta.py +++ b/src/server/endpoints/covidcast_meta.py @@ -48,7 +48,6 @@ def fetch_data( age = row["age"] if age > max_age and row["epidata"]: get_structured_logger('server_api').warning("covidcast_meta cache is stale", cache_age=age) - return epidata = loads(row["epidata"]) From ca19cb27751ea3e0eda757ad09ebb4a637b82a43 Mon Sep 17 00:00:00 2001 From: Rostyslav Zatserkovnyi Date: Fri, 2 Dec 2022 16:41:55 +0200 Subject: [PATCH 5/6] Update src/server/endpoints/covidcast_utils/model.py Co-authored-by: melange396 --- src/server/endpoints/covidcast_utils/model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/server/endpoints/covidcast_utils/model.py b/src/server/endpoints/covidcast_utils/model.py index e554feb11..154bb3668 100644 --- a/src/server/endpoints/covidcast_utils/model.py +++ b/src/server/endpoints/covidcast_utils/model.py @@ -272,7 +272,7 @@ def create_source_signal_alias_mapper(source_signals: List[SourceSignalPair]) -> continue # uses an alias alias_to_data_sources.setdefault(source.db_source, []).append(source) - if pair.signal: + if pair.signal is True: # list all signals of this source (*) so resolve to a plain list of all in this alias transformed_pairs.append(SourceSignalPair(source.db_source, [s.signal for s in source.signals])) else: From 5ae3b7794c748696e90ac62a2133606fed575177 Mon Sep 17 00:00:00 2001 From: Rostyslav Zatserkovnyi Date: Fri, 2 Dec 2022 16:45:42 +0200 Subject: [PATCH 6/6] Fix --- src/acquisition/covidcast/csv_to_database.py | 4 ++-- src/acquisition/nidss/taiwan_nidss.py | 6 ++++++ 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/src/acquisition/covidcast/csv_to_database.py b/src/acquisition/covidcast/csv_to_database.py index bbc722373..34cbad663 100644 --- a/src/acquisition/covidcast/csv_to_database.py +++ b/src/acquisition/covidcast/csv_to_database.py @@ -42,10 +42,10 @@ def make_handlers(data_dir, specific_issue_date, file_archiver_impl=FileArchiver if specific_issue_date: # issue-specific uploads are always one-offs, so we can leave all # files in place without worrying about cleaning up - def handle_failed(_, filename, source, logger): + def handle_failed(path_src, filename, source, logger): logger.info(event='leaving failed file alone', dest=source, file=filename) - def handle_successful(path_src, filename, _, logger): + def handle_successful(path_src, filename, source, logger): logger.info(event='archiving as successful',file=filename) file_archiver_impl.archive_inplace(path_src, filename) else: diff --git a/src/acquisition/nidss/taiwan_nidss.py b/src/acquisition/nidss/taiwan_nidss.py index a4f5cb3e3..27da863e1 100644 --- a/src/acquisition/nidss/taiwan_nidss.py +++ b/src/acquisition/nidss/taiwan_nidss.py @@ -203,6 +203,12 @@ def get_dengue_data(first_week, last_week): fields = line.split(',') location_b64 = base64.b64encode(fields[3].encode('utf-8')) location = NIDSS._TRANSLATED[location_b64] + # Fields currently unused: + # region = NIDSS.LOCATION_TO_REGION[location] + # imported_b64 = base64.b64encode(fields[6].encode('utf-8')) + # imported = imported_b64 == b'5piv' + # sex = fields[5] + # age = fields[7] count = int(fields[8]) year = int(fields[1]) week = int(fields[2])