diff --git a/.editorconfig b/.editorconfig new file mode 100644 index 000000000..b76cfd14a --- /dev/null +++ b/.editorconfig @@ -0,0 +1,22 @@ +# EditorConfig helps developers define and maintain consistent +# coding styles between different editors and IDEs +# editorconfig.org + +root = true + +[*] +# We recommend you to keep these unchanged +end_of_line = lf +charset = utf-8 +trim_trailing_whitespace = true +insert_final_newline = true + + +[*.py] +# Change these settings to your own preference +indent_style = space +indent_size = 4 + + +[*.md] +trim_trailing_whitespace = false diff --git a/.git-blame-ignore-revs b/.git-blame-ignore-revs new file mode 100644 index 000000000..97dc620be --- /dev/null +++ b/.git-blame-ignore-revs @@ -0,0 +1,24 @@ +# style(black): format cdc acquisition +980b0b7e80c7923b79e14fee620645e680785703 +# style(black): format covidcast_nowcast acquisition +9e6ff16f599e8feec34a08dd1bddbc5eae347b55 +# style(black): format ecdc acquisition +d1141d904da4e62992b97c92d5caebd8fadffd42 +# style(black): format flusurv acquisition +08af0f6b7bff85bbc2b193b63b5abf6a16ba03e4 +# style(black): format fluview acquisition +0133ef2042c4df8867e91595eb1f64873edb4632 +# style(black): format ght acquisition +b8900a0bc846888885310911efd6e26459effa99 +# style(black): format kcdc acquisition +a849384c884934b3b7c3c67b68aa6240277d6b6d +# style(black): format nidss acquisition +d04af3c02fda7708a16bec0952b1aa7475acaec7 +# style(black): format paho acquisition +7f60fbba572c1b6e5153a9ef216895bdc2f7f5b3 +# style(black): format quidel acquisition +b9ceb400d9248c8271e8342275664ac5524e335d +# style(black): format twitter acquisition +07ed83e5768f717ab0f9a62a9209e4e2cffa058d +# style(black): format wiki acquisition +923852eafa86b8f8b182d499489249ba8f815843 diff --git a/pyproject.toml b/pyproject.toml index d255c2849..a4399ca9b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,5 +1,26 @@ - [tool.black] -line-length = 200 +line-length = 100 target-version = ['py38'] include = 'server,tests/server' + +[tool.pylint] + [tool.pylint.'MESSAGES CONTROL'] + max-line-length = 100 + disable = [ + 'logging-format-interpolation', + # Allow pytest functions to be part of a class + 'no-self-use', + 'too-many-locals', + 'too-many-arguments', + # Allow pytest classes to have one test + 'too-few-public-methods', + ] + + [tool.pylint.'BASIC'] + # Allow arbitrarily short-named variables. + variable-rgx = ['[a-z_][a-z0-9_]*'] + argument-rgx = [ '[a-z_][a-z0-9_]*' ] + attr-rgx = ['[a-z_][a-z0-9_]*'] + + [tool.pylint.'DESIGN'] + ignored-argument-names = ['(_.*|run_as_module)'] diff --git a/src/acquisition/cdcp/cdc_dropbox_receiver.py b/src/acquisition/cdcp/cdc_dropbox_receiver.py index eb0d97f2a..4fa20368e 100644 --- a/src/acquisition/cdcp/cdc_dropbox_receiver.py +++ b/src/acquisition/cdcp/cdc_dropbox_receiver.py @@ -29,128 +29,128 @@ # location constants -DROPBOX_BASE_DIR = '/cdc_page_stats' -DELPHI_BASE_DIR = '/common/cdc_stage' +DROPBOX_BASE_DIR = "/cdc_page_stats" +DELPHI_BASE_DIR = "/common/cdc_stage" def get_timestamp_string(): - """ - Return the current local date and time as a string. + """ + Return the current local date and time as a string. - The format is "%Y%m%d_%H%M%S". - """ - return datetime.datetime.now().strftime('%Y%m%d_%H%M%S') + The format is "%Y%m%d_%H%M%S". 
+ """ + return datetime.datetime.now().strftime("%Y%m%d_%H%M%S") def trigger_further_processing(): - """Add CDCP processing scripts to the Automation run queue.""" + """Add CDCP processing scripts to the Automation run queue.""" - # connect - u, p = secrets.db.auto - cnx = mysql.connector.connect(user=u, password=p, database='automation') - cur = cnx.cursor() + # connect + u, p = secrets.db.auto + cnx = mysql.connector.connect(user=u, password=p, database="automation") + cur = cnx.cursor() - # add step "Process CDCP Data" to queue - cur.execute('CALL automation.RunStep(46)') + # add step "Process CDCP Data" to queue + cur.execute("CALL automation.RunStep(46)") - # disconnect - cur.close() - cnx.commit() - cnx.close() + # disconnect + cur.close() + cnx.commit() + cnx.close() def fetch_data(): - """ - Check for new files on dropbox, download them, zip them, cleanup dropbox, and - trigger further processing of new data. - """ - - # initialize dropbox api - dbx = dropbox.Dropbox(secrets.cdcp.dropbox_token) - - # look for new CDC data files - print('checking dropbox:%s' % DROPBOX_BASE_DIR) - save_list = [] - for entry in dbx.files_list_folder(DROPBOX_BASE_DIR).entries: - name = entry.name - if name.endswith('.csv') or name.endswith('.zip'): - print(' download "%s"' % name) - save_list.append(name) - else: - print(' skip "%s"' % name) - - # determine if there's anything to be done - if len(save_list) == 0: - print('did not find any new data files') - return - - # download new files, saving them inside of a new zip file - timestamp = get_timestamp_string() - zip_path = '%s/dropbox_%s.zip' % (DELPHI_BASE_DIR, timestamp) - print('downloading into delphi:%s' % zip_path) - with ZipFile(zip_path, 'w', ZIP_DEFLATED) as zf: + """ + Check for new files on dropbox, download them, zip them, cleanup dropbox, and + trigger further processing of new data. 
+ """ + + # initialize dropbox api + dbx = dropbox.Dropbox(secrets.cdcp.dropbox_token) + + # look for new CDC data files + print(f"checking dropbox: {DROPBOX_BASE_DIR}") + save_list = [] + for entry in dbx.files_list_folder(DROPBOX_BASE_DIR).entries: + name = entry.name + if name.endswith(".csv") or name.endswith(".zip"): + print(f" download: {name}") + save_list.append(name) + else: + print(f" skip: {name}") + + # determine if there's anything to be done + if len(save_list) == 0: + print("did not find any new data files") + return + + # download new files, saving them inside of a new zip file + timestamp = get_timestamp_string() + zip_path = f"{DELPHI_BASE_DIR}/dropbox_{timestamp}.zip" + print(f"downloading into delphi:{zip_path}") + with ZipFile(zip_path, "w", ZIP_DEFLATED) as zf: + for name in save_list: + # location of the file on dropbox + dropbox_path = f"{DROPBOX_BASE_DIR}/{name}" + print(f" {dropbox_path}") + + # start the download + meta, resp = dbx.files_download(dropbox_path) + + # check status and length + if resp.status_code != 200: + raise Exception(["resp.status_code", resp.status_code]) + dropbox_len = meta.size + print(f" need {int(dropbox_len)} bytes...") + content_len = int(resp.headers.get("Content-Length", -1)) + if dropbox_len != content_len: + info = ["dropbox_len", dropbox_len, "content_len", content_len] + raise Exception(info) + + # finish the download, holding the data in this variable + filedata = resp.content + + # check the length again + payload_len = len(filedata) + print(" downloaded") + if dropbox_len != payload_len: + info = ["dropbox_len", dropbox_len, "payload_len", payload_len] + raise Exception(info) + + # add the downloaded file to the zip file + zf.writestr(name, filedata) + print(" added") + + # At this point, all the data is stored and awaiting further processing on + # the delphi server. + print(f"saved all new data in {zip_path}") + + # on dropbox, archive downloaded files so they won't be downloaded again + archive_dir = f"archived_reports/processed_{timestamp}" + print("archiving files...") for name in save_list: - # location of the file on dropbox - dropbox_path = '%s/%s' % (DROPBOX_BASE_DIR, name) - print(' %s' % dropbox_path) - - # start the download - meta, resp = dbx.files_download(dropbox_path) - - # check status and length - if resp.status_code != 200: - raise Exception(['resp.status_code', resp.status_code]) - dropbox_len = meta.size - print(' need %d bytes...' % dropbox_len) - content_len = int(resp.headers.get('Content-Length', -1)) - if dropbox_len != content_len: - info = ['dropbox_len', dropbox_len, 'content_len', content_len] - raise Exception(info) - - # finish the download, holding the data in this variable - filedata = resp.content - - # check the length again - payload_len = len(filedata) - print(' downloaded') - if dropbox_len != payload_len: - info = ['dropbox_len', dropbox_len, 'payload_len', payload_len] - raise Exception(info) - - # add the downloaded file to the zip file - zf.writestr(name, filedata) - print(' added') - - # At this point, all the data is stored and awaiting further processing on - # the delphi server. 
- print('saved all new data in %s' % zip_path) - - # on dropbox, archive downloaded files so they won't be downloaded again - archive_dir = 'archived_reports/processed_%s' % timestamp - print('archiving files...') - for name in save_list: - # source and destination - dropbox_src = '%s/%s' % (DROPBOX_BASE_DIR, name) - dropbox_dst = '%s/%s/%s' % (DROPBOX_BASE_DIR, archive_dir, name) - print(' "%s" -> "%s"' % (dropbox_src, dropbox_dst)) - - # move the file - meta = dbx.files_move(dropbox_src, dropbox_dst) - - # sanity check - if archive_dir not in meta.path_lower: - raise Exception('failed to move "%s"' % name) - - # finally, trigger the usual processing flow - print('triggering processing flow') - trigger_further_processing() - print('done') + # source and destination + dropbox_src = f"{DROPBOX_BASE_DIR}/{name}" + dropbox_dst = f"{DROPBOX_BASE_DIR}/{archive_dir}/{name}" + print(f" {dropbox_src} -> {dropbox_dst}") + + # move the file + meta = dbx.files_move(dropbox_src, dropbox_dst) + + # sanity check + if archive_dir not in meta.path_lower: + raise Exception(f"failed to move {name}") + + # finally, trigger the usual processing flow + print("triggering processing flow") + trigger_further_processing() + print("done") def main(): - # fetch new data - fetch_data() + # fetch new data + fetch_data() -if __name__ == '__main__': - main() +if __name__ == "__main__": + main() diff --git a/src/acquisition/cdcp/cdc_extract.py b/src/acquisition/cdcp/cdc_extract.py index 83ed08d5b..0d38e0bcc 100644 --- a/src/acquisition/cdcp/cdc_extract.py +++ b/src/acquisition/cdcp/cdc_extract.py @@ -75,7 +75,7 @@ def get_num_hits(cur, epiweek, state, page): - sql = ''' + sql = """ SELECT sum(c.`num`) `num` FROM @@ -86,36 +86,36 @@ def get_num_hits(cur, epiweek, state, page): m.`date` = c.`date` AND m.`state` = c.`state` WHERE m.`epiweek` = %s AND c.`state` = %s AND c.`page` LIKE %s - ''' - num = None - cur.execute(sql, (epiweek, state, page)) - for (num,) in cur: - pass - if num is None: - return 0 - return num + """ + num = None + cur.execute(sql, (epiweek, state, page)) + for (num,) in cur: + pass + if num is None: + return 0 + return num def get_total_hits(cur, epiweek, state): - sql = ''' + sql = """ SELECT sum(m.`total`) `total` FROM `cdc_meta` m WHERE m.`epiweek` = %s AND m.`state` = %s - ''' - total = None - cur.execute(sql, (epiweek, state)) - for (total,) in cur: - pass - if total is None: - raise Exception('missing data for %d-%s' % (epiweek, state)) - return total + """ + total = None + cur.execute(sql, (epiweek, state)) + for (total,) in cur: + pass + if total is None: + raise Exception(f"missing data for {int(epiweek)}-{state}") + return total def store_result(cur, epiweek, state, num1, num2, num3, num4, num5, num6, num7, num8, total): - sql = ''' + sql = """ INSERT INTO `cdc_extract` (`epiweek`, `state`, `num1`, `num2`, `num3`, `num4`, `num5`, `num6`, `num7`, `num8`, `total`) VALUES @@ -130,94 +130,89 @@ def store_result(cur, epiweek, state, num1, num2, num3, num4, num5, num6, num7, `num7` = %s, `num8` = %s, `total` = %s - ''' - values = [num1, num2, num3, num4, num5, num6, num7, num8, total] - args = tuple([epiweek, state] + values + values) - cur.execute(sql, args) + """ + values = [num1, num2, num3, num4, num5, num6, num7, num8, total] + args = tuple([epiweek, state] + values + values) + cur.execute(sql, args) def extract(first_week=None, last_week=None, test_mode=False): - # page title templates - pages = [ - '%What You Should Know for the % Influenza Season%', - '%What To Do If You Get Sick%', - '%Flu 
Symptoms & Severity%', - '%How Flu Spreads%', - '%What You Should Know About Flu Antiviral Drugs%', - '%Weekly US Map%', - '%Basics%', - '%Flu Activity & Surveillance%', - ] - - # location information - states = sorted(cdc_upload.STATES.values()) - - # connect - u, p = secrets.db.epi - cnx = mysql.connector.connect(user=u, password=p, database='epidata') - cur = cnx.cursor() - - # weeks to update - if first_week is None: - cur.execute('SELECT max(`epiweek`) FROM `cdc_extract`') - for (first_week,) in cur: - pass - if last_week is None: - cur.execute('SELECT max(`epiweek`) FROM `cdc_meta`') - for (last_week,) in cur: - pass - print('extracting %d--%d' % (first_week, last_week)) - - # update each epiweek - for epiweek in flu.range_epiweeks(first_week, last_week, inclusive=True): - # update each state - for state in states: - try: - num1 = get_num_hits(cur, epiweek, state, pages[0]) - num2 = get_num_hits(cur, epiweek, state, pages[1]) - num3 = get_num_hits(cur, epiweek, state, pages[2]) - num4 = get_num_hits(cur, epiweek, state, pages[3]) - num5 = get_num_hits(cur, epiweek, state, pages[4]) - num6 = get_num_hits(cur, epiweek, state, pages[5]) - num7 = get_num_hits(cur, epiweek, state, pages[6]) - num8 = get_num_hits(cur, epiweek, state, pages[7]) - total = get_total_hits(cur, epiweek, state) - store_result(cur, epiweek, state, num1, num2, num3, num4, num5, num6, num7, num8, total) - print(' %d-%s: %d %d %d %d %d %d %d %d (%d)' % (epiweek, state, num1, num2, num3, num4, num5, num6, num7, num8, total)) - except Exception as ex: - print(' %d-%s: failed' % (epiweek, state), ex) - #raise ex - sys.stdout.flush() - - # disconnect - cur.close() - if not test_mode: - cnx.commit() - cnx.close() + # page title templates + pages = [ + "%What You Should Know for the % Influenza Season%", + "%What To Do If You Get Sick%", + "%Flu Symptoms & Severity%", + "%How Flu Spreads%", + "%What You Should Know About Flu Antiviral Drugs%", + "%Weekly US Map%", + "%Basics%", + "%Flu Activity & Surveillance%", + ] + + # location information + states = sorted(cdc_upload.STATES.values()) + + # connect + u, p = secrets.db.epi + cnx = mysql.connector.connect(user=u, password=p, database="epidata") + cur = cnx.cursor() + + # weeks to update + if first_week is None: + cur.execute("SELECT max(`epiweek`) FROM `cdc_extract`") + for (first_week,) in cur: + pass + if last_week is None: + cur.execute("SELECT max(`epiweek`) FROM `cdc_meta`") + for (last_week,) in cur: + pass + print(f"extracting {int(first_week)}--{int(last_week)}") + + # update each epiweek + for epiweek in flu.range_epiweeks(first_week, last_week, inclusive=True): + # update each state + for state in states: + try: + # collect hits for each of the eight page templates; append to avoid indexing an empty list + nums = [] + for i in range(8): + nums.append(get_num_hits(cur, epiweek, state, pages[i])) + total = get_total_hits(cur, epiweek, state) + store_result(cur, epiweek, state, *nums, total) + print(f" {epiweek}-{state}: {' '.join(str(n) for n in nums)} ({total})") + except Exception as ex: + print(f" {int(epiweek)}-{state}: failed", ex) + # raise ex + sys.stdout.flush() + + # disconnect + cur.close() + if not test_mode: + cnx.commit() + cnx.close() def main(): - # args and usage - parser = argparse.ArgumentParser() - parser.add_argument('--first', '-f', default=None, type=int, help='first epiweek override') - parser.add_argument('--last', '-l', default=None, type=int, help='last epiweek override') - parser.add_argument('--epiweek', '-w', default=None, type=int, help='epiweek override') - parser.add_argument('--test', '-t', default=False, action='store_true',
help='dry run only') - args = parser.parse_args() - - # sanity check - first, last, week = args.first, args.last, args.epiweek - for ew in [first, last, week]: - if ew is not None: - flu.check_epiweek(ew) - if first is not None and last is not None and first > last: - raise Exception('epiweeks in the wrong order') - if week is not None: - first = last = week - - # extract the page hits for all states on the specified weeks - extract(first, last, args.test) - - -if __name__ == '__main__': - main() + # args and usage + parser = argparse.ArgumentParser() + parser.add_argument("--first", "-f", default=None, type=int, help="first epiweek override") + parser.add_argument("--last", "-l", default=None, type=int, help="last epiweek override") + parser.add_argument("--epiweek", "-w", default=None, type=int, help="epiweek override") + parser.add_argument("--test", "-t", default=False, action="store_true", help="dry run only") + args = parser.parse_args() + + # sanity check + first, last, week = args.first, args.last, args.epiweek + for ew in [first, last, week]: + if ew is not None: + flu.check_epiweek(ew) + if first is not None and last is not None and first > last: + raise Exception("epiweeks in the wrong order") + if week is not None: + first = last = week + + # extract the page hits for all states on the specified weeks + extract(first, last, args.test) + + +if __name__ == "__main__": + main() diff --git a/src/acquisition/cdcp/cdc_upload.py b/src/acquisition/cdcp/cdc_upload.py index c9c206dfa..0e191267b 100644 --- a/src/acquisition/cdcp/cdc_upload.py +++ b/src/acquisition/cdcp/cdc_upload.py @@ -87,191 +87,192 @@ STATES = { - 'Alabama': 'AL', - 'Alaska': 'AK', - 'Arizona': 'AZ', - 'Arkansas': 'AR', - 'California': 'CA', - 'Colorado': 'CO', - 'Connecticut': 'CT', - 'Delaware': 'DE', - 'District of Columbia': 'DC', - 'Florida': 'FL', - 'Georgia': 'GA', - 'Hawaii': 'HI', - 'Idaho': 'ID', - 'Illinois': 'IL', - 'Indiana': 'IN', - 'Iowa': 'IA', - 'Kansas': 'KS', - 'Kentucky': 'KY', - 'Louisiana': 'LA', - 'Maine': 'ME', - 'Maryland': 'MD', - 'Massachusetts': 'MA', - 'Michigan': 'MI', - 'Minnesota': 'MN', - 'Mississippi': 'MS', - 'Missouri': 'MO', - 'Montana': 'MT', - 'Nebraska': 'NE', - 'Nevada': 'NV', - 'New Hampshire': 'NH', - 'New Jersey': 'NJ', - 'New Mexico': 'NM', - 'New York': 'NY', - 'North Carolina': 'NC', - 'North Dakota': 'ND', - 'Ohio': 'OH', - 'Oklahoma': 'OK', - 'Oregon': 'OR', - 'Pennsylvania': 'PA', - 'Rhode Island': 'RI', - 'South Carolina': 'SC', - 'South Dakota': 'SD', - 'Tennessee': 'TN', - 'Texas': 'TX', - 'Utah': 'UT', - 'Vermont': 'VT', - 'Virginia': 'VA', - 'Washington': 'WA', - 'West Virginia': 'WV', - 'Wisconsin': 'WI', - 'Wyoming': 'WY', - #'Puerto Rico': 'PR', - #'Virgin Islands': 'VI', - #'Guam': 'GU', + "Alabama": "AL", + "Alaska": "AK", + "Arizona": "AZ", + "Arkansas": "AR", + "California": "CA", + "Colorado": "CO", + "Connecticut": "CT", + "Delaware": "DE", + "District of Columbia": "DC", + "Florida": "FL", + "Georgia": "GA", + "Hawaii": "HI", + "Idaho": "ID", + "Illinois": "IL", + "Indiana": "IN", + "Iowa": "IA", + "Kansas": "KS", + "Kentucky": "KY", + "Louisiana": "LA", + "Maine": "ME", + "Maryland": "MD", + "Massachusetts": "MA", + "Michigan": "MI", + "Minnesota": "MN", + "Mississippi": "MS", + "Missouri": "MO", + "Montana": "MT", + "Nebraska": "NE", + "Nevada": "NV", + "New Hampshire": "NH", + "New Jersey": "NJ", + "New Mexico": "NM", + "New York": "NY", + "North Carolina": "NC", + "North Dakota": "ND", + "Ohio": "OH", + "Oklahoma": "OK", + "Oregon": "OR", + 
"Pennsylvania": "PA", + "Rhode Island": "RI", + "South Carolina": "SC", + "South Dakota": "SD", + "Tennessee": "TN", + "Texas": "TX", + "Utah": "UT", + "Vermont": "VT", + "Virginia": "VA", + "Washington": "WA", + "West Virginia": "WV", + "Wisconsin": "WI", + "Wyoming": "WY", + #'Puerto Rico': 'PR', + #'Virgin Islands': 'VI', + #'Guam': 'GU', } -sql_cdc = ''' +sql_cdc = """ INSERT INTO `cdc` (`date`, `page`, `state`, `num`) VALUES (%s, %s, %s, %s) ON DUPLICATE KEY UPDATE `num` = %s -''' +""" -sql_cdc_meta = ''' +sql_cdc_meta = """ INSERT INTO `cdc_meta` (`date`, `epiweek`, `state`, `total`) VALUES (%s, yearweek(%s, 6), %s, %s) ON DUPLICATE KEY UPDATE `total` = %s -''' +""" def upload(test_mode): - # connect - u, p = secrets.db.epi - cnx = mysql.connector.connect(user=u, password=p, database='epidata') - cur = cnx.cursor() - - # insert (or update) table `cdc` - def insert_cdc(date, page, state, num): - cur.execute(sql_cdc, (date, page, state, num, num)) - - # insert (or update) table `cdc_meta` - def insert_cdc_meta(date, state, total): - cur.execute(sql_cdc_meta, (date, date, state, total, total)) - - # loop over rows until the header row is found - def find_header(reader): - for row in reader: - if len(row) > 0 and row[0] == 'Date': - return True - return False - - # parse csv files for `cdc` and `cdc_meta` - def parse_csv(meta): - def handler(reader): - if not find_header(reader): - raise Exception('header not found') - count = 0 - cols = 3 if meta else 4 - for row in reader: - if len(row) != cols: - continue - if meta: - (a, c, d) = row - else: - (a, b, c, d) = row - c = c[:-16] - if c not in STATES: - continue - a = datetime.strptime(a, '%b %d, %Y').strftime('%Y-%m-%d') - c = STATES[c] - d = int(d) - if meta: - insert_cdc_meta(a, c, d) + # connect + u, p = secrets.db.epi + cnx = mysql.connector.connect(user=u, password=p, database="epidata") + cur = cnx.cursor() + + # insert (or update) table `cdc` + def insert_cdc(date, page, state, num): + cur.execute(sql_cdc, (date, page, state, num, num)) + + # insert (or update) table `cdc_meta` + def insert_cdc_meta(date, state, total): + cur.execute(sql_cdc_meta, (date, date, state, total, total)) + + # loop over rows until the header row is found + def find_header(reader): + for row in reader: + if len(row) > 0 and row[0] == "Date": + return True + return False + + # parse csv files for `cdc` and `cdc_meta` + def parse_csv(meta): + def handler(reader): + if not find_header(reader): + raise Exception("header not found") + count = 0 + cols = 3 if meta else 4 + for row in reader: + if len(row) != cols: + continue + if meta: + (a, c, d) = row + else: + (a, b, c, d) = row + c = c[:-16] + if c not in STATES: + continue + a = datetime.strptime(a, "%b %d, %Y").strftime("%Y-%m-%d") + c = STATES[c] + d = int(d) + if meta: + insert_cdc_meta(a, c, d) + else: + insert_cdc(a, b, c, d) + count += 1 + return count + + return handler + + # recursively open zip files + def parse_zip(zf, level=1): + for name in zf.namelist(): + prefix = " " * level + print(prefix, name) + if name[-4:] == ".zip": + with zf.open(name) as temp: + with ZipFile(io.BytesIO(temp.read())) as zf2: + parse_zip(zf2, level + 1) + elif name[-4:] == ".csv": + handler = None + if "Flu Pages by Region" in name: + handler = parse_csv(False) + elif "Regions for all CDC" in name: + handler = parse_csv(True) + else: + print(prefix, " (skipped)") + if handler is not None: + with zf.open(name) as temp: + count = handler(csv.reader(io.StringIO(str(temp.read(), "utf-8")))) + print(prefix, f" {int(count)} 
rows") + else: + print(prefix, " (ignored)") + + # find, parse, and move zip files + zip_files = glob.glob("/common/cdc_stage/*.zip") + print("searching...") + for f in zip_files: + print(" ", f) + print("parsing...") + for f in zip_files: + with ZipFile(f) as zf: + parse_zip(zf) + print("moving...") + for f in zip_files: + src = f + dst = os.path.join("/home/automation/cdc_page_stats/", os.path.basename(src)) + print(" ", src, "->", dst) + if test_mode: + print(" (test mode enabled - not moved)") else: - insert_cdc(a, b, c, d) - count += 1 - return count - return handler - - # recursively open zip files - def parse_zip(zf, level=1): - for name in zf.namelist(): - prefix = ' ' * level - print(prefix, name) - if name[-4:] == '.zip': - with zf.open(name) as temp: - with ZipFile(io.BytesIO(temp.read())) as zf2: - parse_zip(zf2, level + 1) - elif name[-4:] == '.csv': - handler = None - if 'Flu Pages by Region' in name: - handler = parse_csv(False) - elif 'Regions for all CDC' in name: - handler = parse_csv(True) - else: - print(prefix, ' (skipped)') - if handler is not None: - with zf.open(name) as temp: - count = handler(csv.reader(io.StringIO(str(temp.read(), 'utf-8')))) - print(prefix, ' %d rows' % count) - else: - print(prefix, ' (ignored)') - - # find, parse, and move zip files - zip_files = glob.glob('/common/cdc_stage/*.zip') - print('searching...') - for f in zip_files: - print(' ', f) - print('parsing...') - for f in zip_files: - with ZipFile(f) as zf: - parse_zip(zf) - print('moving...') - for f in zip_files: - src = f - dst = os.path.join('/home/automation/cdc_page_stats/', os.path.basename(src)) - print(' ', src, '->', dst) - if test_mode: - print(' (test mode enabled - not moved)') - else: - shutil.move(src, dst) - if not os.path.isfile(dst): - raise Exception('unable to move file') - - # disconnect - cur.close() - if not test_mode: - cnx.commit() - cnx.close() + shutil.move(src, dst) + if not os.path.isfile(dst): + raise Exception("unable to move file") + + # disconnect + cur.close() + if not test_mode: + cnx.commit() + cnx.close() def main(): - # args and usage - parser = argparse.ArgumentParser() - parser.add_argument('--test', '-t', default=False, action='store_true', help='dry run only') - args = parser.parse_args() + # args and usage + parser = argparse.ArgumentParser() + parser.add_argument("--test", "-t", default=False, action="store_true", help="dry run only") + args = parser.parse_args() - # make it happen - upload(args.test) + # make it happen + upload(args.test) -if __name__ == '__main__': - main() +if __name__ == "__main__": + main() diff --git a/src/acquisition/covidcast_nowcast/load_sensors.py b/src/acquisition/covidcast_nowcast/load_sensors.py index 73ce7eee5..2e2269bb8 100644 --- a/src/acquisition/covidcast_nowcast/load_sensors.py +++ b/src/acquisition/covidcast_nowcast/load_sensors.py @@ -82,8 +82,7 @@ def load_and_prepare_file(filepath: str, attributes: PathDetails) -> pd.DataFram def _move_after_processing(filepath, success): archive_dir = SUCCESS_DIR if success else FAIL_DIR - new_dir = os.path.dirname(filepath).replace( - "receiving", archive_dir) + new_dir = os.path.dirname(filepath).replace("receiving", archive_dir) os.makedirs(new_dir, exist_ok=True) move(filepath, filepath.replace("receiving", archive_dir)) print(f"{filepath} moved to {archive_dir}") @@ -96,10 +95,14 @@ def method(table, conn, keys, data_iter): meta, # specify lag column explicitly; lag is a reserved word sqlalchemy doesn't know about sqlalchemy.Column("lag", sqlalchemy.Integer, 
quote=True), - autoload=True) - insert_stmt = sqlalchemy.dialects.mysql.insert(sql_table).values([dict(zip(keys, data)) for data in data_iter]) + autoload=True, + ) + insert_stmt = sqlalchemy.dialects.mysql.insert(sql_table).values( + [dict(zip(keys, data)) for data in data_iter] + ) upsert_stmt = insert_stmt.on_duplicate_key_update({x.name: x for x in insert_stmt.inserted}) conn.execute(upsert_stmt) + return method diff --git a/src/acquisition/ecdc/ecdc_db_update.py b/src/acquisition/ecdc/ecdc_db_update.py index 63689c1d5..84423c376 100644 --- a/src/acquisition/ecdc/ecdc_db_update.py +++ b/src/acquisition/ecdc/ecdc_db_update.py @@ -33,9 +33,8 @@ import argparse import datetime import glob -import subprocess -import random import os +import tempfile # third party import mysql.connector @@ -46,12 +45,14 @@ from delphi.utils.epiweek import delta_epiweeks from delphi.utils.epidate import EpiDate + def ensure_tables_exist(): - (u,p) = secrets.db.epi - cnx = mysql.connector.connect(user=u,password=p,database='epidata') + (u, p) = secrets.db.epi + cnx = mysql.connector.connect(user=u, password=p, database="epidata") try: cursor = cnx.cursor() - cursor.execute(''' + cursor.execute( + """ CREATE TABLE IF NOT EXISTS `ecdc_ili` ( `id` INT(11) NOT NULL PRIMARY KEY AUTO_INCREMENT, `release_date` DATE NOT NULL, @@ -62,58 +63,63 @@ def ensure_tables_exist(): `incidence_rate` DOUBLE NOT NULL, UNIQUE KEY (`issue`, `epiweek`, `region`) ); - '''); + """ + ) cnx.commit() finally: cnx.close() + def safe_float(f): try: - return float(f.replace(',','')) + return float(f.replace(",", "")) except: return 0 + def safe_int(i): try: - return int(i.replace(',','')) + return int(i.replace(",", "")) except: return 0 -def get_rows(cnx, table='ecdc_ili'): - # Count and return the number of rows in the `ecdc_ili` table. - select = cnx.cursor() - select.execute('SELECT count(1) num FROM %s' % table) - for (num,) in select: - pass - select.close() - return num + +def get_rows(cnx, table="ecdc_ili"): + # Count and return the number of rows in the `ecdc_ili` table. + select = cnx.cursor() + select.execute(f"SELECT count(1) num FROM {table}") + for (num,) in select: + pass + select.close() + return num + def update_from_file(issue, date, dir, test_mode=False): # Read ECDC data from CSVs and insert into (or update) the database. 
# database connection u, p = secrets.db.epi - cnx = mysql.connector.connect(user=u, password=p, database='epidata') - rows1 = get_rows(cnx, 'ecdc_ili') - print('rows before: %d' % (rows1)) + cnx = mysql.connector.connect(user=u, password=p, database="epidata") + rows1 = get_rows(cnx, "ecdc_ili") + print(f"rows before: {int(rows1)}") insert = cnx.cursor() # load the data, ignoring empty rows - files = glob.glob(os.path.join(dir,"*.csv")) + files = glob.glob(os.path.join(dir, "*.csv")) rows = [] for filename in files: - with open(filename,'r') as f: + with open(filename) as f: for l in f: - data = list(map(lambda s: s.strip().replace('"',''),l.split(','))) + data = list(map(lambda s: s.strip().replace('"', ""), l.split(","))) row = {} - row['epiweek'] = int(data[1][:4] + data[1][5:]) - row['region'] = data[4] - row['incidence_rate'] = data[3] + row["epiweek"] = int(data[1][:4] + data[1][5:]) + row["region"] = data[4] + row["incidence_rate"] = data[3] rows.append(row) - print(' loaded %d rows' % len(rows)) + print(f" loaded {len(rows)} rows") entries = [obj for obj in rows if obj] - print(' found %d entries' % len(entries)) + print(f" found {len(entries)} entries") - sql = ''' + sql = """ INSERT INTO `ecdc_ili` (`release_date`, `issue`, `epiweek`, `region`, `lag`, `incidence_rate`) @@ -122,13 +128,13 @@ def update_from_file(issue, date, dir, test_mode=False): ON DUPLICATE KEY UPDATE `release_date` = least(`release_date`, '%s'), `incidence_rate` = %s - ''' + """ for row in entries: - lag = delta_epiweeks(row['epiweek'], issue) - data_args = [row['incidence_rate']] + lag = delta_epiweeks(row["epiweek"], issue) + data_args = [row["incidence_rate"]] - insert_args = [date,issue,row['epiweek'],row['region'],lag] + data_args + insert_args = [date, issue, row["epiweek"], row["region"], lag] + data_args update_args = [date] + data_args try: insert.execute(sql % tuple(insert_args + update_args)) @@ -138,39 +144,42 @@ def update_from_file(issue, date, dir, test_mode=False): # cleanup insert.close() if test_mode: - print('test mode, not committing') + print("test mode, not committing") rows2 = rows1 else: cnx.commit() rows2 = get_rows(cnx) - print('rows after: %d (added %d)' % (rows2,rows2-rows1)) + print(f"rows after: {int(rows2)} (added {int(rows2 - rows1)})") cnx.close() + def main(): # args and usage parser = argparse.ArgumentParser() + # fmt: off parser.add_argument( - '--test', - action='store_true', - help='do dry run only, do not update the database' + "--test", + action="store_true", + help="do dry run only, do not update the database" ) parser.add_argument( - '--file', + "--file", type=str, - help='load an existing zip file (otherwise fetch current data)' + help="load an existing zip file (otherwise fetch current data)" ) parser.add_argument( - '--issue', + "--issue", type=int, - help='issue of the file (e.g. 201740); used iff --file is given' + help="issue of the file (e.g. 
201740); used iff --file is given" ) + # fmt: on args = parser.parse_args() if (args.file is None) != (args.issue is None): - raise Exception('--file and --issue must both be present or absent') + raise Exception("--file and --issue must both be present or absent") - date = datetime.datetime.now().strftime('%Y-%m-%d') - print('assuming release date is today, %s' % date) + date = datetime.datetime.now().strftime("%Y-%m-%d") + print(f"assuming release date is today, {date}") ensure_tables_exist() if args.file: @@ -182,29 +191,26 @@ def main(): max_tries = 5 while flag < max_tries: flag = flag + 1 - tmp_dir = ''.join(random.choice('0123456789abcdefghijklmnopqrstuvwxyz') for i in range(8)) - tmp_dir = 'downloads_' + tmp_dir - subprocess.call(["mkdir",tmp_dir]) - # Use temporary directory to avoid data from different time - # downloaded to same folder - download_ecdc_data(download_dir=tmp_dir) - issue = EpiDate.today().get_ew() - files = glob.glob('%s/*.csv' % tmp_dir) - for filename in files: - with open(filename,'r') as f: - _ = f.readline() - db_error = False - for filename in files: - try: - update_from_file(issue, date, filename, test_mode=args.test) - subprocess.call(["rm",filename]) - except: - db_error = True - subprocess.call(["rm","-r",tmp_dir]) - if not db_error: - break # Exit loop with success + with tempfile.TemporaryDirectory() as tmp_dir: + # Use temporary directory to avoid data from different time + # downloaded to same folder + download_ecdc_data(download_dir=tmp_dir) + issue = EpiDate.today().get_ew() + files = glob.glob(f"{tmp_dir}/*.csv") + for filename in files: + with open(filename) as f: + _ = f.readline() + db_error = False + for filename in files: + try: + update_from_file(issue, date, filename, test_mode=args.test) + except: + db_error = True + if not db_error: + break # Exit loop with success if flag >= max_tries: - print('WARNING: Database `ecdc_ili` did not update successfully') + print("WARNING: Database `ecdc_ili` did not update successfully") + -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/src/acquisition/ecdc/ecdc_ili.py b/src/acquisition/ecdc/ecdc_ili.py index 1dd0505d1..dca9b51ae 100644 --- a/src/acquisition/ecdc/ecdc_ili.py +++ b/src/acquisition/ecdc/ecdc_ili.py @@ -11,60 +11,74 @@ from bs4 import BeautifulSoup from selenium import webdriver from selenium.webdriver.support.ui import Select -from selenium.webdriver.support.ui import WebDriverWait +from selenium.webdriver.support.ui import WebDriverWait from selenium.webdriver.common.by import By from selenium.webdriver.support import expected_conditions as EC -def download_ecdc_data(download_dir = "downloads"): - url = 'https://flunewseurope.org/PrimaryCareData' +def download_ecdc_data(download_dir="downloads"): + url = "https://flunewseurope.org/PrimaryCareData" resp = requests.get(url) - soup = BeautifulSoup(resp.content, 'lxml') - mydivs = soup.findAll('div') + soup = BeautifulSoup(resp.content, "lxml") + mydivs = soup.findAll("div") for div in mydivs: dic = div.attrs - if dic.get('class')== ['graph-container'] and dic.get('id')== 'dinfl06': + if dic.get("class") == ["graph-container"] and dic.get("id") == "dinfl06": break # get new url of the ILI chunck - url = div.contents[1].attrs['src'] + url = div.contents[1].attrs["src"] opts = webdriver.firefox.options.Options() opts.set_headless() fp = webdriver.FirefoxProfile() - fp.set_preference("browser.download.folderList",2) - fp.set_preference("browser.download.manager.showWhenStarting",False) - 
fp.set_preference("browser.download.dir",os.path.abspath(download_dir)) - fp.set_preference("browser.helperApps.neverAsk.saveToDisk","text/csv") + fp.set_preference("browser.download.folderList", 2) + fp.set_preference("browser.download.manager.showWhenStarting", False) + fp.set_preference("browser.download.dir", os.path.abspath(download_dir)) + fp.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/csv") try: - driver = webdriver.Firefox(options=opts,firefox_profile=fp) + driver = webdriver.Firefox(options=opts, firefox_profile=fp) driver.get(url) for i in range(2, 54): # select country try: - WebDriverWait(driver,30).until(EC.element_to_be_clickable((By.ID,'fluNewsReportViewer_ctl04_ctl03_ddValue'))) - Select(driver.find_element_by_tag_name('select')).select_by_value(str(i)) + WebDriverWait(driver, 30).until( + EC.element_to_be_clickable((By.ID, "fluNewsReportViewer_ctl04_ctl03_ddValue")) + ) + Select(driver.find_element_by_tag_name("select")).select_by_value(str(i)) time.sleep(3) - soup = BeautifulSoup(driver.page_source, 'html.parser') - options = soup.select('#fluNewsReportViewer_ctl04_ctl05_ddValue')[0].find_all('option') + soup = BeautifulSoup(driver.page_source, "html.parser") + options = soup.select("#fluNewsReportViewer_ctl04_ctl05_ddValue")[0].find_all( + "option" + ) ind = 1 for j in range(len(options)): - if 'ILI' in str(options[j]): - pattern = re.compile(r'\d+') + if "ILI" in str(options[j]): + pattern = re.compile(r"\d+") ind = re.findall(pattern, str(options[j]))[0] break if type(ind) == str: # select clinical tyle - WebDriverWait(driver,30).until(EC.element_to_be_clickable((By.ID,'fluNewsReportViewer_ctl04_ctl05_ddValue'))) - Select(driver.find_element_by_id('fluNewsReportViewer_ctl04_ctl05_ddValue')).select_by_value(ind) - WebDriverWait(driver,30).until(EC.element_to_be_clickable((By.ID,'btnSelectExportType'))) - driver.find_element_by_id('btnSelectExportType').click() - WebDriverWait(driver,30).until(EC.element_to_be_clickable((By.ID,'btnExportToCsv'))) - driver.find_element_by_id('btnExportToCsv').click() + WebDriverWait(driver, 30).until( + EC.element_to_be_clickable( + (By.ID, "fluNewsReportViewer_ctl04_ctl05_ddValue") + ) + ) + Select( + driver.find_element_by_id("fluNewsReportViewer_ctl04_ctl05_ddValue") + ).select_by_value(ind) + WebDriverWait(driver, 30).until( + EC.element_to_be_clickable((By.ID, "btnSelectExportType")) + ) + driver.find_element_by_id("btnSelectExportType").click() + WebDriverWait(driver, 30).until( + EC.element_to_be_clickable((By.ID, "btnExportToCsv")) + ) + driver.find_element_by_id("btnExportToCsv").click() time.sleep(3) except: driver.get(url) except: - print('WARNING: ECDC Scraper may not have downloaded all of the available data.') - #cleanup - os.system('''pkill "firefox" ''') + print("WARNING: ECDC Scraper may not have downloaded all of the available data.") + # cleanup + os.system("""pkill "firefox" """) os.system('''pkill "(firefox-bin)"''') os.system('''pkill "geckodriver*"''') diff --git a/src/acquisition/flusurv/flusurv.py b/src/acquisition/flusurv/flusurv.py index 6b8d247ae..28105d933 100644 --- a/src/acquisition/flusurv/flusurv.py +++ b/src/acquisition/flusurv/flusurv.py @@ -50,167 +50,170 @@ # all currently available FluSurv locations and their associated codes # the number pair represents NetworkID and CatchmentID location_codes = { - 'CA': (2, 1), - 'CO': (2, 2), - 'CT': (2, 3), - 'GA': (2, 4), - 'IA': (3, 5), - 'ID': (3, 6), - 'MD': (2, 7), - 'MI': (3, 8), - 'MN': (2, 9), - 'NM': (2, 11), - 'NY_albany': (2, 13), - 
'NY_rochester': (2, 14), - 'OH': (3, 15), - 'OK': (3, 16), - 'OR': (2, 17), - 'RI': (3, 18), - 'SD': (3, 19), - 'TN': (2, 20), - 'UT': (3, 21), - 'network_all': (1, 22), - 'network_eip': (2, 22), - 'network_ihsp': (3, 22), + "CA": (2, 1), + "CO": (2, 2), + "CT": (2, 3), + "GA": (2, 4), + "IA": (3, 5), + "ID": (3, 6), + "MD": (2, 7), + "MI": (3, 8), + "MN": (2, 9), + "NM": (2, 11), + "NY_albany": (2, 13), + "NY_rochester": (2, 14), + "OH": (3, 15), + "OK": (3, 16), + "OR": (2, 17), + "RI": (3, 18), + "SD": (3, 19), + "TN": (2, 20), + "UT": (3, 21), + "network_all": (1, 22), + "network_eip": (2, 22), + "network_ihsp": (3, 22), } def fetch_json(path, payload, call_count=1, requests_impl=requests): - """Send a request to the server and return the parsed JSON response.""" - - # it's polite to self-identify this "bot" - delphi_url = 'https://delphi.cmu.edu/index.html' - user_agent = 'Mozilla/5.0 (compatible; delphibot/1.0; +%s)' % delphi_url - - # the FluSurv AMF server - flusurv_url = 'https://gis.cdc.gov/GRASP/Flu3/' + path - - # request headers - headers = { - 'Accept-Encoding': 'gzip', - 'User-Agent': user_agent, - } - if payload is not None: - headers['Content-Type'] = 'application/json;charset=UTF-8' - - # send the request and read the response - if payload is None: - method = requests_impl.get - data = None - else: - method = requests_impl.post - data = json.dumps(payload) - resp = method(flusurv_url, headers=headers, data=data) - - # check the HTTP status code - if resp.status_code == 500 and call_count <= 2: - # the server often fails with this status, so wait and retry - delay = 10 * call_count - print('got status %d, will retry in %d sec...' % (resp.status_code, delay)) - time.sleep(delay) - return fetch_json(path, payload, call_count=call_count + 1) - elif resp.status_code != 200: - raise Exception(['status code != 200', resp.status_code]) - - # check response mime type - if 'application/json' not in resp.headers.get('Content-Type', ''): - raise Exception('response is not json') - - # return the decoded json object - return resp.json() + """Send a request to the server and return the parsed JSON response.""" + + # it's polite to self-identify this "bot" + delphi_url = "https://delphi.cmu.edu/index.html" + user_agent = f"Mozilla/5.0 (compatible; delphibot/1.0; +{delphi_url})" + + # the FluSurv AMF server + flusurv_url = "https://gis.cdc.gov/GRASP/Flu3/" + path + + # request headers + headers = { + "Accept-Encoding": "gzip", + "User-Agent": user_agent, + } + if payload is not None: + headers["Content-Type"] = "application/json;charset=UTF-8" + + # send the request and read the response + if payload is None: + method = requests_impl.get + data = None + else: + method = requests_impl.post + data = json.dumps(payload) + resp = method(flusurv_url, headers=headers, data=data) + + # check the HTTP status code + if resp.status_code == 500 and call_count <= 2: + # the server often fails with this status, so wait and retry + delay = 10 * call_count + print(f"got status {int(resp.status_code)}, will retry in {int(delay)} sec...") + time.sleep(delay) + return fetch_json(path, payload, call_count=call_count + 1) + elif resp.status_code != 200: + raise Exception(["status code != 200", resp.status_code]) + + # check response mime type + if "application/json" not in resp.headers.get("Content-Type", ""): + raise Exception("response is not json") + + # return the decoded json object + return resp.json() def fetch_flusurv_object(location_code): - """Return decoded FluSurv JSON object for the given 
location.""" - return fetch_json('PostPhase03GetData', { - 'appversion': 'Public', - 'networkid': location_code[0], - 'cacthmentid': location_code[1], - }) + """Return decoded FluSurv JSON object for the given location.""" + return fetch_json( + "PostPhase03GetData", + { + "appversion": "Public", + "networkid": location_code[0], + "cacthmentid": location_code[1], + }, + ) def mmwrid_to_epiweek(mmwrid): - """Convert a CDC week index into an epiweek.""" + """Convert a CDC week index into an epiweek.""" - # Add the difference in IDs, which are sequential, to a reference epiweek, - # which is 2003w40 in this case. - epiweek_200340 = EpiDate(2003, 9, 28) - mmwrid_200340 = 2179 - return epiweek_200340.add_weeks(mmwrid - mmwrid_200340).get_ew() + # Add the difference in IDs, which are sequential, to a reference epiweek, + # which is 2003w40 in this case. + epiweek_200340 = EpiDate(2003, 9, 28) + mmwrid_200340 = 2179 + return epiweek_200340.add_weeks(mmwrid - mmwrid_200340).get_ew() def extract_from_object(data_in): - """ - Given a FluSurv data object, return hospitaliation rates. - - The returned object is indexed first by epiweek, then by zero-indexed age - group. - """ - - # an object to hold the result - data_out = {} - - # iterate over all seasons and age groups - for obj in data_in['busdata']['dataseries']: - if obj['age'] in (10, 11, 12): - # TODO(https://github.com/cmu-delphi/delphi-epidata/issues/242): - # capture as-of-yet undefined age groups 10, 11, and 12 - continue - age_index = obj['age'] - 1 - # iterage over weeks - for mmwrid, _, _, rate in obj['data']: - epiweek = mmwrid_to_epiweek(mmwrid) - if epiweek not in data_out: - # weekly rate of each age group - data_out[epiweek] = [None] * 9 - prev_rate = data_out[epiweek][age_index] - if prev_rate is None: - # this is the first time to see a rate for this epiweek/age - data_out[epiweek][age_index] = rate - elif prev_rate != rate: - # a different rate was already found for this epiweek/age - format_args = (epiweek, obj['age'], prev_rate, rate) - print('warning: %d %d %f != %f' % format_args) - - # sanity check the result - if len(data_out) == 0: - raise Exception('no data found') - - # print the result and return flu data - print('found data for %d weeks' % len(data_out)) - return data_out + """ + Given a FluSurv data object, return hospitaliation rates. + + The returned object is indexed first by epiweek, then by zero-indexed age + group. 
+ """ + + # an object to hold the result + data_out = {} + + # iterate over all seasons and age groups + for obj in data_in["busdata"]["dataseries"]: + if obj["age"] in (10, 11, 12): + # TODO(https://github.com/cmu-delphi/delphi-epidata/issues/242): + # capture as-of-yet undefined age groups 10, 11, and 12 + continue + age_index = obj["age"] - 1 + # iterage over weeks + for mmwrid, _, _, rate in obj["data"]: + epiweek = mmwrid_to_epiweek(mmwrid) + if epiweek not in data_out: + # weekly rate of each age group + data_out[epiweek] = [None] * 9 + prev_rate = data_out[epiweek][age_index] + if prev_rate is None: + # this is the first time to see a rate for this epiweek/age + data_out[epiweek][age_index] = rate + elif prev_rate != rate: + # a different rate was already found for this epiweek/age + format_args = (epiweek, obj["age"], prev_rate, rate) + print("warning: %d %d %f != %f" % format_args) + + # sanity check the result + if len(data_out) == 0: + raise Exception("no data found") + + # print the result and return flu data + print(f"found data for {len(data_out)} weeks") + return data_out def get_data(location_code): - """ - Fetch and parse flu data for the given location. + """ + Fetch and parse flu data for the given location. - This method performs the following operations: - - fetches FluSurv data from CDC - - extracts and returns hospitaliation rates - """ + This method performs the following operations: + - fetches FluSurv data from CDC + - extracts and returns hospitaliation rates + """ - # fetch - print('[fetching flusurv data...]') - data_in = fetch_flusurv_object(location_code) + # fetch + print("[fetching flusurv data...]") + data_in = fetch_flusurv_object(location_code) - # extract - print('[extracting values...]') - data_out = extract_from_object(data_in) + # extract + print("[extracting values...]") + data_out = extract_from_object(data_in) - # return - print('[scraped successfully]') - return data_out + # return + print("[scraped successfully]") + return data_out def get_current_issue(): - """Scrape the current issue from the FluSurv main page.""" + """Scrape the current issue from the FluSurv main page.""" - # fetch - data = fetch_json('GetPhase03InitApp?appVersion=Public', None) + # fetch + data = fetch_json("GetPhase03InitApp?appVersion=Public", None) - # extract - date = datetime.strptime(data['loaddatetime'], '%b %d, %Y') + # extract + date = datetime.strptime(data["loaddatetime"], "%b %d, %Y") - # convert and return - return EpiDate(date.year, date.month, date.day).get_ew() + # convert and return + return EpiDate(date.year, date.month, date.day).get_ew() diff --git a/src/acquisition/flusurv/flusurv_update.py b/src/acquisition/flusurv/flusurv_update.py index 35fadba05..1aa8e9885 100644 --- a/src/acquisition/flusurv/flusurv_update.py +++ b/src/acquisition/flusurv/flusurv_update.py @@ -82,108 +82,112 @@ def get_rows(cur): - """Return the number of rows in the `flusurv` table.""" + """Return the number of rows in the `flusurv` table.""" - # count all rows - cur.execute('SELECT count(1) `num` FROM `flusurv`') - for (num,) in cur: - return num + # count all rows + cur.execute("SELECT count(1) `num` FROM `flusurv`") + for (num,) in cur: + return num def update(issue, location_name, test_mode=False): - """Fetch and store the currently avialble weekly FluSurv dataset.""" - - # fetch data - location_code = flusurv.location_codes[location_name] - print('fetching data for', location_name, location_code) - data = flusurv.get_data(location_code) - - # metadata - epiweeks = 
sorted(data.keys()) - location = location_name - release_date = str(EpiDate.today()) - - # connect to the database - u, p = secrets.db.epi - cnx = mysql.connector.connect( - host=secrets.db.host, user=u, password=p, database='epidata') - cur = cnx.cursor() - rows1 = get_rows(cur) - print('rows before: %d' % rows1) - - # SQL for insert/update - sql = ''' - INSERT INTO `flusurv` ( - `release_date`, `issue`, `epiweek`, `location`, `lag`, `rate_age_0`, - `rate_age_1`, `rate_age_2`, `rate_age_3`, `rate_age_4`, `rate_overall`, - `rate_age_5`, `rate_age_6`, `rate_age_7` - ) - VALUES ( - %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s - ) - ON DUPLICATE KEY UPDATE - `release_date` = least(`release_date`, %s), - `rate_age_0` = coalesce(%s, `rate_age_0`), - `rate_age_1` = coalesce(%s, `rate_age_1`), - `rate_age_2` = coalesce(%s, `rate_age_2`), - `rate_age_3` = coalesce(%s, `rate_age_3`), - `rate_age_4` = coalesce(%s, `rate_age_4`), - `rate_overall` = coalesce(%s, `rate_overall`), - `rate_age_5` = coalesce(%s, `rate_age_5`), - `rate_age_6` = coalesce(%s, `rate_age_6`), - `rate_age_7` = coalesce(%s, `rate_age_7`) - ''' - - # insert/update each row of data (one per epiweek) - for epiweek in epiweeks: - lag = delta_epiweeks(epiweek, issue) - if lag > 52: - # Ignore values older than one year, as (1) they are assumed not to - # change, and (2) it would adversely affect database performance if all - # values (including duplicates) were stored on each run. - continue - args_meta = [release_date, issue, epiweek, location, lag] - args_insert = data[epiweek] - args_update = [release_date] + data[epiweek] - cur.execute(sql, tuple(args_meta + args_insert + args_update)) - - # commit and disconnect - rows2 = get_rows(cur) - print('rows after: %d (+%d)' % (rows2, rows2 - rows1)) - cur.close() - if test_mode: - print('test mode: not committing database changes') - else: - cnx.commit() - cnx.close() + """Fetch and store the currently avialble weekly FluSurv dataset.""" + + # fetch data + location_code = flusurv.location_codes[location_name] + print("fetching data for", location_name, location_code) + data = flusurv.get_data(location_code) + + # metadata + epiweeks = sorted(data.keys()) + location = location_name + release_date = str(EpiDate.today()) + + # connect to the database + u, p = secrets.db.epi + cnx = mysql.connector.connect(host=secrets.db.host, user=u, password=p, database="epidata") + cur = cnx.cursor() + rows1 = get_rows(cur) + print(f"rows before: {int(rows1)}") + + # SQL for insert/update + sql = """ + INSERT INTO `flusurv` ( + `release_date`, `issue`, `epiweek`, `location`, `lag`, `rate_age_0`, + `rate_age_1`, `rate_age_2`, `rate_age_3`, `rate_age_4`, `rate_overall`, + `rate_age_5`, `rate_age_6`, `rate_age_7` + ) + VALUES ( + %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s + ) + ON DUPLICATE KEY UPDATE + `release_date` = least(`release_date`, %s), + `rate_age_0` = coalesce(%s, `rate_age_0`), + `rate_age_1` = coalesce(%s, `rate_age_1`), + `rate_age_2` = coalesce(%s, `rate_age_2`), + `rate_age_3` = coalesce(%s, `rate_age_3`), + `rate_age_4` = coalesce(%s, `rate_age_4`), + `rate_overall` = coalesce(%s, `rate_overall`), + `rate_age_5` = coalesce(%s, `rate_age_5`), + `rate_age_6` = coalesce(%s, `rate_age_6`), + `rate_age_7` = coalesce(%s, `rate_age_7`) + """ + + # insert/update each row of data (one per epiweek) + for epiweek in epiweeks: + lag = delta_epiweeks(epiweek, issue) + if lag > 52: + # Ignore values older than one year, as (1) they are assumed not to + # change, and (2) it would 
adversely affect database performance if all + # values (including duplicates) were stored on each run. + continue + args_meta = [release_date, issue, epiweek, location, lag] + args_insert = data[epiweek] + args_update = [release_date] + data[epiweek] + cur.execute(sql, tuple(args_meta + args_insert + args_update)) + + # commit and disconnect + rows2 = get_rows(cur) + print(f"rows after: {int(rows2)} (+{int(rows2 - rows1)})") + cur.close() + if test_mode: + print("test mode: not committing database changes") + else: + cnx.commit() + cnx.close() def main(): - # args and usage - parser = argparse.ArgumentParser() - parser.add_argument( - 'location', - help='location for which data should be scraped (e.g. "CA" or "all")' - ) - parser.add_argument( - '--test', '-t', - default=False, action='store_true', help='do not commit database changes' - ) - args = parser.parse_args() - - # scrape current issue from the main page - issue = flusurv.get_current_issue() - print('current issue: %d' % issue) - - # fetch flusurv data - if args.location == 'all': - # all locations - for location in flusurv.location_codes.keys(): - update(issue, location, args.test) - else: - # single location - update(issue, args.location, args.test) - - -if __name__ == '__main__': - main() + # args and usage + parser = argparse.ArgumentParser() + # fmt: off + parser.add_argument( + "location", + help='location for which data should be scraped (e.g. "CA" or "all")' + ) + parser.add_argument( + "--test", + "-t", + default=False, + action="store_true", + help="do not commit database changes" + ) + # fmt: on + args = parser.parse_args() + + # scrape current issue from the main page + issue = flusurv.get_current_issue() + print(f"current issue: {int(issue)}") + + # fetch flusurv data + if args.location == "all": + # all locations + for location in flusurv.location_codes.keys(): + update(issue, location, args.test) + else: + # single location + update(issue, args.location, args.test) + + +if __name__ == "__main__": + main() diff --git a/src/acquisition/fluview/fluview.py b/src/acquisition/fluview/fluview.py index d723cbc59..9b4e6f537 100644 --- a/src/acquisition/fluview/fluview.py +++ b/src/acquisition/fluview/fluview.py @@ -34,183 +34,188 @@ class Key: - """ - Constants for navigating the metadata object contained in the web response - from CDC. - """ + """ + Constants for navigating the metadata object contained in the web response + from CDC. 
+ """ - class TierType: - nat = 'National' - hhs = 'HHS Regions' - cen = 'Census Divisions' - sta = 'State' + class TierType: + nat = "National" + hhs = "HHS Regions" + cen = "Census Divisions" + sta = "State" - class TierListEntry: - hhs = 'hhsregion' - cen = 'censusregions' - sta = 'states' + class TierListEntry: + hhs = "hhsregion" + cen = "censusregions" + sta = "states" - class TierIdEntry: - hhs = 'hhsregionid' - cen = 'censusregionid' - sta = 'stateid' + class TierIdEntry: + hhs = "hhsregionid" + cen = "censusregionid" + sta = "stateid" def check_status(resp, status, content_type): - """Raise an exception if the status code or content type is unexpected.""" - if resp.status_code != status: - raise Exception('got unexpected status code: ' + str(resp.status_code)) - actual_type = resp.headers.get('Content-Type', None) - if actual_type is None or content_type not in actual_type.lower(): - raise Exception('got unexpected content type: ' + str(actual_type)) + """Raise an exception if the status code or content type is unexpected.""" + if resp.status_code != status: + raise Exception("got unexpected status code: " + str(resp.status_code)) + actual_type = resp.headers.get("Content-Type", None) + if actual_type is None or content_type not in actual_type.lower(): + raise Exception("got unexpected content type: " + str(actual_type)) def fetch_metadata(sess): - """ - Return metadata indicating the current issue and also numeric constants - representing the various locations. - """ - url = 'https://gis.cdc.gov/grasp/flu2/GetPhase02InitApp?appVersion=Public' - resp = sess.get(url) - check_status(resp, 200, 'application/json') - return resp.json() + """ + Return metadata indicating the current issue and also numeric constants + representing the various locations. 
+ """ + url = "https://gis.cdc.gov/grasp/flu2/GetPhase02InitApp?appVersion=Public" + resp = sess.get(url) + check_status(resp, 200, "application/json") + return resp.json() def get_issue_and_locations(data): - """Extract the issue and per-tier location lists from the metadata object.""" - - def get_tier_ids(name): - for row in data['regiontypes']: - if row['description'] == name: - return row['regiontypeid'] - raise Exception() - - tier_ids = dict((name, get_tier_ids(name)) for name in ( - Key.TierType.nat, - Key.TierType.hhs, - Key.TierType.cen, - Key.TierType.sta, - )) - - location_ids = { - Key.TierType.nat: [0], - Key.TierType.hhs: [], - Key.TierType.cen: [], - Key.TierType.sta: [], - } - - # add location ids for HHS - for row in data[Key.TierListEntry.hhs]: - location_ids[Key.TierType.hhs].append(row[Key.TierIdEntry.hhs]) - location_ids[Key.TierType.hhs] = sorted(set(location_ids[Key.TierType.hhs])) - num = len(location_ids[Key.TierType.hhs]) - if num != 10: - raise Exception('expected 10 hhs regions, found %d' % num) - - # add location ids for census divisions - for row in data[Key.TierListEntry.cen]: - location_ids[Key.TierType.cen].append(row[Key.TierIdEntry.cen]) - location_ids[Key.TierType.cen] = sorted(set(location_ids[Key.TierType.cen])) - num = len(location_ids[Key.TierType.cen]) - if num != 9: - raise Exception('expected 9 census divisions, found %d' % num) - - # add location ids for states - for row in data[Key.TierListEntry.sta]: - location_ids[Key.TierType.sta].append(row[Key.TierIdEntry.sta]) - location_ids[Key.TierType.sta] = sorted(set(location_ids[Key.TierType.sta])) - num = len(location_ids[Key.TierType.sta]) - if num != 57: - raise Exception('expected 57 states/territories/cities, found %d' % num) - - # return a useful subset of the metadata - # (latest epiweek, latest season, tier ids, location ids) - return { - 'epiweek': data['mmwr'][-1]['yearweek'], - 'season_id': data['mmwr'][-1]['seasonid'], - 'tier_ids': tier_ids, - 'location_ids': location_ids, - } + """Extract the issue and per-tier location lists from the metadata object.""" + + def get_tier_ids(name): + for row in data["regiontypes"]: + if row["description"] == name: + return row["regiontypeid"] + raise Exception() + + tier_ids = { + name: get_tier_ids(name) + for name in ( + Key.TierType.nat, + Key.TierType.hhs, + Key.TierType.cen, + Key.TierType.sta, + ) + } + + location_ids = { + Key.TierType.nat: [0], + Key.TierType.hhs: [], + Key.TierType.cen: [], + Key.TierType.sta: [], + } + + # add location ids for HHS + for row in data[Key.TierListEntry.hhs]: + location_ids[Key.TierType.hhs].append(row[Key.TierIdEntry.hhs]) + location_ids[Key.TierType.hhs] = sorted(set(location_ids[Key.TierType.hhs])) + num = len(location_ids[Key.TierType.hhs]) + if num != 10: + raise Exception(f"expected 10 hhs regions, found {int(num)}") + + # add location ids for census divisions + for row in data[Key.TierListEntry.cen]: + location_ids[Key.TierType.cen].append(row[Key.TierIdEntry.cen]) + location_ids[Key.TierType.cen] = sorted(set(location_ids[Key.TierType.cen])) + num = len(location_ids[Key.TierType.cen]) + if num != 9: + raise Exception(f"expected 9 census divisions, found {int(num)}") + + # add location ids for states + for row in data[Key.TierListEntry.sta]: + location_ids[Key.TierType.sta].append(row[Key.TierIdEntry.sta]) + location_ids[Key.TierType.sta] = sorted(set(location_ids[Key.TierType.sta])) + num = len(location_ids[Key.TierType.sta]) + if num != 57: + raise Exception(f"expected 57 states/territories/cities, found 
{int(num)}") + + # return a useful subset of the metadata + # (latest epiweek, latest season, tier ids, location ids) + return { + "epiweek": data["mmwr"][-1]["yearweek"], + "season_id": data["mmwr"][-1]["seasonid"], + "tier_ids": tier_ids, + "location_ids": location_ids, + } def download_data(tier_id, location_ids, season_ids, filename): - """Download zipped ILINet data for the given locations and seasons.""" - - def get_entry(num, name=None): - return {'ID': num, 'Name': (name if name else num)} - - # download the data (in memory) - url = 'https://gis.cdc.gov/grasp/flu2/PostPhase02DataDownload' - data = { - 'AppVersion': 'Public', - 'DatasourceDT': [get_entry(1, 'ILINet'), get_entry(0, 'WHO_NREVSS')], - 'RegionTypeId': tier_id, - 'SubRegionsDT': [get_entry(loc) for loc in sorted(location_ids)], - 'SeasonsDT': [get_entry(season) for season in sorted(season_ids)], - } - resp = requests.post(url, json=data) - check_status(resp, 200, 'application/octet-stream') - payload = resp.content - - # save the data to file and return the file length - with open(filename, 'wb') as f: - f.write(payload) - return len(payload) + """Download zipped ILINet data for the given locations and seasons.""" + + def get_entry(num, name=None): + return {"ID": num, "Name": (name if name else num)} + + # download the data (in memory) + url = "https://gis.cdc.gov/grasp/flu2/PostPhase02DataDownload" + data = { + "AppVersion": "Public", + "DatasourceDT": [get_entry(1, "ILINet"), get_entry(0, "WHO_NREVSS")], + "RegionTypeId": tier_id, + "SubRegionsDT": [get_entry(loc) for loc in sorted(location_ids)], + "SeasonsDT": [get_entry(season) for season in sorted(season_ids)], + } + resp = requests.post(url, json=data) + check_status(resp, 200, "application/octet-stream") + payload = resp.content + + # save the data to file and return the file length + with open(filename, "wb") as f: + f.write(payload) + return len(payload) def save_latest(path=None): - """ - Save the latest two seasons of data for all locations, separately for each - location tier (i.e. national, HHS, census, and states). 
- """ - - # set up the session - sess = requests.session() - sess.headers.update({ - # it's polite to self-identify this "bot" - 'User-Agent': 'delphibot/1.0 (+https://delphi.cmu.edu/)', - }) - - # get metatdata - print('looking up ilinet metadata') - data = fetch_metadata(sess) - info = get_issue_and_locations(data) - issue = info['epiweek'] - print('current issue: %d' % issue) - - # establish timing - dt = datetime.datetime.now().strftime('%Y%m%d_%H%M%S') - current_season = info['season_id'] - seasons = [s for s in range(current_season - 1, current_season + 1)] - - # make the destination path if it doesn't already exist - if path is not None: - os.makedirs(path, exist_ok=True) - - # download the data file for each tier - files = [] - for delphi_name, cdc_name in ( - ('nat', Key.TierType.nat), - ('hhs', Key.TierType.hhs), - ('cen', Key.TierType.cen), - ('sta', Key.TierType.sta), - ): - name = 'ilinet_%s_%d_%s.zip' % (delphi_name, issue, dt) - if path is None: - filename = name - else: - filename = os.path.join(path, name) - tier_id = info['tier_ids'][cdc_name] - locations = info['location_ids'][cdc_name] - - # download and show timing information - print('downloading %s' % delphi_name) - t0 = time.time() - size = download_data(tier_id, locations, seasons, filename) - t1 = time.time() - - print(' saved %s (%d bytes in %.1f seconds)' % (filename, size, t1 - t0)) - files.append(filename) - - # return the current issue and the list of downloaded files - return issue, files + """ + Save the latest two seasons of data for all locations, separately for each + location tier (i.e. national, HHS, census, and states). + """ + + # set up the session + sess = requests.session() + sess.headers.update( + { + # it's polite to self-identify this "bot" + "User-Agent": "delphibot/1.0 (+https://delphi.cmu.edu/)", + } + ) + + # get metatdata + print("looking up ilinet metadata") + data = fetch_metadata(sess) + info = get_issue_and_locations(data) + issue = info["epiweek"] + print(f"current issue: {int(issue)}") + + # establish timing + dt = datetime.datetime.now().strftime("%Y%m%d_%H%M%S") + current_season = info["season_id"] + seasons = [s for s in range(current_season - 1, current_season + 1)] + + # make the destination path if it doesn't already exist + if path is not None: + os.makedirs(path, exist_ok=True) + + # download the data file for each tier + files = [] + for delphi_name, cdc_name in ( + ("nat", Key.TierType.nat), + ("hhs", Key.TierType.hhs), + ("cen", Key.TierType.cen), + ("sta", Key.TierType.sta), + ): + name = f"ilinet_{delphi_name}_{int(issue)}_{dt}.zip" + if path is None: + filename = name + else: + filename = os.path.join(path, name) + tier_id = info["tier_ids"][cdc_name] + locations = info["location_ids"][cdc_name] + + # download and show timing information + print(f"downloading {delphi_name}") + t0 = time.time() + size = download_data(tier_id, locations, seasons, filename) + t1 = time.time() + + print(f" saved {filename} ({int(size)} bytes in {t1 - t0:.1f} seconds)") + files.append(filename) + + # return the current issue and the list of downloaded files + return issue, files diff --git a/src/acquisition/fluview/fluview_locations.py b/src/acquisition/fluview/fluview_locations.py index 9c851bc6f..e5ebe0fc3 100644 --- a/src/acquisition/fluview/fluview_locations.py +++ b/src/acquisition/fluview/fluview_locations.py @@ -15,100 +15,100 @@ # https://gis.cdc.gov/grasp/flu2/GetPhase02InitApp?appVersion=Public # The values are used in queries of Delphi's Epidata API. 
cdc_to_delphi = { - 'national': { - 'x': 'nat', - }, - 'hhs regions': { - 'region 1': 'hhs1', - 'region 2': 'hhs2', - 'region 3': 'hhs3', - 'region 4': 'hhs4', - 'region 5': 'hhs5', - 'region 6': 'hhs6', - 'region 7': 'hhs7', - 'region 8': 'hhs8', - 'region 9': 'hhs9', - 'region 10': 'hhs10', - }, - 'census regions': { - 'new england': 'cen1', - 'mid-atlantic': 'cen2', - 'east north central': 'cen3', - 'west north central': 'cen4', - 'south atlantic': 'cen5', - 'east south central': 'cen6', - 'west south central': 'cen7', - 'mountain': 'cen8', - 'pacific': 'cen9', - }, - 'states': { - # states/territories: two-letter ISO 3166 - 'alabama': 'al', - 'alaska': 'ak', - 'arizona': 'az', - 'arkansas': 'ar', - 'california': 'ca', - 'colorado': 'co', - 'connecticut': 'ct', - 'delaware': 'de', - 'florida': 'fl', - 'georgia': 'ga', - 'hawaii': 'hi', - 'idaho': 'id', - 'illinois': 'il', - 'indiana': 'in', - 'iowa': 'ia', - 'kansas': 'ks', - 'kentucky': 'ky', - 'louisiana': 'la', - 'maine': 'me', - 'maryland': 'md', - 'massachusetts': 'ma', - 'michigan': 'mi', - 'minnesota': 'mn', - 'mississippi': 'ms', - 'missouri': 'mo', - 'montana': 'mt', - 'nebraska': 'ne', - 'nevada': 'nv', - 'new hampshire': 'nh', - 'new jersey': 'nj', - 'new mexico': 'nm', - # Even though it's called "New York", this location doesn't include New - # York City ("jfk"). New York ("ny") is actually this *plus* jfk. - 'new york': 'ny_minus_jfk', - 'north carolina': 'nc', - 'north dakota': 'nd', - 'ohio': 'oh', - 'oklahoma': 'ok', - 'oregon': 'or', - 'pennsylvania': 'pa', - 'rhode island': 'ri', - 'south carolina': 'sc', - 'south dakota': 'sd', - 'tennessee': 'tn', - 'texas': 'tx', - 'utah': 'ut', - 'vermont': 'vt', - 'virginia': 'va', - 'washington': 'wa', - 'west virginia': 'wv', - 'wisconsin': 'wi', - 'wyoming': 'wy', - 'american samoa': 'as', - 'commonwealth of the northern mariana islands': 'mp', - 'district of columbia': 'dc', - 'guam': 'gu', - 'puerto rico': 'pr', - 'virgin islands': 'vi', - # cities: three-letter IATA - 'chicago': 'ord', - 'los angeles': 'lax', - 'new york city': 'jfk', - }, + "national": { + "x": "nat", + }, + "hhs regions": { + "region 1": "hhs1", + "region 2": "hhs2", + "region 3": "hhs3", + "region 4": "hhs4", + "region 5": "hhs5", + "region 6": "hhs6", + "region 7": "hhs7", + "region 8": "hhs8", + "region 9": "hhs9", + "region 10": "hhs10", + }, + "census regions": { + "new england": "cen1", + "mid-atlantic": "cen2", + "east north central": "cen3", + "west north central": "cen4", + "south atlantic": "cen5", + "east south central": "cen6", + "west south central": "cen7", + "mountain": "cen8", + "pacific": "cen9", + }, + "states": { + # states/territories: two-letter ISO 3166 + "alabama": "al", + "alaska": "ak", + "arizona": "az", + "arkansas": "ar", + "california": "ca", + "colorado": "co", + "connecticut": "ct", + "delaware": "de", + "florida": "fl", + "georgia": "ga", + "hawaii": "hi", + "idaho": "id", + "illinois": "il", + "indiana": "in", + "iowa": "ia", + "kansas": "ks", + "kentucky": "ky", + "louisiana": "la", + "maine": "me", + "maryland": "md", + "massachusetts": "ma", + "michigan": "mi", + "minnesota": "mn", + "mississippi": "ms", + "missouri": "mo", + "montana": "mt", + "nebraska": "ne", + "nevada": "nv", + "new hampshire": "nh", + "new jersey": "nj", + "new mexico": "nm", + # Even though it's called "New York", this location doesn't include New + # York City ("jfk"). New York ("ny") is actually this *plus* jfk. 
+ "new york": "ny_minus_jfk", + "north carolina": "nc", + "north dakota": "nd", + "ohio": "oh", + "oklahoma": "ok", + "oregon": "or", + "pennsylvania": "pa", + "rhode island": "ri", + "south carolina": "sc", + "south dakota": "sd", + "tennessee": "tn", + "texas": "tx", + "utah": "ut", + "vermont": "vt", + "virginia": "va", + "washington": "wa", + "west virginia": "wv", + "wisconsin": "wi", + "wyoming": "wy", + "american samoa": "as", + "commonwealth of the northern mariana islands": "mp", + "district of columbia": "dc", + "guam": "gu", + "puerto rico": "pr", + "virgin islands": "vi", + # cities: three-letter IATA + "chicago": "ord", + "los angeles": "lax", + "new york city": "jfk", + }, } def get_location_name(region_type, region_name): - """Convert a CDC location type and name pair into a Delphi location name.""" - return cdc_to_delphi[region_type.lower()][region_name.lower()] + """Convert a CDC location type and name pair into a Delphi location name.""" + return cdc_to_delphi[region_type.lower()][region_name.lower()] diff --git a/src/acquisition/fluview/fluview_notify.py b/src/acquisition/fluview/fluview_notify.py index 13f0f3559..a280889a5 100644 --- a/src/acquisition/fluview/fluview_notify.py +++ b/src/acquisition/fluview/fluview_notify.py @@ -31,41 +31,53 @@ import delphi.operations.secrets as secrets -if __name__ == '__main__': - # Args and usage - parser = argparse.ArgumentParser() - parser.add_argument('-t', '--test', action='store_const', const=True, default=False, help="do dry run only, don't update the database") - args = parser.parse_args() +if __name__ == "__main__": + # Args and usage + parser = argparse.ArgumentParser() + parser.add_argument( + "-t", + "--test", + action="store_const", + const=True, + default=False, + help="do dry run only, don't update the database", + ) + args = parser.parse_args() - # connect - u, p = secrets.db.epi - cnx = mysql.connector.connect(user=u, password=p, database='epidata') - cur = cnx.cursor() + # connect + u, p = secrets.db.epi + cnx = mysql.connector.connect(user=u, password=p, database="epidata") + cur = cnx.cursor() - # get the last known issue from the automation table `variables` - cur.execute('SELECT `value` FROM automation.`variables` WHERE `name` = %s', ('most_recent_issue',)) - for (issue1,) in cur: - issue1 = int(issue1) - print('last known issue:', issue1) - # get the most recent issue from the epidata table `fluview` - cur.execute('SELECT max(`issue`) FROM `fluview`') - for (issue2,) in cur: - issue2 = int(issue2) - print('most recent issue:', issue2) + # get the last known issue from the automation table `variables` + cur.execute( + "SELECT `value` FROM automation.`variables` WHERE `name` = %s", ("most_recent_issue",) + ) + for (issue1,) in cur: + issue1 = int(issue1) + print("last known issue:", issue1) + # get the most recent issue from the epidata table `fluview` + cur.execute("SELECT max(`issue`) FROM `fluview`") + for (issue2,) in cur: + issue2 = int(issue2) + print("most recent issue:", issue2) - if issue2 > issue1: - print('new data is available!') - if args.test: - print('test mode - not making any changes') - else: - # update the variable - cur.execute('UPDATE automation.`variables` SET `value` = %s WHERE `name` = %s', (issue2, 'most_recent_issue')) - # queue the 'New FluView Available' flow - cur.execute('CALL automation.RunStep(36)') - elif issue2 < issue2: - raise Exception('most recent issue is older than the last known issue') + if issue2 > issue1: + print("new data is available!") + if args.test: + print("test 
mode - not making any changes") + else: + # update the variable + cur.execute( + "UPDATE automation.`variables` SET `value` = %s WHERE `name` = %s", + (issue2, "most_recent_issue"), + ) + # queue the 'New FluView Available' flow + cur.execute("CALL automation.RunStep(36)") + elif issue2 < issue2: + raise Exception("most recent issue is older than the last known issue") - # cleanup - cnx.commit() - cur.close() - cnx.close() + # cleanup + cnx.commit() + cur.close() + cnx.close() diff --git a/src/acquisition/fluview/fluview_update.py b/src/acquisition/fluview/fluview_update.py index 65bec7a40..406725b8a 100644 --- a/src/acquisition/fluview/fluview_update.py +++ b/src/acquisition/fluview/fluview_update.py @@ -130,398 +130,430 @@ from . import fluview_locations # sheet names -ILINET_SHEET = 'ILINet.csv' -PHL_SHEET = 'WHO_NREVSS_Public_Health_Labs.csv' -CL_SHEET = 'WHO_NREVSS_Clinical_Labs.csv' +ILINET_SHEET = "ILINet.csv" +PHL_SHEET = "WHO_NREVSS_Public_Health_Labs.csv" +CL_SHEET = "WHO_NREVSS_Clinical_Labs.csv" # table names -CL_TABLE = 'fluview_clinical' -PHL_TABLE = 'fluview_public' +CL_TABLE = "fluview_clinical" +PHL_TABLE = "fluview_public" + def optional_int(i): - return int(i) if i not in ('', 'X') else None + return int(i) if i not in ("", "X") else None + def optional_float(i, j): - return float(i) if i not in ('', 'X') else float(j) + return float(i) if i not in ("", "X") else float(j) + def nullable_float(i): - return float(i) if i not in ('', 'X') else None + return float(i) if i not in ("", "X") else None + def get_ilinet_data(row): - if row[0] == 'REGION TYPE' and row != [ - 'REGION TYPE', - 'REGION', - 'YEAR', - 'WEEK', - '% WEIGHTED ILI', - '%UNWEIGHTED ILI', - 'AGE 0-4', - 'AGE 25-49', - 'AGE 25-64', - 'AGE 5-24', - 'AGE 50-64', - 'AGE 65', - 'ILITOTAL', - 'NUM. OF PROVIDERS', - 'TOTAL PATIENTS' - ]: - raise Exception('header row has changed') - if len(row) == 1 or row[0] == 'REGION TYPE': - # this is a header row - return None - if row[5] == 'X': - # ILI isn't reported, ignore this row - return None - return { - 'location': fluview_locations.get_location_name(*row[:2]), - 'epiweek': join_epiweek(int(row[2]), int(row[3])), - 'wili': optional_float(*row[4:6]), - 'ili': float(row[5]), - 'age0': optional_int(row[6]), - 'age1': optional_int(row[9]), - 'age2': optional_int(row[8]), - 'age3': optional_int(row[7]), - 'age4': optional_int(row[10]), - 'age5': optional_int(row[11]), - 'n_ili': optional_int(row[12]), - 'n_providers': optional_int(row[13]), - 'n_patients': optional_int(row[14]), - } + if row[0] == "REGION TYPE" and row != [ + "REGION TYPE", + "REGION", + "YEAR", + "WEEK", + "% WEIGHTED ILI", + "%UNWEIGHTED ILI", + "AGE 0-4", + "AGE 25-49", + "AGE 25-64", + "AGE 5-24", + "AGE 50-64", + "AGE 65", + "ILITOTAL", + "NUM. 
OF PROVIDERS", + "TOTAL PATIENTS", + ]: + raise Exception("header row has changed") + if len(row) == 1 or row[0] == "REGION TYPE": + # this is a header row + return None + if row[5] == "X": + # ILI isn't reported, ignore this row + return None + return { + "location": fluview_locations.get_location_name(*row[:2]), + "epiweek": join_epiweek(int(row[2]), int(row[3])), + "wili": optional_float(*row[4:6]), + "ili": float(row[5]), + "age0": optional_int(row[6]), + "age1": optional_int(row[9]), + "age2": optional_int(row[8]), + "age3": optional_int(row[7]), + "age4": optional_int(row[10]), + "age5": optional_int(row[11]), + "n_ili": optional_int(row[12]), + "n_providers": optional_int(row[13]), + "n_patients": optional_int(row[14]), + } + def get_clinical_data(row): - if row[0] == 'REGION TYPE' and row != [ - 'REGION TYPE', - 'REGION', - 'YEAR', - 'WEEK', - 'TOTAL SPECIMENS', - 'TOTAL A', - 'TOTAL B', - 'PERCENT POSITIVE', - 'PERCENT A', - 'PERCENT B' - ]: - raise Exception('header row has changed for clinical lab data.') - if len(row) == 1 or row[0] == 'REGION TYPE': - # this is a header row - return None - if row[4] == 'X': - # data is not reported, ignore this row - return None - # ignore percentage calculations for now - return { - 'location': fluview_locations.get_location_name(*row[:2]), - 'epiweek': join_epiweek(int(row[2]), int(row[3])), - 'total_specimens': int(row[4]), - 'total_a': optional_int(row[5]), - 'total_b': optional_int(row[6]), - 'percent_positive': nullable_float(row[7]), - 'percent_a': nullable_float(row[8]), - 'percent_b': nullable_float(row[9]) - } + if row[0] == "REGION TYPE" and row != [ + "REGION TYPE", + "REGION", + "YEAR", + "WEEK", + "TOTAL SPECIMENS", + "TOTAL A", + "TOTAL B", + "PERCENT POSITIVE", + "PERCENT A", + "PERCENT B", + ]: + raise Exception("header row has changed for clinical lab data.") + if len(row) == 1 or row[0] == "REGION TYPE": + # this is a header row + return None + if row[4] == "X": + # data is not reported, ignore this row + return None + # ignore percentage calculations for now + return { + "location": fluview_locations.get_location_name(*row[:2]), + "epiweek": join_epiweek(int(row[2]), int(row[3])), + "total_specimens": int(row[4]), + "total_a": optional_int(row[5]), + "total_b": optional_int(row[6]), + "percent_positive": nullable_float(row[7]), + "percent_a": nullable_float(row[8]), + "percent_b": nullable_float(row[9]), + } + def get_public_data(row): - hrow1 = [ - 'REGION TYPE', - 'REGION', - 'SEASON_DESCRIPTION', - 'TOTAL SPECIMENS', - 'A (2009 H1N1)', - 'A (H3)', - 'A (Subtyping not Performed)', - 'B', - 'BVic', - 'BYam', - 'H3N2v' - ] - hrow2 = [ - 'REGION TYPE', - 'REGION', - 'YEAR', - 'WEEK', - 'TOTAL SPECIMENS', - 'A (2009 H1N1)', - 'A (H3)', - 'A (Subtyping not Performed)', - 'B', - 'BVic', - 'BYam', - 'H3N2v' - ] - if row[0] == 'REGION TYPE' and row != hrow1 and row != hrow2: - raise Exception('header row has changed for public health lab data.') - if len(row) == 1 or row[0] == 'REGION TYPE': - # header row - return None - if row[3] == 'X': - # data is not reported, ignore this row - return None - # handle case where data is reported by season, not by epiweek - is_weekly = len(row) == len(hrow2) - # set epiweek - if is_weekly: - epiweek = join_epiweek(int(row[2]), int(row[3])) - else: - epiweek = int(row[2][7:11]) * 100 + 40 - # row offset - offset = 1 if is_weekly else 0 - return { - 'location': fluview_locations.get_location_name(*row[:2]), - 'epiweek': epiweek, - 'total_specimens': int(row[3 + offset]), - 'total_a_h1n1': 
optional_int(row[4+ offset]), - 'total_a_h3': optional_int(row[5 + offset]), - 'total_a_h3n2v': optional_int(row[10 + offset]), - 'total_a_no_sub': optional_int(row[6 + offset]), - 'total_b': optional_int(row[7 + offset]), - 'total_b_vic': optional_int(row[8 + offset]), - 'total_b_yam': optional_int(row[9 + offset]) - } - -def load_zipped_csv(filename, sheetname='ILINet.csv'): - """Read rows from a zipped CSV, which is expected to be named as specified - by the sheetname parameter. Default is ILINet.csv, for the default flu data.""" - with zipfile.ZipFile(filename) as f: - with f.open(sheetname) as ff: - return [row for row in csv.reader(io.StringIO(str(ff.read(), 'utf-8')))] - -def get_rows(cnx, table='fluview'): - """Count and return the number of rows in the `fluview` table. - Looking at the fluview table by default, but may pass parameter - to look at public health or clinical lab data instead.""" - select = cnx.cursor() - select.execute('SELECT count(1) num FROM %s' % table) - for (num,) in select: - pass - select.close() - return num + hrow1 = [ + "REGION TYPE", + "REGION", + "SEASON_DESCRIPTION", + "TOTAL SPECIMENS", + "A (2009 H1N1)", + "A (H3)", + "A (Subtyping not Performed)", + "B", + "BVic", + "BYam", + "H3N2v", + ] + hrow2 = [ + "REGION TYPE", + "REGION", + "YEAR", + "WEEK", + "TOTAL SPECIMENS", + "A (2009 H1N1)", + "A (H3)", + "A (Subtyping not Performed)", + "B", + "BVic", + "BYam", + "H3N2v", + ] + if row[0] == "REGION TYPE" and row != hrow1 and row != hrow2: + raise Exception("header row has changed for public health lab data.") + if len(row) == 1 or row[0] == "REGION TYPE": + # header row + return None + if row[3] == "X": + # data is not reported, ignore this row + return None + # handle case where data is reported by season, not by epiweek + is_weekly = len(row) == len(hrow2) + # set epiweek + if is_weekly: + epiweek = join_epiweek(int(row[2]), int(row[3])) + else: + epiweek = int(row[2][7:11]) * 100 + 40 + # row offset + offset = 1 if is_weekly else 0 + return { + "location": fluview_locations.get_location_name(*row[:2]), + "epiweek": epiweek, + "total_specimens": int(row[3 + offset]), + "total_a_h1n1": optional_int(row[4 + offset]), + "total_a_h3": optional_int(row[5 + offset]), + "total_a_h3n2v": optional_int(row[10 + offset]), + "total_a_no_sub": optional_int(row[6 + offset]), + "total_b": optional_int(row[7 + offset]), + "total_b_vic": optional_int(row[8 + offset]), + "total_b_yam": optional_int(row[9 + offset]), + } + + +def load_zipped_csv(filename, sheetname="ILINet.csv"): + """Read rows from a zipped CSV, which is expected to be named as specified + by the sheetname parameter. Default is ILINet.csv, for the default flu data.""" + with zipfile.ZipFile(filename) as f: + with f.open(sheetname) as ff: + return [row for row in csv.reader(io.StringIO(str(ff.read(), "utf-8")))] + + +def get_rows(cnx, table="fluview"): + """Count and return the number of rows in the `fluview` table. + Looking at the fluview table by default, but may pass parameter + to look at public health or clinical lab data instead.""" + select = cnx.cursor() + select.execute(f"SELECT count(1) num FROM {table}") + for (num,) in select: + pass + select.close() + return num + def update_from_file_clinical(issue, date, filename, test_mode=False): - """ - Read WHO/NREVSS data from a zipped CSV and insert into (or update) the database. 
- """ - - # database connection - u, p = secrets.db.epi - cnx = mysql.connector.connect(user=u, password=p, database='epidata') - rows1 = get_rows(cnx, CL_TABLE) - print('rows before: %d' % (rows1)) - insert = cnx.cursor() - - # load the data, ignoring empty rows - print('loading data from %s as issued on %d' % (filename, issue)) - rows = load_zipped_csv(filename, CL_SHEET) - print(' loaded %d rows' % len(rows)) - data = [get_clinical_data(row) for row in rows] - entries = [obj for obj in data if obj] - print(' found %d entries' % len(entries)) - - sql = ''' - INSERT INTO - `fluview_clinical` (`release_date`, `issue`, `epiweek`, `region`, `lag`, - `total_specimens`, `total_a`, `total_b`, `percent_positive`, `percent_a`, - `percent_b`) - VALUES - (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) - ON DUPLICATE KEY UPDATE - `release_date` = least(`release_date`, %s), - `total_specimens` = %s, - `total_a` = %s, - `total_b` = %s, - `percent_positive` = %s, - `percent_a` = %s, - `percent_b` = %s - ''' - - # insert each row - insert = cnx.cursor() - for row in entries: - lag = delta_epiweeks(row['epiweek'], issue) - args = [ - row['total_specimens'], row['total_a'], row['total_b'], - row['percent_positive'], row['percent_a'], row['percent_b'] - ] - ins_args = [date, issue, row['epiweek'], row['location'], lag] + args - upd_args = [date] + args - insert.execute(sql, ins_args + upd_args) - - # cleanup - insert.close() - if test_mode: - print('test mode, not committing') - rows2 = rows1 - else: - cnx.commit() - rows2 = get_rows(cnx) - print('rows after: %d (added %d)' % (rows2, rows2 - rows1)) - cnx.close() + """ + Read WHO/NREVSS data from a zipped CSV and insert into (or update) the database. + """ + + # database connection + u, p = secrets.db.epi + cnx = mysql.connector.connect(user=u, password=p, database="epidata") + rows1 = get_rows(cnx, CL_TABLE) + print(f"rows before: {int(rows1)}") + insert = cnx.cursor() + + # load the data, ignoring empty rows + print(f"loading data from {filename} as issued on {int(issue)}") + rows = load_zipped_csv(filename, CL_SHEET) + print(f" loaded {len(rows)} rows") + data = [get_clinical_data(row) for row in rows] + entries = [obj for obj in data if obj] + print(f" found {len(entries)} entries") + + sql = """ + INSERT INTO + `fluview_clinical` (`release_date`, `issue`, `epiweek`, `region`, `lag`, + `total_specimens`, `total_a`, `total_b`, `percent_positive`, `percent_a`, + `percent_b`) + VALUES + (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) + ON DUPLICATE KEY UPDATE + `release_date` = least(`release_date`, %s), + `total_specimens` = %s, + `total_a` = %s, + `total_b` = %s, + `percent_positive` = %s, + `percent_a` = %s, + `percent_b` = %s + """ + + # insert each row + insert = cnx.cursor() + for row in entries: + lag = delta_epiweeks(row["epiweek"], issue) + args = [ + row["total_specimens"], + row["total_a"], + row["total_b"], + row["percent_positive"], + row["percent_a"], + row["percent_b"], + ] + ins_args = [date, issue, row["epiweek"], row["location"], lag] + args + upd_args = [date] + args + insert.execute(sql, ins_args + upd_args) + + # cleanup + insert.close() + if test_mode: + print("test mode, not committing") + rows2 = rows1 + else: + cnx.commit() + rows2 = get_rows(cnx) + print(f"rows after: {int(rows2)} (added {int(rows2 - rows1)})") + cnx.close() + def update_from_file_public(issue, date, filename, test_mode=False): - """ - Read WHO/NREVSS data from a zipped CSV and insert into (or update) the database. 
- """ - - # database connection - u, p = secrets.db.epi - cnx = mysql.connector.connect(user=u, password=p, database='epidata') - rows1 = get_rows(cnx, PHL_TABLE) - print('rows before: %d' % (rows1)) - insert = cnx.cursor() - - # load the data, ignoring empty rows - print('loading data from %s as issued on %d' % (filename, issue)) - rows = load_zipped_csv(filename, PHL_SHEET) - print(' loaded %d rows' % len(rows)) - data = [get_public_data(row) for row in rows] - entries = [obj for obj in data if obj] - print(' found %d entries' % len(entries)) - - sql = ''' - INSERT INTO - `fluview_public` (`release_date`, `issue`, `epiweek`, `region`, `lag`, - `total_specimens`, `total_a_h1n1`, `total_a_h3`, `total_a_h3n2v`, - `total_a_no_sub`, `total_b`, `total_b_vic`, `total_b_yam`) - VALUES - (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) - ON DUPLICATE KEY UPDATE - `release_date` = least(`release_date`, %s), - `total_specimens` = %s, - `total_a_h1n1` = %s, - `total_a_h3` = %s, - `total_a_h3n2v` = %s, - `total_a_no_sub` = %s, - `total_b` = %s, - `total_b_vic` = %s, - `total_b_yam` = %s - ''' - - # insert each row - insert = cnx.cursor() - for row in entries: - lag = delta_epiweeks(row['epiweek'], issue) - args = [ - row['total_specimens'], row['total_a_h1n1'], row['total_a_h3'], - row['total_a_h3n2v'], row['total_a_no_sub'], row['total_b'], - row['total_b_vic'], row['total_b_yam'] - ] - ins_args = [date, issue, row['epiweek'], row['location'], lag] + args - upd_args = [date] + args - insert.execute(sql, ins_args + upd_args) - - # cleanup - insert.close() - if test_mode: - print('test mode, not committing') - rows2 = rows1 - else: - cnx.commit() - rows2 = get_rows(cnx) - print('rows after: %d (added %d)' % (rows2, rows2 - rows1)) - cnx.close() + """ + Read WHO/NREVSS data from a zipped CSV and insert into (or update) the database. 
+ """ + + # database connection + u, p = secrets.db.epi + cnx = mysql.connector.connect(user=u, password=p, database="epidata") + rows1 = get_rows(cnx, PHL_TABLE) + print(f"rows before: {int(rows1)}") + insert = cnx.cursor() + + # load the data, ignoring empty rows + print(f"loading data from {filename} as issued on {int(issue)}") + rows = load_zipped_csv(filename, PHL_SHEET) + print(f" loaded {len(rows)} rows") + data = [get_public_data(row) for row in rows] + entries = [obj for obj in data if obj] + print(f" found {len(entries)} entries") + + sql = """ + INSERT INTO + `fluview_public` (`release_date`, `issue`, `epiweek`, `region`, `lag`, + `total_specimens`, `total_a_h1n1`, `total_a_h3`, `total_a_h3n2v`, + `total_a_no_sub`, `total_b`, `total_b_vic`, `total_b_yam`) + VALUES + (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) + ON DUPLICATE KEY UPDATE + `release_date` = least(`release_date`, %s), + `total_specimens` = %s, + `total_a_h1n1` = %s, + `total_a_h3` = %s, + `total_a_h3n2v` = %s, + `total_a_no_sub` = %s, + `total_b` = %s, + `total_b_vic` = %s, + `total_b_yam` = %s + """ + + # insert each row + insert = cnx.cursor() + for row in entries: + lag = delta_epiweeks(row["epiweek"], issue) + args = [ + row["total_specimens"], + row["total_a_h1n1"], + row["total_a_h3"], + row["total_a_h3n2v"], + row["total_a_no_sub"], + row["total_b"], + row["total_b_vic"], + row["total_b_yam"], + ] + ins_args = [date, issue, row["epiweek"], row["location"], lag] + args + upd_args = [date] + args + insert.execute(sql, ins_args + upd_args) + + # cleanup + insert.close() + if test_mode: + print("test mode, not committing") + rows2 = rows1 + else: + cnx.commit() + rows2 = get_rows(cnx) + print(f"rows after: {int(rows2)} (added {int(rows2 - rows1)})") + cnx.close() + def update_from_file(issue, date, filename, test_mode=False): - """ - Read ILINet data from a zipped CSV and insert into (or update) the database. 
- """ - - # database connection - u, p = secrets.db.epi - cnx = mysql.connector.connect(user=u, password=p, database='epidata') - rows1 = get_rows(cnx) - print('rows before: %d' % (rows1)) - insert = cnx.cursor() - - # load the data, ignoring empty rows - print('loading data from %s as issued on %d' % (filename, issue)) - rows = load_zipped_csv(filename) - print(' loaded %d rows' % len(rows)) - data = [get_ilinet_data(row) for row in rows] - entries = [obj for obj in data if obj] - print(' found %d entries' % len(entries)) - - sql = ''' - INSERT INTO - `fluview` (`release_date`, `issue`, `epiweek`, `region`, `lag`, `num_ili`, - `num_patients`, `num_providers`, `wili`, `ili`, `num_age_0`, `num_age_1`, - `num_age_2`, `num_age_3`, `num_age_4`, `num_age_5`) - VALUES - (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) - ON DUPLICATE KEY UPDATE - `release_date` = least(`release_date`, %s), - `num_ili` = %s, - `num_patients` = %s, - `num_providers` = %s, - `wili` = %s, - `ili` = %s, - `num_age_0` = coalesce(%s, `num_age_0`), - `num_age_1` = coalesce(%s, `num_age_1`), - `num_age_2` = coalesce(%s, `num_age_2`), - `num_age_3` = coalesce(%s, `num_age_3`), - `num_age_4` = coalesce(%s, `num_age_4`), - `num_age_5` = coalesce(%s, `num_age_5`) - ''' - - # insert each row - insert = cnx.cursor() - for row in entries: - lag = delta_epiweeks(row['epiweek'], issue) - args = [ - row['n_ili'], row['n_patients'], row['n_providers'], row['wili'], - row['ili'], row['age0'], row['age1'], row['age2'], row['age3'], - row['age4'], row['age5'] - ] - ins_args = [date, issue, row['epiweek'], row['location'], lag] + args - upd_args = [date] + args - insert.execute(sql, ins_args + upd_args) - - # cleanup - insert.close() - if test_mode: - print('test mode, not committing') - rows2 = rows1 - else: - cnx.commit() - rows2 = get_rows(cnx) - print('rows after: %d (added %d)' % (rows2, rows2 - rows1)) - cnx.close() + """ + Read ILINet data from a zipped CSV and insert into (or update) the database. 
+ """ + + # database connection + u, p = secrets.db.epi + cnx = mysql.connector.connect(user=u, password=p, database="epidata") + rows1 = get_rows(cnx) + print(f"rows before: {int(rows1)}") + insert = cnx.cursor() + + # load the data, ignoring empty rows + print(f"loading data from {filename} as issued on {int(issue)}") + rows = load_zipped_csv(filename) + print(f" loaded {len(rows)} rows") + data = [get_ilinet_data(row) for row in rows] + entries = [obj for obj in data if obj] + print(f" found {len(entries)} entries") + + sql = """ + INSERT INTO + `fluview` (`release_date`, `issue`, `epiweek`, `region`, `lag`, `num_ili`, + `num_patients`, `num_providers`, `wili`, `ili`, `num_age_0`, `num_age_1`, + `num_age_2`, `num_age_3`, `num_age_4`, `num_age_5`) + VALUES + (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) + ON DUPLICATE KEY UPDATE + `release_date` = least(`release_date`, %s), + `num_ili` = %s, + `num_patients` = %s, + `num_providers` = %s, + `wili` = %s, + `ili` = %s, + `num_age_0` = coalesce(%s, `num_age_0`), + `num_age_1` = coalesce(%s, `num_age_1`), + `num_age_2` = coalesce(%s, `num_age_2`), + `num_age_3` = coalesce(%s, `num_age_3`), + `num_age_4` = coalesce(%s, `num_age_4`), + `num_age_5` = coalesce(%s, `num_age_5`) + """ + + # insert each row + insert = cnx.cursor() + for row in entries: + lag = delta_epiweeks(row["epiweek"], issue) + args = [ + row["n_ili"], + row["n_patients"], + row["n_providers"], + row["wili"], + row["ili"], + row["age0"], + row["age1"], + row["age2"], + row["age3"], + row["age4"], + row["age5"], + ] + ins_args = [date, issue, row["epiweek"], row["location"], lag] + args + upd_args = [date] + args + insert.execute(sql, ins_args + upd_args) + + # cleanup + insert.close() + if test_mode: + print("test mode, not committing") + rows2 = rows1 + else: + cnx.commit() + rows2 = get_rows(cnx) + print(f"rows after: {int(rows2)} (added {int(rows2 - rows1)})") + cnx.close() + def main(): - # args and usage - parser = argparse.ArgumentParser() - parser.add_argument( - '--test', - action='store_true', - help='do dry run only, do not update the database' - ) - parser.add_argument( - '--file', - type=str, - help='load an existing zip file (otherwise fetch current data)' - ) - parser.add_argument( - '--issue', - type=int, - help='issue of the file (e.g. 
201740); used iff --file is given' - ) - args = parser.parse_args() - - if (args.file is None) != (args.issue is None): - raise Exception('--file and --issue must both be present or absent') - - date = datetime.datetime.now().strftime('%Y-%m-%d') - print('assuming release date is today, %s' % date) - - if args.file: - update_from_file(args.issue, date, args.file, test_mode=args.test) - update_from_file_clinical(args.issue, date, args.file, test_mode=args.test) - # TODO: header row has changed for public health lab data - # update_from_file_public(args.issue, date, args.file, test_mode=args.test) - else: - issue, files = fluview.save_latest(path='flu_data') - for filename in files: - update_from_file(issue, date, filename, test_mode=args.test) - update_from_file_clinical(issue, date, filename, test_mode=args.test) - # TODO: header row has changed for public health lab data - # update_from_file_public(issue, date, filename, test_mode=args.test) - -if __name__ == '__main__': - main() + # args and usage + parser = argparse.ArgumentParser() + # fmt: off + parser.add_argument( + "--test", + action="store_true", + help="do dry run only, do not update the database" + ) + parser.add_argument( + "--file", + type=str, + help="load an existing zip file (otherwise fetch current data)" + ) + parser.add_argument( + "--issue", + type=int, + help="issue of the file (e.g. 201740); used iff --file is given" + ) + # fmt: on + args = parser.parse_args() + + if (args.file is None) != (args.issue is None): + raise Exception("--file and --issue must both be present or absent") + + date = datetime.datetime.now().strftime("%Y-%m-%d") + print(f"assuming release date is today, {date}") + + if args.file: + update_from_file(args.issue, date, args.file, test_mode=args.test) + update_from_file_clinical(args.issue, date, args.file, test_mode=args.test) + # TODO: header row has changed for public health lab data + # update_from_file_public(args.issue, date, args.file, test_mode=args.test) + else: + issue, files = fluview.save_latest(path="flu_data") + for filename in files: + update_from_file(issue, date, filename, test_mode=args.test) + update_from_file_clinical(issue, date, filename, test_mode=args.test) + # TODO: header row has changed for public health lab data + # update_from_file_public(issue, date, filename, test_mode=args.test) + + +if __name__ == "__main__": + main() diff --git a/src/acquisition/fluview/impute_missing_values.py b/src/acquisition/fluview/impute_missing_values.py index 7f9a23231..4b3e1d684 100644 --- a/src/acquisition/fluview/impute_missing_values.py +++ b/src/acquisition/fluview/impute_missing_values.py @@ -59,290 +59,283 @@ class Database: - """Database wrapper and abstraction layer.""" - - class Sql: - """Container for SQL constants.""" - - # Count the total number of imputed rows. - count_rows = ''' - SELECT - count(1) `num` - FROM - `fluview_imputed` - ''' - - # Find (issue, epiweek) pairs that exist in table `fluview` but not in - # table `fluview_imputed`. Note that only issues >= 201740 are selected - # because that's when CDC first started posting state-level ILINet data. - # This assumes that `fluview` is always missing at least one location. - find_missing_rows = ''' - SELECT - fv.`issue`, fv.`epiweek` - FROM ( + """Database wrapper and abstraction layer.""" + + class Sql: + """Container for SQL constants.""" + + # Count the total number of imputed rows. 
+ count_rows = """ SELECT - `issue`, `epiweek` + count(1) `num` FROM - `fluview` + `fluview_imputed` + """ + + # Find (issue, epiweek) pairs that exist in table `fluview` but not in + # table `fluview_imputed`. Note that only issues >= 201740 are selected + # because that's when CDC first started posting state-level ILINet data. + # This assumes that `fluview` is always missing at least one location. + find_missing_rows = """ + SELECT + fv.`issue`, fv.`epiweek` + FROM ( + SELECT + `issue`, `epiweek` + FROM + `fluview` + WHERE + `issue` >= 201740 + GROUP BY + `issue`, `epiweek` + ) fv + LEFT JOIN ( + SELECT + `issue`, `epiweek` + FROM + `fluview_imputed` + GROUP BY + `issue`, `epiweek` + ) fvi + ON + fvi.`issue` = fv.`issue` AND fvi.`epiweek` = fv.`epiweek` WHERE - `issue` >= 201740 - GROUP BY - `issue`, `epiweek` - ) fv - LEFT JOIN ( + fvi.`issue` IS NULL + """ + + # Read all location rows from the `fluview` table for a given issue and + # epiweek. + get_known_values = """ SELECT - `issue`, `epiweek` + `region`, `num_ili`, `num_patients`, `num_providers` FROM - `fluview_imputed` - GROUP BY - `issue`, `epiweek` - ) fvi - ON - fvi.`issue` = fv.`issue` AND fvi.`epiweek` = fv.`epiweek` - WHERE - fvi.`issue` IS NULL - ''' - - # Read all location rows from the `fluview` table for a given issue and - # epiweek. - get_known_values = ''' - SELECT - `region`, `num_ili`, `num_patients`, `num_providers` - FROM - `fluview` - WHERE - `issue` = %s AND `epiweek` = %s - ''' - - # Insert location rows into the `fluview_imputed` table for a given issue - # and epiweek. - add_imputed_values = ''' - INSERT INTO - `fluview_imputed` ( - `issue`, - `epiweek`, - `region`, - `lag`, - `num_ili`, - `num_patients`, - `num_providers`, - `ili` - ) - VALUES - (%s, %s, %s, %s, %s, %s, %s, %s) - ''' - - def connect(self): - """Connect to the database.""" - u, p = secrets.db.epi - self.cnx = mysql.connector.connect(user=u, password=p, database='epidata') - self.cur = self.cnx.cursor() - - def close(self, commit): - """ - Close the connection to the database, committing or rolling back changes as - indicated. - """ - self.cur.close() - if commit: - self.cnx.commit() - else: - print('test mode, not committing') - self.cnx.close() - - def count_rows(self): - """Count and return the number of rows in the `fluview_imputed` table.""" - self.cur.execute(Database.Sql.count_rows) - for (num,) in self.cur: - return num - - def find_missing_rows(self): - """ - Find rows that still have missing values. Each missing row is uniquely - identified by an (issue, epiweek, location) tuple. This function finds the - first two. - """ + `fluview` + WHERE + `issue` = %s AND `epiweek` = %s + """ + + # Insert location rows into the `fluview_imputed` table for a given issue + # and epiweek. + add_imputed_values = """ + INSERT INTO + `fluview_imputed` ( + `issue`, + `epiweek`, + `region`, + `lag`, + `num_ili`, + `num_patients`, + `num_providers`, + `ili` + ) + VALUES + (%s, %s, %s, %s, %s, %s, %s, %s) + """ + + def connect(self): + """Connect to the database.""" + u, p = secrets.db.epi + self.cnx = mysql.connector.connect(user=u, password=p, database="epidata") + self.cur = self.cnx.cursor() + + def close(self, commit): + """ + Close the connection to the database, committing or rolling back changes as + indicated. 
+ """ + self.cur.close() + if commit: + self.cnx.commit() + else: + print("test mode, not committing") + self.cnx.close() + + def count_rows(self): + """Count and return the number of rows in the `fluview_imputed` table.""" + self.cur.execute(Database.Sql.count_rows) + for (num,) in self.cur: + return num + + def find_missing_rows(self): + """ + Find rows that still have missing values. Each missing row is uniquely + identified by an (issue, epiweek, location) tuple. This function finds the + first two. + """ + + self.cur.execute(Database.Sql.find_missing_rows) + return [(issue, epiweek) for (issue, epiweek) in self.cur] + + def get_known_values(self, issue, epiweek): + """ + Fetch ILINet data for all locations available for the given issue and + epiweek. The returned value is a dict mapping from locations to ILI data. + """ + + self.cur.execute(Database.Sql.get_known_values, (issue, epiweek)) + return {loc: (n_ili, n_pat, n_prov) for (loc, n_ili, n_pat, n_prov) in self.cur} + + def add_imputed_values(self, issue, epiweek, imputed): + """ + Store imputed ILINet data for the given locations on the given issue and + epiweek. The imputed value is a dict mapping from locations to ILI data. + """ + + for loc in imputed.keys(): + lag, n_ili, n_pat, n_prov, ili = imputed[loc] + args = (issue, epiweek, loc, lag, n_ili, n_pat, n_prov, ili) + self.cur.execute(Database.Sql.add_imputed_values, args) - self.cur.execute(Database.Sql.find_missing_rows) - return [(issue, epiweek) for (issue, epiweek) in self.cur] - def get_known_values(self, issue, epiweek): - """ - Fetch ILINet data for all locations available for the given issue and - epiweek. The returned value is a dict mapping from locations to ILI data. - """ +class StatespaceException(Exception): + """Used to indicate that imputation is not possible with the given inputs.""" - self.cur.execute(Database.Sql.get_known_values, (issue, epiweek)) - return dict([ - (loc, (n_ili, n_pat, n_prov)) - for - (loc, n_ili, n_pat, n_prov) - in self.cur - ]) - def add_imputed_values(self, issue, epiweek, imputed): +def get_location_graph(): """ - Store imputed ILINet data for the given locations on the given issue and - epiweek. The imputed value is a dict mapping from locations to ILI data. + Return a matrix where rows represent regions, columns represent atoms, and + each entry is a 1 if the region contains the atom, otherwise 0. The + corresponding lists of regions and atoms are also returned. """ - for loc in imputed.keys(): - lag, n_ili, n_pat, n_prov, ili = imputed[loc] - args = (issue, epiweek, loc, lag, n_ili, n_pat, n_prov, ili) - self.cur.execute(Database.Sql.add_imputed_values, args) - - -class StatespaceException(Exception): - """Used to indicate that imputation is not possible with the given inputs.""" - - -def get_location_graph(): - """ - Return a matrix where rows represent regions, columns represent atoms, and - each entry is a 1 if the region contains the atom, otherwise 0. The - corresponding lists of regions and atoms are also returned. 
- """ - - regions = sorted(Locations.region_list) - atoms = sorted(Locations.atom_list) - graph = np.zeros((len(regions), len(atoms))) - for i, r in enumerate(regions): - for a in Locations.region_map[r]: - j = atoms.index(a) - graph[i, j] = 1 - return graph, regions, atoms + regions = sorted(Locations.region_list) + atoms = sorted(Locations.atom_list) + graph = np.zeros((len(regions), len(atoms))) + for i, r in enumerate(regions): + for a in Locations.region_map[r]: + j = atoms.index(a) + graph[i, j] = 1 + return graph, regions, atoms def get_fusion_parameters(known_locations): - """ - Return a matrix that fuses known ILI values into unknown ILI values. The - corresponding lists of known and unknown locations are also returned. + """ + Return a matrix that fuses known ILI values into unknown ILI values. The + corresponding lists of known and unknown locations are also returned. - The goal is to infer ILI data in all locations, given ILI data in some - partial set of locations. This function takes a sensor fusion approach. + The goal is to infer ILI data in all locations, given ILI data in some + partial set of locations. This function takes a sensor fusion approach. - Let $z$ be a column vector of values in reported locations. Let $y$ be the - desired column vector of values in unreported locations. With matrices $H$ - (mapping from latent state to reported values), $W$ (mapping from latent - state to unreported values), and $R = I$ (covariance, which is identity): + Let $z$ be a column vector of values in reported locations. Let $y$ be the + desired column vector of values in unreported locations. With matrices $H$ + (mapping from latent state to reported values), $W$ (mapping from latent + state to unreported values), and $R = I$ (covariance, which is identity): - $y = W (H^T R^{-1} H)^{-1} H^T R^{-1} z$ - $y = W (H^T H)^{-1} H^T z$ + $y = W (H^T R^{-1} H)^{-1} H^T R^{-1} z$ + $y = W (H^T H)^{-1} H^T z$ - This is equavalent to OLS regression with an added translation from atomic - locations to missing locations. Unknown values are computed as a linear - combination of known values. - """ + This is equavalent to OLS regression with an added translation from atomic + locations to missing locations. Unknown values are computed as a linear + combination of known values. 
+ """ - graph, regions, atoms = get_location_graph() - is_known = np.array([r in known_locations for r in regions]) - is_unknown = np.logical_not(is_known) - if not np.any(is_known): - raise StatespaceException('no values are known') - if not np.any(is_unknown): - raise StatespaceException('no values are unknown') + graph, regions, atoms = get_location_graph() + is_known = np.array([r in known_locations for r in regions]) + is_unknown = np.logical_not(is_known) + if not np.any(is_known): + raise StatespaceException("no values are known") + if not np.any(is_unknown): + raise StatespaceException("no values are unknown") - H = graph[is_known, :] - W = graph[is_unknown, :] - if np.linalg.matrix_rank(H) != len(atoms): - raise StatespaceException('system is underdetermined') + H = graph[is_known, :] + W = graph[is_unknown, :] + if np.linalg.matrix_rank(H) != len(atoms): + raise StatespaceException("system is underdetermined") - HtH = np.dot(H.T, H) - HtH_inv = np.linalg.inv(HtH) - H_pseudo_inv = np.dot(HtH_inv, H.T) - fuser = np.dot(W, H_pseudo_inv) + HtH = np.dot(H.T, H) + HtH_inv = np.linalg.inv(HtH) + H_pseudo_inv = np.dot(HtH_inv, H.T) + fuser = np.dot(W, H_pseudo_inv) - locations = np.array(regions) - filter_locations = lambda selected: list(map(str, locations[selected])) - return fuser, filter_locations(is_known), filter_locations(is_unknown) + locations = np.array(regions) + filter_locations = lambda selected: list(map(str, locations[selected])) + return fuser, filter_locations(is_known), filter_locations(is_unknown) def get_lag_and_ili(issue, epiweek, num_ili, num_patients): - """ - Compute and return reporting lag and percent ILI from imputed ILINet data. - """ - lag = delta_epiweeks(epiweek, issue) - ili = 100.0 * (0 if num_patients == 0 else num_ili / num_patients) - return lag, ili + """ + Compute and return reporting lag and percent ILI from imputed ILINet data. + """ + lag = delta_epiweeks(epiweek, issue) + ili = 100.0 * (0 if num_patients == 0 else num_ili / num_patients) + return lag, ili def impute_missing_values(database, test_mode=False): - """ - Determine whether values are missing for any states and territories. If so, - impute them and store them in the database. - """ - - # database connection - database.connect() - rows1 = database.count_rows() - print('rows before: %d' % (rows1)) - - # iterate over missing epiweeks - missing_rows = database.find_missing_rows() - print('missing data for %d epiweeks' % len(missing_rows)) - for issue, epiweek in missing_rows: - print('i=%d e=%d' % (issue, epiweek)) - - # get known values from table `fluview` - known_values = database.get_known_values(issue, epiweek) - - # Unlike most other state-level data, which typically begins publicly on - # 2010w40, data for PR begins on 2013w40. Before this, there are no reports - # for PR. Here we assume that no report is equivalent to a report of all - # zeros (number of ILI, patients, and providers). That's mostly true, with - # the notable exception of wILI, but that's not relevant here. By assuming - # that PR reports zero on those weeks, it's possible to impute values for - # VI, which are otherwise not reported until 2015w40. 
- assume_pr_zero = epiweek < 201340 and 'pr' not in known_values - if assume_pr_zero: - known_values['pr'] = (0, 0, 0) - - # get the imputation matrix and lists of known and unknown locations - F, known, unknown = get_fusion_parameters(known_values.keys()) - - # finally, impute the missing values - z = np.array([known_values[k] for k in known]) - y = np.dot(F, z) - - # possibly also record the assumptions made for PR - if assume_pr_zero: - unknown.append('pr') - y = np.vstack((y, [known_values['pr']])) - - # add lag and percent ILI to the data for each imputed location - imputed_values = {} - for loc, values in zip(unknown, y): - n_ili, n_pat, n_prov = map(int, np.rint(values)) - lag, ili = get_lag_and_ili(issue, epiweek, n_ili, n_pat) - imputed_values[loc] = (lag, n_ili, n_pat, n_prov, ili) - print(' %s: %s' % (loc, str(imputed_values[loc]))) - - # save all imputed values in table `fluview_imputed` - database.add_imputed_values(issue, epiweek, imputed_values) - - # database cleanup - rows2 = database.count_rows() - print('rows after: %d (added %d)' % (rows2, rows2 - rows1)) - commit = not test_mode - database.close(commit) + """ + Determine whether values are missing for any states and territories. If so, + impute them and store them in the database. + """ + + # database connection + database.connect() + rows1 = database.count_rows() + print(f"rows before: {int(rows1)}") + + # iterate over missing epiweeks + missing_rows = database.find_missing_rows() + print(f"missing data for {len(missing_rows)} epiweeks") + for issue, epiweek in missing_rows: + print(f"i={int(issue)} e={int(epiweek)}") + + # get known values from table `fluview` + known_values = database.get_known_values(issue, epiweek) + + # Unlike most other state-level data, which typically begins publicly on + # 2010w40, data for PR begins on 2013w40. Before this, there are no reports + # for PR. Here we assume that no report is equivalent to a report of all + # zeros (number of ILI, patients, and providers). That's mostly true, with + # the notable exception of wILI, but that's not relevant here. By assuming + # that PR reports zero on those weeks, it's possible to impute values for + # VI, which are otherwise not reported until 2015w40. 
+ assume_pr_zero = epiweek < 201340 and "pr" not in known_values + if assume_pr_zero: + known_values["pr"] = (0, 0, 0) + + # get the imputation matrix and lists of known and unknown locations + F, known, unknown = get_fusion_parameters(known_values.keys()) + + # finally, impute the missing values + z = np.array([known_values[k] for k in known]) + y = np.dot(F, z) + + # possibly also record the assumptions made for PR + if assume_pr_zero: + unknown.append("pr") + y = np.vstack((y, [known_values["pr"]])) + + # add lag and percent ILI to the data for each imputed location + imputed_values = {} + for loc, values in zip(unknown, y): + n_ili, n_pat, n_prov = map(int, np.rint(values)) + lag, ili = get_lag_and_ili(issue, epiweek, n_ili, n_pat) + imputed_values[loc] = (lag, n_ili, n_pat, n_prov, ili) + print(f" {loc}: {str(imputed_values[loc])}") + + # save all imputed values in table `fluview_imputed` + database.add_imputed_values(issue, epiweek, imputed_values) + + # database cleanup + rows2 = database.count_rows() + print(f"rows after: {int(rows2)} (added {int(rows2 - rows1)})") + commit = not test_mode + database.close(commit) def get_argument_parser(): - """Set up command line arguments and usage.""" - parser = argparse.ArgumentParser() - parser.add_argument( - '--test', - action='store_true', - help='do dry run only, do not update the database' - ) - return parser + """Set up command line arguments and usage.""" + parser = argparse.ArgumentParser() + parser.add_argument( + "--test", action="store_true", help="do dry run only, do not update the database" + ) + return parser def main(): - """Run this script from the command line.""" - args = get_argument_parser().parse_args() - impute_missing_values(Database(), test_mode=args.test) + """Run this script from the command line.""" + args = get_argument_parser().parse_args() + impute_missing_values(Database(), test_mode=args.test) -if __name__ == '__main__': - main() +if __name__ == "__main__": + main() diff --git a/src/acquisition/ght/ght_update.py b/src/acquisition/ght/ght_update.py index c1e9b8d94..9e8d48d1d 100644 --- a/src/acquisition/ght/ght_update.py +++ b/src/acquisition/ght/ght_update.py @@ -1,4 +1,4 @@ -''' +""" =============== === Purpose === =============== @@ -63,7 +63,7 @@ * fixed multiple-word queries (surround with quotes) 2015-12-01 * Original version -''' +""" # standard library import argparse @@ -88,304 +88,339 @@ # 2010-04-19 and 2015-05-05 # see: https://www.google.com/trends/correlate TERMS = [ - '/m/0cycc', - 'influenza type a', - 'flu duration', - 'flu fever', - 'treating flu', - 'fever flu', - 'flu recovery', - 'braun thermoscan', - 'oscillococcinum', - 'treating the flu', - 'cold or flu', - 'flu versus cold', - 'flu remedies', - 'contagious flu', - 'type a influenza', - 'flu or cold', - 'duration of flu', - 'cold versus flu', - 'flu cough', - 'flu headache', - 'thermoscan', - 'influenza incubation period', - 'flu lasts', - 'length of flu', - 'flu stomach', - 'cold vs flu', - 'flu and fever', - 'getting over the flu', - 'influenza a', - 'treatment for flu', - 'flu length', - 'treatment for the flu', - 'influenza symptoms', - 'over the counter flu', - 'flu complications', - 'cold and flu symptoms', - 'influenza incubation', - 'treatment of flu', - 'human temperature', - 'low body', - 'flu contagious', - 'robitussin ac', - 'flu how long', - 'ear thermometer', - 'flu contagious period', - 'treat flu', - 'cough flu', - 'low body temperature', - 'expectorant', - 'flu and cold', - 'rapid flu', - 'flu vs. 
cold', - 'how to treat the flu', - 'how long does the flu last?', - 'viral pneumonia', - 'flu in kids', - 'type a flu', - 'influenza treatment', - 'fighting the flu', - 'flu relief', - 'treat the flu', - 'flu medicine', - 'dangerous fever', - 'what is influenza', - 'tussin', - 'low body temp', - 'flu care', - 'flu in infants', - 'flu dizziness', - 'feed a fever', - 'flu vs cold', - 'flu vomiting', - 'bacterial pneumonia', - 'flu activity', - 'flu chills', - 'anas barbariae', - 'flu germs', - 'tylenol cold', - 'how to get over the flu', - 'flu in children', - 'influenza a and b', - 'duration of the flu', - 'cold symptoms', - 'flu report', - 'rapid flu test', - 'flu relapse', - 'get over the flu', - 'flu during pregnancy', - 'flu recovery time', - 'cure for flu', - 'tamiflu and breastfeeding', - 'flu chest pain', - 'flu treatment', - 'flu nausea', - 'remedies for the flu', - 'tamiflu in pregnancy', - 'side effects of tamiflu', - 'how to treat flu', - 'viral bronchitis', - 'flu how long contagious', - 'flu remedy', + "/m/0cycc", + "influenza type a", + "flu duration", + "flu fever", + "treating flu", + "fever flu", + "flu recovery", + "braun thermoscan", + "oscillococcinum", + "treating the flu", + "cold or flu", + "flu versus cold", + "flu remedies", + "contagious flu", + "type a influenza", + "flu or cold", + "duration of flu", + "cold versus flu", + "flu cough", + "flu headache", + "thermoscan", + "influenza incubation period", + "flu lasts", + "length of flu", + "flu stomach", + "cold vs flu", + "flu and fever", + "getting over the flu", + "influenza a", + "treatment for flu", + "flu length", + "treatment for the flu", + "influenza symptoms", + "over the counter flu", + "flu complications", + "cold and flu symptoms", + "influenza incubation", + "treatment of flu", + "human temperature", + "low body", + "flu contagious", + "robitussin ac", + "flu how long", + "ear thermometer", + "flu contagious period", + "treat flu", + "cough flu", + "low body temperature", + "expectorant", + "flu and cold", + "rapid flu", + "flu vs. 
cold", + "how to treat the flu", + "how long does the flu last?", + "viral pneumonia", + "flu in kids", + "type a flu", + "influenza treatment", + "fighting the flu", + "flu relief", + "treat the flu", + "flu medicine", + "dangerous fever", + "what is influenza", + "tussin", + "low body temp", + "flu care", + "flu in infants", + "flu dizziness", + "feed a fever", + "flu vs cold", + "flu vomiting", + "bacterial pneumonia", + "flu activity", + "flu chills", + "anas barbariae", + "flu germs", + "tylenol cold", + "how to get over the flu", + "flu in children", + "influenza a and b", + "duration of the flu", + "cold symptoms", + "flu report", + "rapid flu test", + "flu relapse", + "get over the flu", + "flu during pregnancy", + "flu recovery time", + "cure for flu", + "tamiflu and breastfeeding", + "flu chest pain", + "flu treatment", + "flu nausea", + "remedies for the flu", + "tamiflu in pregnancy", + "side effects of tamiflu", + "how to treat flu", + "viral bronchitis", + "flu how long contagious", + "flu remedy", ] # a list of all US states, including DC and the US as a whole LOCATIONS = [ - 'US', - 'AL', - 'AK', - 'AZ', - 'AR', - 'CA', - 'CO', - 'CT', - 'DC', - 'DE', - 'FL', - 'GA', - 'HI', - 'ID', - 'IL', - 'IN', - 'IA', - 'KS', - 'KY', - 'LA', - 'ME', - 'MD', - 'MA', - 'MI', - 'MN', - 'MS', - 'MO', - 'MT', - 'NE', - 'NV', - 'NH', - 'NJ', - 'NM', - 'NY', - 'NC', - 'ND', - 'OH', - 'OK', - 'OR', - 'PA', - 'RI', - 'SC', - 'SD', - 'TN', - 'TX', - 'UT', - 'VT', - 'VA', - 'WA', - 'WV', - 'WI', - 'WY', + "US", + "AL", + "AK", + "AZ", + "AR", + "CA", + "CO", + "CT", + "DC", + "DE", + "FL", + "GA", + "HI", + "ID", + "IL", + "IN", + "IA", + "KS", + "KY", + "LA", + "ME", + "MD", + "MA", + "MI", + "MN", + "MS", + "MO", + "MT", + "NE", + "NV", + "NH", + "NJ", + "NM", + "NY", + "NC", + "ND", + "OH", + "OK", + "OR", + "PA", + "RI", + "SC", + "SD", + "TN", + "TX", + "UT", + "VT", + "VA", + "WA", + "WV", + "WI", + "WY", ] -def update(locations, terms, first=None, last=None, countries=['US']): - # connect to the database - u, p = secrets.db.epi - cnx = mysql.connector.connect(user=u, password=p, database='epidata') - cur = cnx.cursor() +def update(locations, terms, first=None, last=None, countries=["US"]): + # connect to the database + u, p = secrets.db.epi + cnx = mysql.connector.connect(user=u, password=p, database="epidata") + cur = cnx.cursor() - def get_num_rows(): - cur.execute('SELECT count(1) `num` FROM `ght`') - for (num,) in cur: - pass - return num + def get_num_rows(): + cur.execute("SELECT count(1) `num` FROM `ght`") + for (num,) in cur: + pass + return num - # check from 4 weeks preceeding the last week with data through this week - cur.execute('SELECT max(`epiweek`) `ew0`, yearweek(now(), 6) `ew1` FROM `ght`') - for (ew0, ew1) in cur: - ew0 = 200401 if ew0 is None else flu.add_epiweeks(ew0, -4) - ew0 = ew0 if first is None else first - ew1 = ew1 if last is None else last - print('Checking epiweeks between %d and %d...' 
% (ew0, ew1)) + # check from 4 weeks preceeding the last week with data through this week + cur.execute("SELECT max(`epiweek`) `ew0`, yearweek(now(), 6) `ew1` FROM `ght`") + for (ew0, ew1) in cur: + ew0 = 200401 if ew0 is None else flu.add_epiweeks(ew0, -4) + ew0 = ew0 if first is None else first + ew1 = ew1 if last is None else last + print(f"Checking epiweeks between {int(ew0)} and {int(ew1)}...") - # keep track of how many rows were added - rows_before = get_num_rows() + # keep track of how many rows were added + rows_before = get_num_rows() - # check Google Trends for new and/or revised data - sql = ''' + # check Google Trends for new and/or revised data + sql = """ INSERT INTO `ght` (`query`, `location`, `epiweek`, `value`) VALUES (%s, %s, %s, %s) ON DUPLICATE KEY UPDATE `value` = %s - ''' - total_rows = 0 - ght = GHT(API_KEY) - for term in terms: - print(' [%s] using term' % term) - ll, cl = len(locations), len(countries) - for i in range(max(ll,cl)): - location = locations[i] if i < ll else locations[0] - country = countries[i] if i < cl else countries[0] - try: - #term2 = ('"%s"' % term) if ' ' in term else term - term2 = term - attempt = 0 - while True: - attempt += 1 - try: - result = ght.get_data(ew0, ew1, location, term2, country=country) - break - except Exception as ex: - if attempt >= 5: - raise ex - else: - delay = 2 ** attempt - print(' [%s|%s] caught exception (will retry in %ds):' % (term, location, delay), ex) - time.sleep(delay) - values = [p['value'] for p in result['data']['lines'][0]['points']] - ew = result['start_week'] - num_missing = 0 - for v in values: - # Default SQL location value for US country for backwards compatibility - # i.e. California's location is still stored as 'CA', - # and having location == 'US' is still stored as 'US' - sql_location = location if location != NO_LOCATION_STR else country - - # Change SQL location for non-US countries - if country != 'US': - # Underscore added to distinguish countries from 2-letter US states - sql_location = country + "_" - if location != NO_LOCATION_STR: - sql_location = sql_location + location - sql_data = (term, sql_location, ew, v, v) - cur.execute(sql, sql_data) - total_rows += 1 - if v == 0: - num_missing += 1 - #print(' [%s|%s|%d] missing value' % (term, location, ew)) - ew = flu.add_epiweeks(ew, 1) - if num_missing > 0: - print(' [%s|%s] missing %d/%d value(s)' % (term, location, num_missing, len(values))) - except Exception as ex: - print(' [%s|%s] caught exception (will NOT retry):' % (term, location), ex) - - # keep track of how many rows were added - rows_after = get_num_rows() - print('Inserted %d/%d row(s)'%(rows_after - rows_before, total_rows)) - - # cleanup - cur.close() - cnx.commit() - cnx.close() + """ + total_rows = 0 + ght = GHT(API_KEY) + for term in terms: + print(f" [{term}] using term") + ll, cl = len(locations), len(countries) + for i in range(max(ll, cl)): + location = locations[i] if i < ll else locations[0] + country = countries[i] if i < cl else countries[0] + try: + # term2 = ('"%s"' % term) if ' ' in term else term + term2 = term + attempt = 0 + while True: + attempt += 1 + try: + result = ght.get_data(ew0, ew1, location, term2, country=country) + break + except Exception as ex: + if attempt >= 5: + raise ex + else: + delay = 2**attempt + print( + f" [{term}|{location}] caught exception (will retry in {int(delay)}s):", + ex, + ) + time.sleep(delay) + values = [p["value"] for p in result["data"]["lines"][0]["points"]] + ew = result["start_week"] + num_missing = 0 + for v in 
values: + # Default SQL location value for US country for backwards compatibility + # i.e. California's location is still stored as 'CA', + # and having location == 'US' is still stored as 'US' + sql_location = location if location != NO_LOCATION_STR else country + + # Change SQL location for non-US countries + if country != "US": + # Underscore added to distinguish countries from 2-letter US states + sql_location = country + "_" + if location != NO_LOCATION_STR: + sql_location = sql_location + location + sql_data = (term, sql_location, ew, v, v) + cur.execute(sql, sql_data) + total_rows += 1 + if v == 0: + num_missing += 1 + # print(' [%s|%s|%d] missing value' % (term, location, ew)) + ew = flu.add_epiweeks(ew, 1) + if num_missing > 0: + print(f" [{term}|{location}] missing {int(num_missing)}/{len(values)} value(s)") + except Exception as ex: + print(f" [{term}|{location}] caught exception (will NOT retry):", ex) + + # keep track of how many rows were added + rows_after = get_num_rows() + print(f"Inserted {int(rows_after - rows_before)}/{int(total_rows)} row(s)") + + # cleanup + cur.close() + cnx.commit() + cnx.close() def main(): - # args and usage - parser = argparse.ArgumentParser() - parser.add_argument('location', action='store', type=str, default=None, help='location(s) (ex: all; US; TX; CA,LA,WY)') - parser.add_argument('term', action='store', type=str, default=None, help='term/query/topic (ex: all; /m/0cycc; "flu fever")') - parser.add_argument('--first', '-f', default=None, type=int, help='first epiweek override') - parser.add_argument('--last', '-l', default=None, type=int, help='last epiweek override') - parser.add_argument('--country', '-c', default='US', type=str, help='location country (ex: US; BR)') - args = parser.parse_args() - - # sanity check - first, last = args.first, args.last - if first is not None: - flu.check_epiweek(first) - if last is not None: - flu.check_epiweek(last) - if first is not None and last is not None and first > last: - raise Exception('epiweeks in the wrong order') - - # decide what to update - if args.location.lower() == 'all': - locations = LOCATIONS - elif args.location.lower() == 'none': - locations = [NO_LOCATION_STR] - else: - locations = args.location.upper().split(',') - if args.term.lower() == 'all': - terms = TERMS - else: - terms = [args.term] - - # country argument - # Check that country follows ISO 1366 Alpha-2 code. - # See https://www.iso.org/obp/ui/#search. 
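# --- Illustrative aside, not part of the patch ---
# The retry loop in update() above caps attempts at 5 and sleeps 2**attempt
# seconds between tries (2, 4, 8, 16 s). The same pattern, factored into a
# hypothetical standalone helper (`with_retries` is not defined in this module):
import time

def with_retries(fn, max_attempts=5):
    for attempt in range(1, max_attempts + 1):
        try:
            return fn()
        except Exception:
            if attempt == max_attempts:
                raise
            time.sleep(2 ** attempt)  # exponential backoff: 2, 4, 8, 16 seconds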
- countries = args.country.upper().split(',') - if not all(map(lambda x: len(x) == 2, countries)): - raise Exception('country name must be two letters (ISO 1366 Alpha-2)') - - # if length of locations and countries is > 1, need to be the same - if len(locations) > 1 and len(countries) > 1 and len(locations) != len(countries): - raise Exception('locations and countries must be length 1, or same length') - - # run the update - update(locations, terms, first, last, countries) - - -if __name__ == '__main__': - main() + # args and usage + parser = argparse.ArgumentParser() + # fmt: off + parser.add_argument( + "location", + action="store", + type=str, + default=None, + help="location(s) (ex: all; US; TX; CA,LA,WY)" + ) + parser.add_argument( + "term", + action="store", + type=str, + default=None, + help='term/query/topic (ex: all; /m/0cycc; "flu fever")' + ) + parser.add_argument( + "--first", + "-f", + default=None, + type=int, + help="first epiweek override" + ) + parser.add_argument( + "--last", + "-l", + default=None, + type=int, + help="last epiweek override" + ) + parser.add_argument( + "--country", + "-c", + default="US", + type=str, + help="location country (ex: US; BR)" + ) + # fmt: on + args = parser.parse_args() + + # sanity check + first, last = args.first, args.last + if first is not None: + flu.check_epiweek(first) + if last is not None: + flu.check_epiweek(last) + if first is not None and last is not None and first > last: + raise Exception("epiweeks in the wrong order") + + # decide what to update + if args.location.lower() == "all": + locations = LOCATIONS + elif args.location.lower() == "none": + locations = [NO_LOCATION_STR] + else: + locations = args.location.upper().split(",") + if args.term.lower() == "all": + terms = TERMS + else: + terms = [args.term] + + # country argument + # Check that country follows ISO 1366 Alpha-2 code. + # See https://www.iso.org/obp/ui/#search. 
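# --- Illustrative aside, not part of the patch ---
# The standard referenced by the comment above is ISO 3166-1 alpha-2 (the
# in-code "ISO 1366" reads like a transposition). The patch only checks length;
# a slightly stricter sketch could also require alphabetic characters.
# `parse_countries` is a hypothetical helper, not defined in this module.
def parse_countries(arg):
    countries = arg.upper().split(",")
    if not all(len(c) == 2 and c.isalpha() for c in countries):
        raise Exception("country must be a two-letter ISO 3166-1 alpha-2 code")
    return countries
# e.g. parse_countries("us,br") -> ["US", "BR"]; parse_countries("USA") raises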
+ countries = args.country.upper().split(",") + if not all(map(lambda x: len(x) == 2, countries)): + raise Exception("country name must be two letters (ISO 1366 Alpha-2)") + + # if length of locations and countries is > 1, need to be the same + if len(locations) > 1 and len(countries) > 1 and len(locations) != len(countries): + raise Exception("locations and countries must be length 1, or same length") + + # run the update + update(locations, terms, first, last, countries) + + +if __name__ == "__main__": + main() diff --git a/src/acquisition/ght/google_health_trends.py b/src/acquisition/ght/google_health_trends.py index 66a11c227..4bb8df25f 100644 --- a/src/acquisition/ght/google_health_trends.py +++ b/src/acquisition/ght/google_health_trends.py @@ -1,4 +1,4 @@ -''' +""" =============== === Purpose === =============== @@ -18,7 +18,7 @@ + sample command line usage + extract array of values from returned data * separated GHT class from ght_update.py -''' +""" # standard library import argparse @@ -31,109 +31,144 @@ from delphi.utils.epidate import EpiDate import delphi.utils.epiweek as flu -NO_LOCATION_STR = 'none' +NO_LOCATION_STR = "none" + class GHT: - # Google Trends API endpoint - DISCOVERY_URL = 'https://www.googleapis.com/discovery/v1/apis/trends/v1beta/rest' - - def __init__(self, key, delay=1): - self.service = build('trends', 'v1beta', developerKey=key, discoveryServiceUrl=GHT.DISCOVERY_URL) - self.delay = delay - - # converts a YYYYWW week into a YYYY-MM-DD date (using Sunday of the week) - @staticmethod - def _ew2date(ew): - # parse the epiweek - year, week = flu.split_epiweek(ew) - # get the date object (middle of the week; Wednesday) - date = EpiDate.from_epiweek(year, week) - # go to the first day of the week (Sunday) - date = date.add_days(-3) - # date as string - return str(date) - - # get data from Google APIs - # see: https://developers.google.com/apis-explorer/#p/trends/v1beta/trends.getTimelinesForHealth - def get_data(self, start_week, end_week, location, term, resolution='week', country='US'): - start_date = GHT._ew2date(start_week) - end_date = GHT._ew2date(end_week) - num_weeks = flu.delta_epiweeks(start_week, end_week) + 1 - - # getTimelinesForHealth parameters - params = { - 'terms': term, - 'time_startDate': start_date, - 'time_endDate': end_date, - 'timelineResolution': resolution, - } - # We have a special check for the US for backwards compatibility. - # i.e. if the country is 'US' AND the location is 'US', just put the geo-restriction for country. - # In contrast, another country might have a sub-region with initials 'US' and we want the region restriction instead. 
- if country == 'US': - if location == 'US' or location == NO_LOCATION_STR: - params['geoRestriction_country'] = 'US' - else: - params['geoRestriction_region'] = 'US-' + location - else: - if location == NO_LOCATION_STR: - params['geoRestriction_country'] = country - else: - params['geoRestriction_region'] = country + '-' + location - - # make the API call - data = self.service.getTimelinesForHealth(**params).execute() - - # extract the values - try: - values = [p['value'] for p in data['lines'][0]['points']] - except: - values = None - - # throttle request rate - time.sleep(self.delay) - - # return the results - return { - 'start_week': start_week, - 'end_week': end_week, - 'num_weeks': num_weeks, - 'location': location, - 'country' : country, - 'term': term, - 'resolution': resolution, - 'data': data, - 'values': values, - } + # Google Trends API endpoint + DISCOVERY_URL = "https://www.googleapis.com/discovery/v1/apis/trends/v1beta/rest" + + def __init__(self, key, delay=1): + self.service = build( + "trends", "v1beta", developerKey=key, discoveryServiceUrl=GHT.DISCOVERY_URL + ) + self.delay = delay + + # converts a YYYYWW week into a YYYY-MM-DD date (using Sunday of the week) + @staticmethod + def _ew2date(ew): + # parse the epiweek + year, week = flu.split_epiweek(ew) + # get the date object (middle of the week; Wednesday) + date = EpiDate.from_epiweek(year, week) + # go to the first day of the week (Sunday) + date = date.add_days(-3) + # date as string + return str(date) + + # get data from Google APIs + # see: https://developers.google.com/apis-explorer/#p/trends/v1beta/trends.getTimelinesForHealth + def get_data(self, start_week, end_week, location, term, resolution="week", country="US"): + start_date = GHT._ew2date(start_week) + end_date = GHT._ew2date(end_week) + num_weeks = flu.delta_epiweeks(start_week, end_week) + 1 + + # getTimelinesForHealth parameters + params = { + "terms": term, + "time_startDate": start_date, + "time_endDate": end_date, + "timelineResolution": resolution, + } + # We have a special check for the US for backwards compatibility. + # i.e. if the country is 'US' AND the location is 'US', just put the geo-restriction for country. + # In contrast, another country might have a sub-region with initials 'US' and we want the region restriction instead. 
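# --- Illustrative aside, not part of the patch ---
# The branch below selects exactly one geo restriction for the Trends request:
# a whole-country restriction when no sub-location is given (or for the
# backwards-compatible US/US case), otherwise a "COUNTRY-REGION" restriction.
# Collapsed into a hypothetical helper; "none" mirrors NO_LOCATION_STR here.
def pick_geo_restriction(country, location, no_location="none"):
    # whole-country query: no sub-location, or the US/US backwards-compat case
    if location == no_location or (country == "US" and location == "US"):
        return {"geoRestriction_country": country}
    # otherwise restrict to a sub-region, e.g. "US-CA" or "BR-SP"
    return {"geoRestriction_region": f"{country}-{location}"}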
+ if country == "US": + if location == "US" or location == NO_LOCATION_STR: + params["geoRestriction_country"] = "US" + else: + params["geoRestriction_region"] = "US-" + location + else: + if location == NO_LOCATION_STR: + params["geoRestriction_country"] = country + else: + params["geoRestriction_region"] = country + "-" + location + + # make the API call + data = self.service.getTimelinesForHealth(**params).execute() + + # extract the values + try: + values = [p["value"] for p in data["lines"][0]["points"]] + except: + values = None + + # throttle request rate + time.sleep(self.delay) + + # return the results + return { + "start_week": start_week, + "end_week": end_week, + "num_weeks": num_weeks, + "location": location, + "country": country, + "term": term, + "resolution": resolution, + "data": data, + "values": values, + } def main(): - # args and usage - parser = argparse.ArgumentParser() - parser.add_argument('apikey', action='store', type=str, default=None, help='API key') - parser.add_argument('startweek', action='store', type=int, default=None, help='first week (ex: 201440)') - parser.add_argument('endweek', action='store', type=int, default=None, help='last week (ex: 201520)') - parser.add_argument('location', action='store', type=str, default=None, help='location (ex: US)') - parser.add_argument('term', action='store', type=str, default=None, help='term/query/topic (ex: /m/0cycc)') - args = parser.parse_args() - - # get the data - ght = GHT(args.apikey) - result = ght.get_data(args.startweek, args.endweek, args.location, args.term) - values = result['values'] - - # sanity check - expected_weeks = result['num_weeks'] - received_weeks = len([v for v in values if v is not None and type(v) == float and v >= 0]) - if expected_weeks != received_weeks: - raise Exception('expected %d weeks, received %d' % (expected_weeks, received_weeks)) - - # results - epiweeks = [ew for ew in flu.range_epiweeks(args.startweek, args.endweek, inclusive=True)] - for (epiweek, value) in zip(epiweeks, values): - print('%6d: %.3f' % (epiweek, value)) - - -if __name__ == '__main__': - main() + # args and usage + parser = argparse.ArgumentParser() + # fmt: off + parser.add_argument( + "apikey", + action="store", + type=str, + default=None, + help="API key" + ) + parser.add_argument( + "startweek", + action="store", + type=int, + default=None, + help="first week (ex: 201440)" + ) + parser.add_argument( + "endweek", + action="store", + type=int, + default=None, + help="last week (ex: 201520)" + ) + parser.add_argument( + "location", + action="store", + type=str, + default=None, + help="location (ex: US)" + ) + parser.add_argument( + "term", + action="store", + type=str, + default=None, + help="term/query/topic (ex: /m/0cycc)" + ) + # fmt: on + args = parser.parse_args() + + # get the data + ght = GHT(args.apikey) + result = ght.get_data(args.startweek, args.endweek, args.location, args.term) + values = result["values"] + + # sanity check + expected_weeks = result["num_weeks"] + received_weeks = len([v for v in values if v is not None and type(v) == float and v >= 0]) + if expected_weeks != received_weeks: + raise Exception(f"expected {int(expected_weeks)} weeks, received {int(received_weeks)}") + + # results + epiweeks = [ew for ew in flu.range_epiweeks(args.startweek, args.endweek, inclusive=True)] + for (epiweek, value) in zip(epiweeks, values): + print(f"{int(epiweek):6}: {value:.3f}") + + +if __name__ == "__main__": + main() diff --git a/src/acquisition/kcdc/kcdc_update.py 
b/src/acquisition/kcdc/kcdc_update.py index 70c167738..713b21f00 100644 --- a/src/acquisition/kcdc/kcdc_update.py +++ b/src/acquisition/kcdc/kcdc_update.py @@ -42,12 +42,14 @@ from delphi.utils.epiweek import delta_epiweeks, range_epiweeks, add_epiweeks from delphi.utils.epidate import EpiDate + def ensure_tables_exist(): - (u,p) = secrets.db.epi - cnx = mysql.connector.connect(user=u,password=p,database='epidata') + (u, p) = secrets.db.epi + cnx = mysql.connector.connect(user=u, password=p, database="epidata") try: cursor = cnx.cursor() - cursor.execute(''' + cursor.execute( + """ CREATE TABLE IF NOT EXISTS `kcdc_ili` ( `id` INT(11) NOT NULL PRIMARY KEY AUTO_INCREMENT, `release_date` DATE NOT NULL, @@ -58,69 +60,76 @@ def ensure_tables_exist(): `ili` DOUBLE NOT NULL, UNIQUE KEY (`issue`, `epiweek`, `region`) ); - '''); + """ + ) cnx.commit() finally: cnx.close() + def safe_float(f): try: - return float(f.replace(',','')) + return float(f.replace(",", "")) except: return 0 + def safe_int(i): try: - return int(i.replace(',','')) + return int(i.replace(",", "")) except: return 0 -def get_rows(cnx, table='kcdc_ili'): - # Count and return the number of rows in the `kcdc_ili` table. - select = cnx.cursor() - select.execute('SELECT count(1) num FROM %s' % table) - for (num,) in select: - pass - select.close() - return num + +def get_rows(cnx, table="kcdc_ili"): + # Count and return the number of rows in the `kcdc_ili` table. + select = cnx.cursor() + select.execute(f"SELECT count(1) num FROM {table}") + for (num,) in select: + pass + select.close() + return num + def get_kcdc_data(): issue = EpiDate.today().get_ew() - last_season = issue//100 + (1 if issue % 100 > 35 else 0) - url = 'http://www.cdc.go.kr/npt/biz/npp/iss/influenzaListAjax.do' + last_season = issue // 100 + (1 if issue % 100 > 35 else 0) + url = "https://www.cdc.go.kr/npt/biz/npp/iss/influenzaListAjax.do" + # Started in 2004 params = { - 'icdNm': 'influenza', - 'startYear': '2004', # Started in 2004 - 'endYear': str(last_season) + "icdNm": "influenza", + "startYear": "2004", + "endYear": str(last_season), } response = requests.post(url, params) datas = response.json() - data = datas['data'] + data = datas["data"] ews = [] ilis = [] ew1 = 200436 - for year in range(2004,last_season): - year_data = data[year-2004] + for year in range(2004, last_season): + year_data = data[year - 2004] if year > 2004: ew1 = ews[-1] + 1 - ili_yr = year_data["VALUE"].split('`') - ili_yr = [float(f) for f in ili_yr if f != ''] - ew2 = add_epiweeks(ew1,len(ili_yr)) - new_ews = list(range_epiweeks(ew1,ew2)) + ili_yr = year_data["VALUE"].split("`") + ili_yr = [float(f) for f in ili_yr if f != ""] + ew2 = add_epiweeks(ew1, len(ili_yr)) + new_ews = list(range_epiweeks(ew1, ew2)) for i in range(len(new_ews)): j = float(ili_yr[i]) ilis.append(j) ews.append(new_ews[i]) return ews, ilis + def update_from_data(ews, ilis, date, issue, test_mode=False): u, p = secrets.db.epi - cnx = mysql.connector.connect(user=u, password=p, database='epidata') + cnx = mysql.connector.connect(user=u, password=p, database="epidata") rows1 = get_rows(cnx) - print('rows before: %d' % (rows1)) + print(f"rows before: {int(rows1)}") insert = cnx.cursor() - sql = ''' + sql = """ INSERT INTO `kcdc_ili` (`release_date`, `issue`, `epiweek`, `region`, `lag`, `ili`) @@ -129,15 +138,15 @@ def update_from_data(ews, ilis, date, issue, test_mode=False): ON DUPLICATE KEY UPDATE `release_date` = least(`release_date`, '%s'), `ili` = %s - ''' + """ for i in range(len(ews)): ew = ews[i] ili = ilis[i] 
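# --- Illustrative aside, not part of the patch ---
# `lag` below is the number of epiweeks between the week an observation
# describes (`ew`) and the week the data was issued (`issue`). A rough,
# year-boundary-ignoring equivalent of delta_epiweeks, for intuition only:
def naive_lag(epiweek, issue):
    # e.g. naive_lag(202001, 202004) == 3; the real code uses
    # delphi.utils.epiweek.delta_epiweeks, which also handles 52/53-week years
    return (issue // 100 - epiweek // 100) * 52 + issue % 100 - epiweek % 100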
lag = delta_epiweeks(ews[i], issue) - insert_args = [date,issue,ew,'ROK',lag,ili] - update_args = [date,ili] + insert_args = [date, issue, ew, "ROK", lag, ili] + update_args = [date, ili] try: insert.execute(sql % tuple(insert_args + update_args)) except Exception: @@ -146,34 +155,33 @@ def update_from_data(ews, ilis, date, issue, test_mode=False): # cleanup insert.close() if test_mode: - print('test mode, not committing') + print("test mode, not committing") rows2 = rows1 else: cnx.commit() rows2 = get_rows(cnx) - print('rows after: %d (added %d)' % (rows2,rows2-rows1)) + print(f"rows after: {int(rows2)} (added {int(rows2 - rows1)})") cnx.close() + def main(): # args and usage parser = argparse.ArgumentParser() parser.add_argument( - '--test', - action='store_true', - help='do dry run only, do not update the database' + "--test", action="store_true", help="do dry run only, do not update the database" ) args = parser.parse_args() - date = datetime.datetime.now().strftime('%Y-%m-%d') - print('assuming release date is today, %s' % date) + date = datetime.datetime.now().strftime("%Y-%m-%d") + print(f"assuming release date is today, {date}") issue = EpiDate.today().get_ew() ensure_tables_exist() - ews,ilis = get_kcdc_data() + ews, ilis = get_kcdc_data() update_from_data(ews, ilis, date, issue, test_mode=args.test) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/src/acquisition/nidss/taiwan_nidss.py b/src/acquisition/nidss/taiwan_nidss.py index 27da863e1..b2e369e63 100644 --- a/src/acquisition/nidss/taiwan_nidss.py +++ b/src/acquisition/nidss/taiwan_nidss.py @@ -4,7 +4,7 @@ =============== Scrapes weekly flu data from Taiwan's National Infectious Disease Statistics -System (NIDSS): http://nidss.cdc.gov.tw/en/ +System (NIDSS): https://nidss.cdc.gov.tw/en/ ================= @@ -37,233 +37,234 @@ class NIDSS: - """An API for scraping the NIDSS site.""" + """An API for scraping the NIDSS site.""" - # The page where the flu data is kept - FLU_URL = 'https://nidss.cdc.gov.tw/en/CDCWNH01.aspx?dc=wnh' + # The page where the flu data is kept + FLU_URL = "https://nidss.cdc.gov.tw/en/CDCWNH01.aspx?dc=wnh" - # Link to the dengue data - DENGUE_URL = 'http://nidss.cdc.gov.tw/Download/Weekly_Age_County_Gender_061.csv' + # Link to the dengue data + DENGUE_URL = "https://nidss.cdc.gov.tw/Download/Weekly_Age_County_Gender_061.csv" - # Translate location names to English - # https://en.wikipedia.org/wiki/List_of_administrative_divisions_of_Taiwan - _TRANSLATED = { - b'5Y2X5oqV57ij': 'Nantou_County', - b'5Y+w5Lit5biC': 'Taichung_City', - b'5Y+w5YyX5biC': 'Taipei_City', - b'5Y+w5Y2X5biC': 'Tainan_City', - b'5Y+w5p2x57ij': 'Taitung_County', - b'5ZiJ576p5biC': 'Chiayi_City', - b'5ZiJ576p57ij': 'Chiayi_County', - b'5Z+66ZqG5biC': 'Keelung_City', - b'5a6c6Jit57ij': 'Yilan_County', - b'5bGP5p2x57ij': 'Pingtung_County', - b'5b2w5YyW57ij': 'Changhua_County', - b'5paw5YyX5biC': 'New_Taipei_City', - b'5paw56u55biC': 'Hsinchu_City', - b'5paw56u557ij': 'Hsinchu_County', - b'5qGD5ZyS5biC': 'Taoyuan_City', - b'5r6O5rmW57ij': 'Penghu_County', - b'6Iqx6JOu57ij': 'Hualien_County', - b'6IuX5qCX57ij': 'Miaoli_County', - b'6YeR6ZaA57ij': 'Kinmen_County', - b'6Zuy5p6X57ij': 'Yunlin_County', - b'6auY6ZuE5biC': 'Kaohsiung_City', - b'6YCj5rGf57ij': 'Lienchiang_County', - } + # Translate location names to English + # https://en.wikipedia.org/wiki/List_of_administrative_divisions_of_Taiwan + _TRANSLATED = { + b"5Y2X5oqV57ij": "Nantou_County", + b"5Y+w5Lit5biC": "Taichung_City", + b"5Y+w5YyX5biC": "Taipei_City", 
+ b"5Y+w5Y2X5biC": "Tainan_City", + b"5Y+w5p2x57ij": "Taitung_County", + b"5ZiJ576p5biC": "Chiayi_City", + b"5ZiJ576p57ij": "Chiayi_County", + b"5Z+66ZqG5biC": "Keelung_City", + b"5a6c6Jit57ij": "Yilan_County", + b"5bGP5p2x57ij": "Pingtung_County", + b"5b2w5YyW57ij": "Changhua_County", + b"5paw5YyX5biC": "New_Taipei_City", + b"5paw56u55biC": "Hsinchu_City", + b"5paw56u557ij": "Hsinchu_County", + b"5qGD5ZyS5biC": "Taoyuan_City", + b"5r6O5rmW57ij": "Penghu_County", + b"6Iqx6JOu57ij": "Hualien_County", + b"6IuX5qCX57ij": "Miaoli_County", + b"6YeR6ZaA57ij": "Kinmen_County", + b"6Zuy5p6X57ij": "Yunlin_County", + b"6auY6ZuE5biC": "Kaohsiung_City", + b"6YCj5rGf57ij": "Lienchiang_County", + } - # Map locations to regions - # https://en.wikipedia.org/wiki/List_of_administrative_divisions_of_Taiwan - # https://en.wikipedia.org/wiki/Regions_of_Taiwan#Hexchotomy - LOCATION_TO_REGION = { - # Taipei - 'Taipei_City': 'Taipei', - 'Keelung_City': 'Taipei', - 'New_Taipei_City': 'Taipei', - 'Yilan_County': 'Taipei', - 'Kinmen_County': 'Taipei', - 'Lienchiang_County': 'Taipei', - # Northern - 'Hsinchu_City': 'Northern', - 'Taoyuan_City': 'Northern', - 'Hsinchu_County': 'Northern', - 'Miaoli_County': 'Northern', - # Central - 'Taichung_City': 'Central', - 'Changhua_County': 'Central', - 'Nantou_County': 'Central', - # Southern - 'Tainan_City': 'Southern', - 'Chiayi_City': 'Southern', - 'Yunlin_County': 'Southern', - 'Chiayi_County': 'Southern', - # Kaoping - 'Kaohsiung_City': 'Kaoping', - 'Pingtung_County': 'Kaoping', - 'Penghu_County': 'Kaoping', - # Eastern - 'Hualien_County': 'Eastern', - 'Taitung_County': 'Eastern', - } + # Map locations to regions + # https://en.wikipedia.org/wiki/List_of_administrative_divisions_of_Taiwan + # https://en.wikipedia.org/wiki/Regions_of_Taiwan#Hexchotomy + LOCATION_TO_REGION = { + # Taipei + "Taipei_City": "Taipei", + "Keelung_City": "Taipei", + "New_Taipei_City": "Taipei", + "Yilan_County": "Taipei", + "Kinmen_County": "Taipei", + "Lienchiang_County": "Taipei", + # Northern + "Hsinchu_City": "Northern", + "Taoyuan_City": "Northern", + "Hsinchu_County": "Northern", + "Miaoli_County": "Northern", + # Central + "Taichung_City": "Central", + "Changhua_County": "Central", + "Nantou_County": "Central", + # Southern + "Tainan_City": "Southern", + "Chiayi_City": "Southern", + "Yunlin_County": "Southern", + "Chiayi_County": "Southern", + # Kaoping + "Kaohsiung_City": "Kaoping", + "Pingtung_County": "Kaoping", + "Penghu_County": "Kaoping", + # Eastern + "Hualien_County": "Eastern", + "Taitung_County": "Eastern", + } - @staticmethod - def _get_metadata(html): - issue_pattern = re.compile('^.*Latest available data: Week (\\d+), (\\d{4})\\..*$') - release_pattern = re.compile('^.*Data as of \\d+:\\d+:\\d+, (\\d{4})/(\\d{2})/(\\d{2})\\..*$') - issue, release = None, None - for line in html.split('\n'): - match = issue_pattern.match(line) - if match is not None: - year, week = int(match.group(2)), int(match.group(1)) - issue = year * 100 + week - match = release_pattern.match(line) - if match is not None: - year, month, day = int(match.group(1)), int(match.group(2)), int(match.group(3)) - release = '%04d-%02d-%02d' % (year, month, day) - if issue is None or release is None: - raise Exception('metadata not found') - return issue, release + @staticmethod + def _get_metadata(html): + issue_pattern = re.compile("^.*Latest available data: Week (\\d+), (\\d{4})\\..*$") + release_pattern = re.compile( + "^.*Data as of \\d+:\\d+:\\d+, (\\d{4})/(\\d{2})/(\\d{2})\\..*$" + ) + issue, release = None, 
None + for line in html.split("\n"): + match = issue_pattern.match(line) + if match is not None: + year, week = int(match.group(2)), int(match.group(1)) + issue = year * 100 + week + match = release_pattern.match(line) + if match is not None: + year, month, day = int(match.group(1)), int(match.group(2)), int(match.group(3)) + release = f"{int(year):04}-{int(month):02}-{int(day):02}" + if issue is None or release is None: + raise Exception("metadata not found") + return issue, release - @staticmethod - def _get_flu_data(html): - week_pattern = re.compile('^categories: \\[(.*)\\],$') - value_pattern = re.compile('^series: \\[(.*)\\],$') - data = {} - parsing_ili = True - for line in html.split('\n'): - line = line.strip() - match = week_pattern.match(line) - if match is not None: - weeks = [int(x[1:-1]) for x in match.group(1).split(',')] - for week in weeks: - check_epiweek(week) - if week not in data: - data[week] = {} - match = value_pattern.match(line) - if match is not None: - for item in match.group(1).split('},{'): - parts = item.replace('{', '').replace('}', '').strip().split(' ') - location = parts[1][1:-2] - def num(value): - if parsing_ili: - return float(value) - else: - if '.' in value: - raise Exception('expected type int for visits') - return int(value) - values = [num(x) for x in parts[3][1:-1].split(',')] - unit = 'ili' if parsing_ili else 'visits' - if len(weeks) != len(values): - raise Exception('len(weeks) != len(values)') - for week, value in zip(weeks, values): - if location not in data[week]: - data[week][location] = {} - data[week][location][unit] = value - parsing_ili = False - if len(data) == 0: - raise Exception('no data') - return data + @staticmethod + def _get_flu_data(html): + week_pattern = re.compile("^categories: \\[(.*)\\],$") + value_pattern = re.compile("^series: \\[(.*)\\],$") + data = {} + parsing_ili = True + for line in html.split("\n"): + line = line.strip() + match = week_pattern.match(line) + if match is not None: + weeks = [int(x[1:-1]) for x in match.group(1).split(",")] + for week in weeks: + check_epiweek(week) + if week not in data: + data[week] = {} + match = value_pattern.match(line) + if match is not None: + for item in match.group(1).split("},{"): + parts = item.replace("{", "").replace("}", "").strip().split(" ") + location = parts[1][1:-2] + + def num(value): + if parsing_ili: + return float(value) + else: + if "." 
in value: + raise Exception("expected type int for visits") + return int(value) - @staticmethod - def get_flu_data(): - # Fetch the flu page - response = requests.get(NIDSS.FLU_URL) - if response.status_code != 200: - raise Exception('request failed [%d]' % response.status_code) - html = response.text - # Parse metadata - latest_week, release_date = NIDSS._get_metadata(html) - # Parse flu data - data = NIDSS._get_flu_data(html) - # Return results indexed by week and location - return latest_week, release_date, data + values = [num(x) for x in parts[3][1:-1].split(",")] + unit = "ili" if parsing_ili else "visits" + if len(weeks) != len(values): + raise Exception("len(weeks) != len(values)") + for week, value in zip(weeks, values): + if location not in data[week]: + data[week][location] = {} + data[week][location][unit] = value + parsing_ili = False + if len(data) == 0: + raise Exception("no data") + return data - @staticmethod - def get_dengue_data(first_week, last_week): - # Check week order - if first_week > last_week: - first_week, last_week = last_week, first_week - # Bounds check - if first_week < 200301 or last_week < 200301: - raise Exception('week out of range') - # Initialize data by week and location (zeroes are not reported) - data = {} - for week in range_epiweeks(first_week, add_epiweeks(last_week, 1)): - data[week] = {} - for location in NIDSS.LOCATION_TO_REGION.keys(): - data[week][location] = 0 - # Download CSV - response = requests.get(NIDSS.DENGUE_URL) - if response.status_code != 200: - raise Exception('export Dengue failed [%d]' % response.status_code) - csv = response.content.decode('big5-tw') - # Parse the data - lines = [l.strip() for l in csv.split('\n')[1:] if l.strip() != ''] - for line in lines: - fields = line.split(',') - location_b64 = base64.b64encode(fields[3].encode('utf-8')) - location = NIDSS._TRANSLATED[location_b64] - # Fields currently unused: - # region = NIDSS.LOCATION_TO_REGION[location] - # imported_b64 = base64.b64encode(fields[6].encode('utf-8')) - # imported = imported_b64 == b'5piv' - # sex = fields[5] - # age = fields[7] - count = int(fields[8]) - year = int(fields[1]) - week = int(fields[2]) - # Week 53 was reported each year in 2003-2007 - if year < 2008 and year != 2003 and week > 52: - week = 52 - # Epiweek system change in 2009 - # See also: http://research.undefinedx.com/forum/index.php?topic=300.0 - if year == 2009: - week -= 1 - if week == 0: - year, week = 2008, 53 - epiweek = year * 100 + week - if epiweek < first_week or epiweek > last_week: - # Outside of the requested range - continue - if epiweek not in data or location not in data[epiweek]: - # Not a vaild U.S. 
epiweek - raise Exception('data missing %d-%s' % (epiweek, location)) - # Add the counts to the location on this epiweek - data[epiweek][location] += count - # Return results indexed by week and location - return data + @staticmethod + def get_flu_data(): + # Fetch the flu page + response = requests.get(NIDSS.FLU_URL) + if response.status_code != 200: + raise Exception(f"request failed [{int(response.status_code)}]") + html = response.text + # Parse metadata + latest_week, release_date = NIDSS._get_metadata(html) + # Parse flu data + data = NIDSS._get_flu_data(html) + # Return results indexed by week and location + return latest_week, release_date, data + + @staticmethod + def get_dengue_data(first_week, last_week): + # Check week order + if first_week > last_week: + first_week, last_week = last_week, first_week + # Bounds check + if first_week < 200301 or last_week < 200301: + raise Exception("week out of range") + # Initialize data by week and location (zeroes are not reported) + data = {} + for week in range_epiweeks(first_week, add_epiweeks(last_week, 1)): + data[week] = {} + for location in NIDSS.LOCATION_TO_REGION.keys(): + data[week][location] = 0 + # Download CSV + response = requests.get(NIDSS.DENGUE_URL) + if response.status_code != 200: + raise Exception(f"export Dengue failed [{int(response.status_code)}]") + csv = response.content.decode("big5-tw") + # Parse the data + lines = [l.strip() for l in csv.split("\n")[1:] if l.strip() != ""] + for line in lines: + fields = line.split(",") + location_b64 = base64.b64encode(fields[3].encode("utf-8")) + location = NIDSS._TRANSLATED[location_b64] + # Fields currently unused: + # region = NIDSS.LOCATION_TO_REGION[location] + # imported_b64 = base64.b64encode(fields[6].encode('utf-8')) + # imported = imported_b64 == b'5piv' + # sex = fields[5] + # age = fields[7] + count = int(fields[8]) + year = int(fields[1]) + week = int(fields[2]) + # Week 53 was reported each year in 2003-2007 + if year < 2008 and year != 2003 and week > 52: + week = 52 + # Epiweek system change in 2009 + # See also: https://research.undefinedx.com/forum/index.php?topic=300.0 + if year == 2009: + week -= 1 + if week == 0: + year, week = 2008, 53 + epiweek = year * 100 + week + if epiweek < first_week or epiweek > last_week: + # Outside of the requested range + continue + if epiweek not in data or location not in data[epiweek]: + # Not a vaild U.S. 
epiweek + raise Exception(f"data missing {int(epiweek)}-{location}") + # Add the counts to the location on this epiweek + data[epiweek][location] += count + # Return results indexed by week and location + return data def main(): - # Args and usage - parser = argparse.ArgumentParser() - parser.add_argument( - 'epiweek', - action='store', - type=int, - help='fetch data on this epiweek (ex: 201537)' - ) - args = parser.parse_args() - ew = args.epiweek + # Args and usage + parser = argparse.ArgumentParser() + parser.add_argument( + "epiweek", action="store", type=int, help="fetch data on this epiweek (ex: 201537)" + ) + args = parser.parse_args() + ew = args.epiweek - # Get the data - latest_week, release_date, fdata = NIDSS.get_flu_data() - ddata = NIDSS.get_dengue_data(ew, ew) + # Get the data + latest_week, release_date, fdata = NIDSS.get_flu_data() + ddata = NIDSS.get_dengue_data(ew, ew) - # Print the results - print('*** Meta ***') - print('latest_week:', latest_week) - print('release_date:', release_date) - print('*** Flu ***') - for region in sorted(list(fdata[ew].keys())): - visits, ili = fdata[ew][region]['visits'], fdata[ew][region]['ili'] - print('region=%s | visits=%d | ili=%.3f' % (region, visits, ili)) - print('*** Dengue ***') - for location in sorted(list(ddata[ew].keys())): - region = NIDSS.LOCATION_TO_REGION[location] - count = ddata[ew][location] - print('location=%s | region=%s | count=%d' % (location, region, count)) + # Print the results + print("*** Meta ***") + print("latest_week:", latest_week) + print("release_date:", release_date) + print("*** Flu ***") + for region in sorted(list(fdata[ew].keys())): + visits, ili = fdata[ew][region]["visits"], fdata[ew][region]["ili"] + print(f"region={region} | visits={int(visits)} | ili={ili:.3f}") + print("*** Dengue ***") + for location in sorted(list(ddata[ew].keys())): + region = NIDSS.LOCATION_TO_REGION[location] + count = ddata[ew][location] + print(f"location={location} | region={region} | count={int(count)}") -if __name__ == '__main__': - main() +if __name__ == "__main__": + main() diff --git a/src/acquisition/nidss/taiwan_update.py b/src/acquisition/nidss/taiwan_update.py index 830a7738d..30d458481 100644 --- a/src/acquisition/nidss/taiwan_update.py +++ b/src/acquisition/nidss/taiwan_update.py @@ -87,92 +87,88 @@ # Get a row count just to know how many new rows are inserted def get_rows(cnx): - select = cnx.cursor() - select.execute('SELECT count(1) num FROM nidss_flu') - for (num,) in select: - rows_flu = num - select.execute('SELECT count(1) num FROM nidss_dengue') - for (num,) in select: - rows_dengue = num - select.close() - return (rows_flu, rows_dengue) + select = cnx.cursor() + select.execute("SELECT count(1) num FROM nidss_flu") + for (num,) in select: + rows_flu = num + select.execute("SELECT count(1) num FROM nidss_dengue") + for (num,) in select: + rows_dengue = num + select.close() + return (rows_flu, rows_dengue) def update(test_mode=False): - # test mode - if test_mode: - print('test mode enabled: changes will not be saved') - - # Database connection - u, p = secrets.db.epi - cnx = mysql.connector.connect(user=u, password=p, database='epidata') - rows1 = get_rows(cnx) - print('rows before (flu): %d' % (rows1[0])) - print('rows before (dengue): %d' % (rows1[1])) - insert = cnx.cursor() - sql_flu = ''' - INSERT INTO - `nidss_flu` (`release_date`, `issue`, `epiweek`, `region`, `lag`, `visits`, `ili`) - VALUES - (%s, %s, %s, %s, %s, %s, %s) - ON DUPLICATE KEY UPDATE - `release_date` = least(`release_date`, %s), 
`visits` = %s, `ili` = %s - ''' - sql_dengue = ''' - INSERT INTO - `nidss_dengue` (`epiweek`, `location`, `region`, `count`) - VALUES - (%s, %s, %s, %s) - ON DUPLICATE KEY UPDATE - `count` = %s - ''' - - # Scrape flu data - current_week, release_date, data = NIDSS.get_flu_data() - for epiweek in sorted(list(data.keys())): - lag = delta_epiweeks(epiweek, current_week) - for region in data[epiweek].keys(): - visits, ili = data[epiweek][region]['visits'], data[epiweek][region]['ili'] - params1 = [release_date, current_week, epiweek, region, lag, visits, ili] - params2 = [release_date, visits, ili] - insert.execute(sql_flu, tuple(params1 + params2)) - - # Scrape dengue data from the past year - data = NIDSS.get_dengue_data(add_epiweeks(current_week, -51), current_week) - for epiweek in sorted(list(data.keys())): - for location in sorted(list(data[epiweek].keys())): - region = NIDSS.LOCATION_TO_REGION[location] - count = data[epiweek][location] - params = (epiweek, location, region, count, count) - insert.execute(sql_dengue, params) - - # Cleanup - insert.close() - rows2 = get_rows(cnx) - print('rows after (flu): %d (added %d)' % (rows2[0], rows2[0] - rows1[0])) - print('rows after (dengue): %d (added %d)' % (rows2[1], rows2[1] - rows1[1])) - if test_mode: - print('test mode: changes not commited') - else: - cnx.commit() - cnx.close() + # test mode + if test_mode: + print("test mode enabled: changes will not be saved") + + # Database connection + u, p = secrets.db.epi + cnx = mysql.connector.connect(user=u, password=p, database="epidata") + rows1 = get_rows(cnx) + print(f"rows before (flu): {int(rows1[0])}") + print(f"rows before (dengue): {int(rows1[1])}") + insert = cnx.cursor() + sql_flu = """ + INSERT INTO + `nidss_flu` (`release_date`, `issue`, `epiweek`, `region`, `lag`, `visits`, `ili`) + VALUES + (%s, %s, %s, %s, %s, %s, %s) + ON DUPLICATE KEY UPDATE + `release_date` = least(`release_date`, %s), `visits` = %s, `ili` = %s + """ + sql_dengue = """ + INSERT INTO + `nidss_dengue` (`epiweek`, `location`, `region`, `count`) + VALUES + (%s, %s, %s, %s) + ON DUPLICATE KEY UPDATE + `count` = %s + """ + + # Scrape flu data + current_week, release_date, data = NIDSS.get_flu_data() + for epiweek in sorted(list(data.keys())): + lag = delta_epiweeks(epiweek, current_week) + for region in data[epiweek].keys(): + visits, ili = data[epiweek][region]["visits"], data[epiweek][region]["ili"] + params1 = [release_date, current_week, epiweek, region, lag, visits, ili] + params2 = [release_date, visits, ili] + insert.execute(sql_flu, tuple(params1 + params2)) + + # Scrape dengue data from the past year + data = NIDSS.get_dengue_data(add_epiweeks(current_week, -51), current_week) + for epiweek in sorted(list(data.keys())): + for location in sorted(list(data[epiweek].keys())): + region = NIDSS.LOCATION_TO_REGION[location] + count = data[epiweek][location] + params = (epiweek, location, region, count, count) + insert.execute(sql_dengue, params) + + # Cleanup + insert.close() + rows2 = get_rows(cnx) + print(f"rows after (flu): {int(rows2[0])} (added {int(rows2[0] - rows1[0])})") + print(f"rows after (dengue): {int(rows2[1])} (added {int(rows2[1] - rows1[1])})") + if test_mode: + print("test mode: changes not commited") + else: + cnx.commit() + cnx.close() def main(): - # args and usage - parser = argparse.ArgumentParser() - parser.add_argument( - '--test', - '-t', - action='store_true', - default=False, - help='test mode, do not commit changes' - ) - args = parser.parse_args() - - # fetch and store NIDSS data - 
update(args.test) - - -if __name__ == '__main__': - main() + # args and usage + parser = argparse.ArgumentParser() + parser.add_argument( + "--test", "-t", action="store_true", default=False, help="test mode, do not commit changes" + ) + args = parser.parse_args() + + # fetch and store NIDSS data + update(args.test) + + +if __name__ == "__main__": + main() diff --git a/src/acquisition/paho/paho_db_update.py b/src/acquisition/paho/paho_db_update.py index d07885f79..b351d3ff2 100644 --- a/src/acquisition/paho/paho_db_update.py +++ b/src/acquisition/paho/paho_db_update.py @@ -50,9 +50,8 @@ import csv import datetime import glob -import subprocess -import random from io import StringIO +import tempfile # third party import mysql.connector @@ -64,12 +63,14 @@ from delphi.utils.epiweek import delta_epiweeks, check_epiweek from delphi.utils.epidate import EpiDate + def ensure_tables_exist(): - (u,p) = secrets.db.epi - cnx = mysql.connector.connect(user=u,password=p,database='epidata') + (u, p) = secrets.db.epi + cnx = mysql.connector.connect(user=u, password=p, database="epidata") try: cursor = cnx.cursor() - cursor.execute(''' + cursor.execute( + """ CREATE TABLE IF NOT EXISTS `paho_dengue` ( `id` INT(11) NOT NULL PRIMARY KEY AUTO_INCREMENT, `release_date` DATE NOT NULL, @@ -85,35 +86,44 @@ def ensure_tables_exist(): `num_deaths` INT(11) NOT NULL, UNIQUE KEY (`issue`, `epiweek`, `region`) ); - '''); + """ + ) cnx.commit() finally: cnx.close() + def safe_float(f): try: - return float(f.replace(',','')) + return float(f.replace(",", "")) except: return 0 + def safe_int(i): try: - return int(i.replace(',','')) + return int(i.replace(",", "")) except: return 0 -def get_rows(cnx, table='paho_dengue'): - # Count and return the number of rows in the `fluview` table. - select = cnx.cursor() - select.execute('SELECT count(1) num FROM %s' % table) - for (num,) in select: - pass - select.close() - return num + +def get_rows(cnx, table="paho_dengue"): + # Count and return the number of rows in the `fluview` table. 
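# --- Illustrative aside, not part of the patch ---
# The comment above still mentions `fluview`, but the default table here is
# `paho_dengue`. The helper interpolates the table name into the SQL string,
# which is fine for the fixed names used in this repo; a defensive variant
# (hypothetical, not in the patch) could whitelist the name first:
def count_rows(cnx, table="paho_dengue", allowed=("paho_dengue",)):
    if table not in allowed:
        raise ValueError(f"unexpected table name: {table}")
    cur = cnx.cursor()
    cur.execute(f"SELECT count(1) num FROM {table}")
    (num,) = cur.fetchone()
    cur.close()
    return num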
+ select = cnx.cursor() + select.execute(f"SELECT count(1) num FROM {table}") + for (num,) in select: + pass + select.close() + return num + def get_paho_row(row): - if row[0] == "\ufeffIncidence Rate (c)" and row != "\ufeffIncidence Rate (c),(SD/D) x100 (e),CFR (f),ID,Country or Subregion,Deaths,EW,Confirmed,Epidemiological Week (a),Pop (no usar),Serotype,Severe Dengue (d),Total of Dengue Cases (b),Year,Population x 1000".split(","): - raise Exception('PAHO header row has changed') + if row[ + 0 + ] == "\ufeffIncidence Rate (c)" and row != "\ufeffIncidence Rate (c),(SD/D) x100 (e),CFR (f),ID,Country or Subregion,Deaths,EW,Confirmed,Epidemiological Week (a),Pop (no usar),Serotype,Severe Dengue (d),Total of Dengue Cases (b),Year,Population x 1000".split( + "," + ): + raise Exception("PAHO header row has changed") if len(row) == 1 or row[0] == "Incidence Rate (c)": # this is a header row return None @@ -128,23 +138,26 @@ def get_paho_row(row): except: return None try: - check_epiweek(safe_int(row[13])*100 + safe_int(row[8]), safe_int(row[13])*100 + safe_int(row[6])) + check_epiweek( + safe_int(row[13]) * 100 + safe_int(row[8]), safe_int(row[13]) * 100 + safe_int(row[6]) + ) except: return None return { - 'issue': safe_int(row[13])*100 + safe_int(row[6]), - 'epiweek': safe_int(row[13])*100 + safe_int(row[8]), - 'region': country, - 'total_pop': safe_int(row[14]), - 'serotype': row[10], - 'num_dengue': safe_int(row[12]), - 'incidence_rate': safe_float(row[0]), - 'num_severe': safe_int(row[11]), - 'num_deaths': safe_int(row[5]), - 'severe_ratio': safe_float(row[1]), - 'cfr': safe_float(row[2]) + "issue": safe_int(row[13]) * 100 + safe_int(row[6]), + "epiweek": safe_int(row[13]) * 100 + safe_int(row[8]), + "region": country, + "total_pop": safe_int(row[14]), + "serotype": row[10], + "num_dengue": safe_int(row[12]), + "incidence_rate": safe_float(row[0]), + "num_severe": safe_int(row[11]), + "num_deaths": safe_int(row[5]), + "severe_ratio": safe_float(row[1]), + "cfr": safe_float(row[2]), } + def update_from_file(issue, date, filename, test_mode=False): # Read PAHO data from CSV and insert into (or update) the database. 
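# --- Illustrative aside, not part of the patch ---
# get_paho_row above returns None for header or unparseable rows and a dict for
# data rows, so the loading step in update_from_file reduces to a
# parse-then-filter pass over the CSV text. A self-contained sketch
# (`parse_paho_csv` is a hypothetical name, not defined in this module):
import csv
from io import StringIO

def parse_paho_csv(text):
    rows = [get_paho_row(fields) for fields in csv.reader(StringIO(text), delimiter=",")]
    return [row for row in rows if row]  # drop headers and rows that failed to parse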
@@ -156,23 +169,23 @@ def update_from_file(issue, date, filename, test_mode=False): # database connection u, p = secrets.db.epi - cnx = mysql.connector.connect(user=u, password=p, database='epidata') - rows1 = get_rows(cnx, 'paho_dengue') - print('rows before: %d' % (rows1)) + cnx = mysql.connector.connect(user=u, password=p, database="epidata") + rows1 = get_rows(cnx, "paho_dengue") + print(f"rows before: {int(rows1)}") insert = cnx.cursor() # load the data, ignoring empty rows - print('loading data from %s as issued on %d' % (filename, issue)) - with open(filename,'r',encoding='utf-8') as f: + print(f"loading data from {filename} as issued on {int(issue)}") + with open(filename, encoding="utf-8") as f: c = f.read() rows = [] - for l in csv.reader(StringIO(c), delimiter=','): + for l in csv.reader(StringIO(c), delimiter=","): rows.append(get_paho_row(l)) - print(' loaded %d rows' % len(rows)) + print(f" loaded {len(rows)} rows") entries = [obj for obj in rows if obj] - print(' found %d entries' % len(entries)) + print(f" found {len(entries)} entries") - sql = ''' + sql = """ INSERT INTO `paho_dengue` (`release_date`, `issue`, `epiweek`, `region`, `lag`, `total_pop`, `serotype`, `num_dengue`, `incidence_rate`, @@ -187,55 +200,64 @@ def update_from_file(issue, date, filename, test_mode=False): `incidence_rate` = %s, `num_severe` = %s, `num_deaths` = %s - ''' + """ for row in entries: - if row['issue'] > issue: # Issued in a week that hasn't happened yet + if row["issue"] > issue: # Issued in a week that hasn't happened yet continue - lag = delta_epiweeks(row['epiweek'], issue) - data_args = [row['total_pop'], row['serotype'], row['num_dengue'], - row['incidence_rate'], row['num_severe'], row['num_deaths']] + lag = delta_epiweeks(row["epiweek"], issue) + data_args = [ + row["total_pop"], + row["serotype"], + row["num_dengue"], + row["incidence_rate"], + row["num_severe"], + row["num_deaths"], + ] - insert_args = [date,issue,row['epiweek'],row['region'],lag] + data_args + insert_args = [date, issue, row["epiweek"], row["region"], lag] + data_args update_args = [date] + data_args insert.execute(sql % tuple(insert_args + update_args)) # cleanup insert.close() if test_mode: - print('test mode, not committing') + print("test mode, not committing") rows2 = rows1 else: cnx.commit() rows2 = get_rows(cnx) - print('rows after: %d (added %d)' % (rows2,rows2-rows1)) + print(f"rows after: {int(rows2)} (added {int(rows2 - rows1)})") cnx.close() + def main(): # args and usage parser = argparse.ArgumentParser() + # fmt: off parser.add_argument( - '--test', - action='store_true', - help='do dry run only, do not update the database' + "--test", + action="store_true", + help="do dry run only, do not update the database" ) parser.add_argument( - '--file', + "--file", type=str, - help='load an existing zip file (otherwise fetch current data)' + help="load an existing zip file (otherwise fetch current data)" ) parser.add_argument( - '--issue', + "--issue", type=int, - help='issue of the file (e.g. 201740); used iff --file is given' + help="issue of the file (e.g. 
201740); used iff --file is given" ) + # fmt: on args = parser.parse_args() if (args.file is None) != (args.issue is None): - raise Exception('--file and --issue must both be present or absent') + raise Exception("--file and --issue must both be present or absent") - date = datetime.datetime.now().strftime('%Y-%m-%d') - print('assuming release date is today, %s' % date) + date = datetime.datetime.now().strftime("%Y-%m-%d") + print(f"assuming release date is today, {date}") if args.file: update_from_file(args.issue, date, args.file, test_mode=args.test) @@ -247,34 +269,31 @@ def main(): max_tries = 5 while flag < max_tries: flag = flag + 1 - tmp_dir = ''.join(random.choice('0123456789abcdefghijklmnopqrstuvwxyz') for i in range(8)) - tmp_dir = 'downloads_' + tmp_dir - subprocess.call(["mkdir",tmp_dir]) - # Use temporary directory to avoid data from different time - # downloaded to same folder - get_paho_data(dir=tmp_dir) - issue = EpiDate.today().get_ew() - # Check to make sure we downloaded a file for every week - issueset = set() - files = glob.glob('%s/*.csv' % tmp_dir) - for filename in files: - with open(filename,'r') as f: - _ = f.readline() - data = f.readline().split(',') - issueset.add(data[6]) - db_error = False - if len(issueset) >= 53: # Shouldn't be more than 53 + with tempfile.TemporaryDirectory() as tmp_dir: + # Use temporary directory to avoid data from different time + # downloaded to same folder + get_paho_data(dir=tmp_dir) + issue = EpiDate.today().get_ew() + # Check to make sure we downloaded a file for every week + issueset = set() + files = glob.glob(f"{tmp_dir}/*.csv") for filename in files: - try: - update_from_file(issue, date, filename, test_mode=args.test) - subprocess.call(["rm",filename]) - except: - db_error = True - subprocess.call(["rm","-r",tmp_dir]) - if not db_error: - break # Exit loop with success + with open(filename) as f: + _ = f.readline() + data = f.readline().split(",") + issueset.add(data[6]) + db_error = False + if len(issueset) >= 53: # Shouldn't be more than 53 + for filename in files: + try: + update_from_file(issue, date, filename, test_mode=args.test) + except: + db_error = True + if not db_error: + break # Exit loop with success if flag >= max_tries: - print('WARNING: Database `paho_dengue` did not update successfully') + print("WARNING: Database `paho_dengue` did not update successfully") + -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/src/acquisition/paho/paho_download.py b/src/acquisition/paho/paho_download.py index 60dd13ae8..c6fa70285 100644 --- a/src/acquisition/paho/paho_download.py +++ b/src/acquisition/paho/paho_download.py @@ -1,4 +1,3 @@ - # IMPORTANT: This code is extremely unstable. # Slight changes to the PAHO website may render this script partially or entirely useless. @@ -15,42 +14,51 @@ headerheight = 0 + def wait_for(browser, css_selector, delay=10): try: - WebDriverWait(browser, delay).until(EC.presence_of_element_located((By.CSS_SELECTOR, css_selector))) - WebDriverWait(browser, delay).until(EC.element_to_be_clickable((By.CSS_SELECTOR, css_selector))) - print('Success Loading %s' % (css_selector)) + WebDriverWait(browser, delay).until( + EC.presence_of_element_located((By.CSS_SELECTOR, css_selector)) + ) + WebDriverWait(browser, delay).until( + EC.element_to_be_clickable((By.CSS_SELECTOR, css_selector)) + ) + print(f"Success Loading {css_selector}") except TimeoutException: - print("Loading %s took too much time!" 
% (css_selector)) - + print(f"Loading {css_selector} took too much time!") + + def find_and_click(browser, element): element.location_once_scrolled_into_view browser.switch_to.default_content() - browser.execute_script("window.scrollBy(0,-%d)"%headerheight) + browser.execute_script(f"window.scrollBy(0,-{int(headerheight)})") browser.switch_to.frame(browser.find_element_by_tag_name("iframe")) browser.switch_to.frame(browser.find_element_by_tag_name("iframe")) element.click() -def get_paho_data(offset=0, dir='downloads'): + +def get_paho_data(offset=0, dir="downloads"): opts = Options() opts.set_headless() assert opts.headless # Operating in headless mode fp = FirefoxProfile() - fp.set_preference("browser.download.folderList",2) - fp.set_preference("browser.download.manager.showWhenStarting",False) - fp.set_preference("browser.download.dir",os.path.abspath(dir)) - fp.set_preference("browser.helperApps.neverAsk.saveToDisk","text/csv") - - browser = Firefox(options=opts,firefox_profile=fp) - browser.get('http://www.paho.org/data/index.php/en/mnu-topics/indicadores-dengue-en/dengue-nacional-en/252-dengue-pais-ano-en.html?showall=&start=1') + fp.set_preference("browser.download.folderList", 2) + fp.set_preference("browser.download.manager.showWhenStarting", False) + fp.set_preference("browser.download.dir", os.path.abspath(dir)) + fp.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/csv") + + browser = Firefox(options=opts, firefox_profile=fp) + browser.get( + "https://www.paho.org/data/index.php/en/mnu-topics/indicadores-dengue-en/dengue-nacional-en/252-dengue-pais-ano-en.html?showall=&start=1" + ) tab1 = browser.window_handles[0] - browser.execute_script('''window.open("","_blank");''') + browser.execute_script("""window.open("","_blank");""") tab2 = browser.window_handles[1] browser.switch_to.window(tab1) - + curr_offset = offset - + wait_for(browser, "div.rt-top-inner", delay=30) header = browser.find_element_by_css_selector("div.rt-top-inner") global headerheight @@ -59,41 +67,51 @@ def get_paho_data(offset=0, dir='downloads'): # The actual content of the data of this webpage is within 2 iframes, so we need to navigate into them first browser.switch_to.frame(browser.find_element_by_tag_name("iframe")) browser.switch_to.frame(browser.find_element_by_tag_name("iframe")) - + # Locate the button that allows to download the table - downloadoption = browser.find_elements_by_css_selector("div.tabToolbarButton.tab-widget.download")[0] + downloadoption = browser.find_elements_by_css_selector( + "div.tabToolbarButton.tab-widget.download" + )[0] find_and_click(browser, downloadoption) wait_for(browser, "div[data-tb-test-id='DownloadImage-Button']") # Locate the button that prepares the table for download as an image - imagebutton = browser.find_elements_by_css_selector("div[data-tb-test-id='DownloadImage-Button']")[0] + imagebutton = browser.find_elements_by_css_selector( + "div[data-tb-test-id='DownloadImage-Button']" + )[0] find_and_click(browser, imagebutton) wait_for(browser, ".tabDownloadFileButton[data-test-id='DownloadLink']") # Locate the button that downloads the table as an image - downloadbutton = browser.find_elements_by_css_selector(".tabDownloadFileButton[data-test-id='DownloadLink']")[0] + downloadbutton = browser.find_elements_by_css_selector( + ".tabDownloadFileButton[data-test-id='DownloadLink']" + )[0] # Extract session ID href = downloadbutton.get_attribute("href") startidx = href.index("sessions/") + len("sessions/") - endidx = href.index("/",startidx) + endidx = 
href.index("/", startidx) sessionid = href[startidx:endidx] - dataurl = "http://phip.paho.org/vizql/w/Casosdedengue_tben/v/ByLastAvailableEpiWeek/viewData/sessions/%s/views/18076444178507886853_9530488980060483892?maxrows=200&viz=%%7B%%22worksheet%%22:%%22W%%20By%%20Last%%20Available%%20EpiWeek%%22,%%22dashboard%%22:%%22By%%20Last%%20Available%%20Epi%%20Week%%22%%7D"%sessionid + dataurl = f"https://phip.paho.org/vizql/w/Casosdedengue_tben/v/ByLastAvailableEpiWeek/viewData/sessions/{sessionid}/views/18076444178507886853_9530488980060483892?maxrows=200&viz=%%7B%%22worksheet%%22:%%22W%%20By%%20Last%%20Available%%20EpiWeek%%22,%%22dashboard%%22:%%22By%%20Last%%20Available%%20Epi%%20Week%%22%%7D" wait_for(browser, "div[data-tb-test-id='CancelBtn-Button']") # Cancel image download - cancelbutton = browser.find_elements_by_css_selector("div[data-tb-test-id='CancelBtn-Button']")[0] + cancelbutton = browser.find_elements_by_css_selector("div[data-tb-test-id='CancelBtn-Button']")[ + 0 + ] find_and_click(browser, cancelbutton) wait_for(browser, "div[id='tableau_base_widget_FilterPanel_0']") # Default is to show data for current year, we want to get all years # Clicks drop-down menu to open options - yearselector = browser.find_elements_by_css_selector("div[id='tableau_base_widget_FilterPanel_0']")[0] + yearselector = browser.find_elements_by_css_selector( + "div[id='tableau_base_widget_FilterPanel_0']" + )[0] find_and_click(browser, yearselector) wait_for(browser, "div.facetOverflow") @@ -107,27 +125,29 @@ def get_paho_data(offset=0, dir='downloads'): for i in range(offset): gp = browser.find_element_by_css_selector("div.wcGlassPane") - #print gp.is_enabled() - #print gp.is_selected() - #print gp.is_displayed() + # print gp.is_enabled() + # print gp.is_selected() + # print gp.is_displayed() try: WebDriverWait(browser, 10).until(EC.staleness_of(gp)) - print("Loaded next week % d" % (53-offset)) + print(f"Loaded next week {int(53 - offset)}") except TimeoutException: - print("Loading next week %d took too much time!" 
% (53-offset)) + print(f"Loading next week {int(53 - offset)} took too much time!") gp = browser.find_element_by_css_selector("div.wcGlassPane") - #print gp.is_enabled() - #print gp.is_selected() - #print gp.is_displayed() - x = browser.find_elements_by_css_selector("div.dijitReset.dijitSliderButtonContainer.dijitSliderButtonContainerH.tableauArrowDec")[0] + # print gp.is_enabled() + # print gp.is_selected() + # print gp.is_displayed() + x = browser.find_elements_by_css_selector( + "div.dijitReset.dijitSliderButtonContainer.dijitSliderButtonContainerH.tableauArrowDec" + )[0] find_and_click(browser, x) # Cycle through all weeks, downloading each week as a separate .csv # Theoretically, need to cycle 53 times, but in practice only 54 works, unsure why - for i in range(54-offset): + for i in range(54 - offset): # If something goes wrong for whatever reason, try from the beginning try: - print('Loading week %d' % (53-i)) + print(f"Loading week {int(53 - i)}") # (Re-)load URL browser.switch_to.window(tab2) browser.get(dataurl) @@ -137,7 +157,9 @@ def get_paho_data(offset=0, dir='downloads'): full_data_tab = browser.find_elements_by_css_selector("li[id='tab-view-full-data']")[0] full_data_tab.click() - wait_for(browser, "a.csvLink") # Sometimes this fails but the button is successfully clicked anyway, not sure why + wait_for( + browser, "a.csvLink" + ) # Sometimes this fails but the button is successfully clicked anyway, not sure why # Actually download the data as a .csv (Will be downloaded to Firefox's default download destination) data_links = browser.find_elements_by_css_selector("a.csvLink") data_link = None @@ -149,16 +171,22 @@ def get_paho_data(offset=0, dir='downloads'): # Locate button that decreases the current week by 1 browser.switch_to.window(tab1) - wait_for(browser, "div.dijitReset.dijitSliderButtonContainer.dijitSliderButtonContainerH.tableauArrowDec") - - x = browser.find_elements_by_css_selector("div.dijitReset.dijitSliderButtonContainer.dijitSliderButtonContainerH.tableauArrowDec")[0] + wait_for( + browser, + "div.dijitReset.dijitSliderButtonContainer.dijitSliderButtonContainerH.tableauArrowDec", + ) + + x = browser.find_elements_by_css_selector( + "div.dijitReset.dijitSliderButtonContainer.dijitSliderButtonContainerH.tableauArrowDec" + )[0] find_and_click(browser, x) curr_offset += 1 except Exception as e: - print('Got exception %s\nTrying again from week %d' % (e,53-offset)) + print(f"Got exception {e}\nTrying again from week {int(53 - offset)}") browser.quit() get_paho_data(offset=curr_offset) browser.quit() -if __name__ == '__main__': - get_paho_data(dir='downloads/') + +if __name__ == "__main__": + get_paho_data(dir="downloads/") diff --git a/src/acquisition/quidel/quidel.py b/src/acquisition/quidel/quidel.py index a7c9a2918..0540d5e7c 100644 --- a/src/acquisition/quidel/quidel.py +++ b/src/acquisition/quidel/quidel.py @@ -1,4 +1,4 @@ -''' +""" =============== === Purpose === =============== @@ -15,7 +15,7 @@ * add end date, end week check 2017-12-02: * original version -''' +""" # standard library from collections import defaultdict @@ -35,148 +35,187 @@ import delphi.utils.epidate as ED from delphi.utils.geo.locations import Locations -def word_map(row,terms): - for (k,v) in terms.items(): - row = row.replace(k,v) + +def word_map(row, terms): + for (k, v) in terms.items(): + row = row.replace(k, v) return row -def date_less_than(d1,d2): - y1,m1,d1 = [int(x) for x in d1.split('-')] - y2,m2,d2 = [int(x) for x in d2.split('-')] - if y1*10000+m1*100+d10: shifted to future 
def date_to_epiweek(date, shift=0): - y,m,d = [int(x) for x in date.split('-')] + y, m, d = (int(x) for x in date.split("-")) - epidate = ED.EpiDate(y,m,d) + epidate = ED.EpiDate(y, m, d) epidate = epidate.add_days(shift) ew = epidate.get_ew() return ew + # convert measurment to time series format # startweek and endweek are inclusive -def measurement_to_ts(m,index,startweek=None,endweek=None): +def measurement_to_ts(m, index, startweek=None, endweek=None): if startweek is None: startweek = 0 if endweek is None: endweek = 999999 res = {} - for r,rdict in m.items(): - res[r]={} - for t,vals in rdict.items(): - if index>=len(vals): + for r, rdict in m.items(): + res[r] = {} + for t, vals in rdict.items(): + if index >= len(vals): raise Exception("Index is invalid") - if t>=startweek and t<=endweek: + if t >= startweek and t <= endweek: res[r][t] = vals[index] return res + class QuidelData: def __init__(self, raw_path, load_email=True): self.data_path = raw_path - self.excel_uptodate_path = join(raw_path,'excel/uptodate') - self.excel_history_path = join(raw_path,'excel/history') - self.csv_path = join(raw_path,'csv') + self.excel_uptodate_path = join(raw_path, "excel/uptodate") + self.excel_history_path = join(raw_path, "excel/history") + self.csv_path = join(raw_path, "csv") self.xlsx_uptodate_list = [ - f[:-5] for f in listdir(self.excel_uptodate_path) if isfile(join(self.excel_uptodate_path, f)) and f[-5:]=='.xlsx' + f[:-5] + for f in listdir(self.excel_uptodate_path) + if isfile(join(self.excel_uptodate_path, f)) and f[-5:] == ".xlsx" ] self.xlsx_history_list = [ - f[:-5] for f in listdir(self.excel_history_path) if isfile(join(self.excel_history_path, f)) and f[-5:]=='.xlsx' + f[:-5] + for f in listdir(self.excel_history_path) + if isfile(join(self.excel_history_path, f)) and f[-5:] == ".xlsx" + ] + self.csv_list = [ + f[:-4] + for f in listdir(self.csv_path) + if isfile(join(self.csv_path, f)) and f[-4:] == ".csv" ] - self.csv_list = [f[:-4] for f in listdir(self.csv_path) if isfile(join(self.csv_path, f)) and f[-4:]=='.csv'] self.map_terms = { - ' FL 34637"':'FL', + ' FL 34637"': "FL", } # hardcoded parameters self.date_dim = 1 self.state_dim = 4 self.fields = [ - 'sofia_ser','date','fac_id','city','state','zip','age', - 'fluA','fluB','fluAll','county','fac_type' + "sofia_ser", + "date", + "fac_id", + "city", + "state", + "zip", + "age", + "fluA", + "fluB", + "fluAll", + "county", + "fac_type", ] - self.fields_to_keep = ['fac_id','fluA','fluB','fluAll'] + self.fields_to_keep = ["fac_id", "fluA", "fluB", "fluAll"] self.dims_to_keep = [self.fields.index(x) for x in self.fields_to_keep] if load_email: self.retrieve_excels() self.prepare_csv() def retrieve_excels(self): - detach_dir = self.excel_uptodate_path # directory where to save attachments (default: current) + detach_dir = ( + self.excel_uptodate_path + ) # directory where to save attachments (default: current) # connecting to the gmail imap server m = imaplib.IMAP4_SSL("imap.gmail.com") - m.login(secrets.quidel.email_addr,secrets.quidel.email_pwd) - m.select("INBOX") # here you a can choose a mail box like INBOX instead + m.login(secrets.quidel.email_addr, secrets.quidel.email_pwd) + m.select("INBOX") # here you a can choose a mail box like INBOX instead # use m.list() to get all the mailboxes - _, items = m.search(None, "ALL") # you could filter using the IMAP rules here (check http://www.example-code.com/csharp/imap-search-critera.asp) - items = items[0].split() # getting the mails id + # you could filter using the IMAP rules 
here (check https://www.example-code.com/csharp/imap-search-critera.asp) + _, items = m.search(None, "ALL") + items = items[0].split() # getting the mails id # The emailids are ordered from past to now for emailid in items: - _, data = m.fetch(emailid, "(RFC822)") # fetching the mail, "`(RFC822)`" means "get the whole stuff", but you can ask for headers only, etc - email_body = data[0][1].decode('utf-8') # getting the mail content - mail = email.message_from_string(email_body) # parsing the mail content to get a mail object - - #Check if any attachments at all - if mail.get_content_maintype() != 'multipart': + _, data = m.fetch( + emailid, "(RFC822)" + ) # fetching the mail, "`(RFC822)`" means "get the whole stuff", but you can ask for headers only, etc + email_body = data[0][1].decode("utf-8") # getting the mail content + mail = email.message_from_string( + email_body + ) # parsing the mail content to get a mail object + + # Check if any attachments at all + if mail.get_content_maintype() != "multipart": continue # we use walk to create a generator so we can iterate on the parts and forget about the recursive headach for part in mail.walk(): # multipart are just containers, so we skip them - if part.get_content_maintype() == 'multipart': + if part.get_content_maintype() == "multipart": continue # is this part an attachment ? - if part.get('Content-Disposition') is None: + if part.get("Content-Disposition") is None: continue filename = part.get_filename() # check duplicates - if filename[-5:]!='.xlsx' or filename[:-5] in self.xlsx_uptodate_list+self.xlsx_history_list: + if ( + filename[-5:] != ".xlsx" + or filename[:-5] in self.xlsx_uptodate_list + self.xlsx_history_list + ): continue self.xlsx_uptodate_list.append(filename[:-5]) att_path = os.path.join(detach_dir, filename) - #Check if its already there - if not os.path.isfile(att_path) : + # Check if its already there + if not os.path.isfile(att_path): # finally write the stuff - fp = open(att_path, 'wb') + fp = open(att_path, "wb") fp.write(part.get_payload(decode=True)) fp.close() def prepare_csv(self): - need_update=False + need_update = False for f in self.xlsx_uptodate_list: if f in self.csv_list: continue else: - need_update=True + need_update = True - date_regex = '\d{2}-\d{2}-\d{4}' - date_items = re.findall(date_regex,f) + date_regex = r"\d{2}-\d{2}-\d{4}" + date_items = re.findall(date_regex, f) if date_items: - end_date = '-'.join(date_items[-1].split('-')[x] for x in [2,0,1]) + end_date = "-".join(date_items[-1].split("-")[x] for x in [2, 0, 1]) else: - print("End date not found in file name:"+f) + print("End date not found in file name:" + f) end_date = None - df_dict = pd.read_excel(join(self.excel_uptodate_path, f+'.xlsx'), sheet_name=None) - for (_,df) in df_dict.items(): - df = df.dropna(axis=0, how='all') - df['TestDate'] = df['TestDate'].apply(lambda x: x.strftime('%Y-%m-%d')) - df_filtered = df[df['TestDate']!=''] + df_dict = pd.read_excel(join(self.excel_uptodate_path, f + ".xlsx"), sheet_name=None) + for (_, df) in df_dict.items(): + df = df.dropna(axis=0, how="all") + df["TestDate"] = df["TestDate"].apply(lambda x: x.strftime("%Y-%m-%d")) + df_filtered = df[df["TestDate"] != ""] if end_date is not None: - df_filtered = df_filtered[df.apply(lambda x: date_less_than(end_date,x['TestDate'])!=1, axis=1)] - df_filtered.to_csv(join(self.csv_path, f+'.csv'), index=False, encoding='utf-8') - self.csv_list = [f[:-4] for f in listdir(self.csv_path) if isfile(join(self.csv_path, f)) and f[-4:]=='.csv'] + df_filtered = 
df_filtered[ + df.apply(lambda x: date_less_than(end_date, x["TestDate"]) != 1, axis=1) + ] + df_filtered.to_csv(join(self.csv_path, f + ".csv"), index=False, encoding="utf-8") + self.csv_list = [ + f[:-4] + for f in listdir(self.csv_path) + if isfile(join(self.csv_path, f)) and f[-4:] == ".csv" + ] self.need_update = need_update def load_csv(self, dims=None): @@ -186,12 +225,12 @@ def load_csv(self, dims=None): for f in self.csv_list: if f in self.xlsx_history_list: continue - rf = open(join(self.csv_path,f+'.csv')) + rf = open(join(self.csv_path, f + ".csv")) lines = rf.readlines() for l in lines[1:]: - l = word_map(l,self.map_terms) - row = l.strip().split(',') + l = word_map(l, self.map_terms) + row = l.strip().split(",") date = row[self.date_dim] state = row[self.state_dim] if state not in parsed_dict[date]: @@ -202,7 +241,7 @@ def load_csv(self, dims=None): # hardcoded aggregation function # output: [#unique_device,fluA,fluB,fluAll,total] - def prepare_measurements(self,data_dict,use_hhs=True,start_weekday=6): + def prepare_measurements(self, data_dict, use_hhs=True, start_weekday=6): buffer_dict = {} if use_hhs: region_list = Locations.hhs_list @@ -210,34 +249,35 @@ def prepare_measurements(self,data_dict,use_hhs=True,start_weekday=6): region_list = Locations.atom_list def get_hhs_region(atom): - for region in Locations.hhs_list: - if atom.lower() in Locations.hhs_map[region]: - return region - if atom.lower() == 'ny': - return 'hhs2' - return atom + for region in Locations.hhs_list: + if atom.lower() in Locations.hhs_map[region]: + return region + if atom.lower() == "ny": + return "hhs2" + return atom day_shift = 6 - start_weekday - time_map = lambda x:date_to_epiweek(x,shift=day_shift) - region_map = lambda x:get_hhs_region(x) \ - if use_hhs and x not in Locations.hhs_list else x # a bit hacky + time_map = lambda x: date_to_epiweek(x, shift=day_shift) + region_map = ( + lambda x: get_hhs_region(x) if use_hhs and x not in Locations.hhs_list else x + ) # a bit hacky end_date = sorted(data_dict.keys())[-1] # count the latest week in only if Thurs data is included - end_epiweek = date_to_epiweek(end_date,shift=-4) + end_epiweek = date_to_epiweek(end_date, shift=-4) # first pass: prepare device_id set device_dict = {} - for (date,daily_dict) in data_dict.items(): + for (date, daily_dict) in data_dict.items(): if not date: continue ew = time_map(date) - if ew == -1 or ew>end_epiweek: + if ew == -1 or ew > end_epiweek: continue if ew not in device_dict: - device_dict[ew]={} + device_dict[ew] = {} for r in region_list: device_dict[ew][r] = set() - for (state,rec_list) in daily_dict.items(): + for (state, rec_list) in daily_dict.items(): region = region_map(state) # get rid of non-US regions if region not in region_list: @@ -247,38 +287,40 @@ def get_hhs_region(atom): device_dict[ew][region].add(fac) # second pass: prepare all measurements - for (date,daily_dict) in data_dict.items(): + for (date, daily_dict) in data_dict.items(): ew = time_map(date) - if ew == -1 or ew>end_epiweek: + if ew == -1 or ew > end_epiweek: continue if ew not in buffer_dict: - buffer_dict[ew]={} + buffer_dict[ew] = {} for r in region_list: - buffer_dict[ew][r] = [0.0]*8 + buffer_dict[ew][r] = [0.0] * 8 - for (state,rec_list) in daily_dict.items(): + for (state, rec_list) in daily_dict.items(): region = region_map(state) # get rid of non-US regions if region not in region_list: continue for rec in rec_list: fac_num = float(len(device_dict[ew][region])) - buffer_dict[ew][region]= np.add( - buffer_dict[ew][region],[ 
- rec[1]=='positive', - rec[2]=='positive', - rec[3]=='positive', + buffer_dict[ew][region] = np.add( + buffer_dict[ew][region], + [ + rec[1] == "positive", + rec[2] == "positive", + rec[3] == "positive", 1.0, - float(rec[1]=='positive')/fac_num, - float(rec[2]=='positive')/fac_num, - float(rec[3]=='positive')/fac_num, - 1.0/fac_num, - ]).tolist() + float(rec[1] == "positive") / fac_num, + float(rec[2] == "positive") / fac_num, + float(rec[3] == "positive") / fac_num, + 1.0 / fac_num, + ], + ).tolist() # switch two dims of dict result_dict = {} for r in region_list: - result_dict[r]={} - for (k,v) in buffer_dict.items(): - result_dict[r][k]=v[r] + result_dict[r] = {} + for (k, v) in buffer_dict.items(): + result_dict[r][k] = v[r] return result_dict diff --git a/src/acquisition/quidel/quidel_update.py b/src/acquisition/quidel/quidel_update.py index b6303533c..563cea898 100644 --- a/src/acquisition/quidel/quidel_update.py +++ b/src/acquisition/quidel/quidel_update.py @@ -1,4 +1,4 @@ -''' +""" =============== === Purpose === =============== @@ -33,7 +33,7 @@ 2017-12-02: * original version -''' +""" # standard library import argparse @@ -49,106 +49,142 @@ from delphi.utils.geo.locations import Locations LOCATIONS = Locations.hhs_list -DATAPATH = '/home/automation/quidel_data' +DATAPATH = "/home/automation/quidel_data" + def update(locations, first=None, last=None, force_update=False, load_email=True): - # download and prepare data first - qd = quidel.QuidelData(DATAPATH,load_email) - if not qd.need_update and not force_update: - print('Data not updated, nothing needs change.') - return - - qd_data = qd.load_csv() - qd_measurements = qd.prepare_measurements(qd_data,start_weekday=4) - qd_ts = quidel.measurement_to_ts(qd_measurements,7,startweek=first,endweek=last) - # connect to the database - u, p = secrets.db.epi - cnx = mysql.connector.connect(user=u, password=p, database='epidata') - cur = cnx.cursor() - - def get_num_rows(): - cur.execute('SELECT count(1) `num` FROM `quidel`') - for (num,) in cur: - pass - return num - - # check from 4 weeks preceeding the last week with data through this week - cur.execute('SELECT max(`epiweek`) `ew0`, yearweek(now(), 6) `ew1` FROM `quidel`') - for (ew0, ew1) in cur: - ew0 = 200401 if ew0 is None else flu.add_epiweeks(ew0, -4) - ew0 = ew0 if first is None else first - ew1 = ew1 if last is None else last - print('Checking epiweeks between %d and %d...' 
% (ew0, ew1)) - - # keep track of how many rows were added - rows_before = get_num_rows() - - # check Quidel for new and/or revised data - sql = ''' + # download and prepare data first + qd = quidel.QuidelData(DATAPATH, load_email) + if not qd.need_update and not force_update: + print("Data not updated, nothing needs change.") + return + + qd_data = qd.load_csv() + qd_measurements = qd.prepare_measurements(qd_data, start_weekday=4) + qd_ts = quidel.measurement_to_ts(qd_measurements, 7, startweek=first, endweek=last) + # connect to the database + u, p = secrets.db.epi + cnx = mysql.connector.connect(user=u, password=p, database="epidata") + cur = cnx.cursor() + + def get_num_rows(): + cur.execute("SELECT count(1) `num` FROM `quidel`") + for (num,) in cur: + pass + return num + + # check from 4 weeks preceeding the last week with data through this week + cur.execute("SELECT max(`epiweek`) `ew0`, yearweek(now(), 6) `ew1` FROM `quidel`") + for (ew0, ew1) in cur: + ew0 = 200401 if ew0 is None else flu.add_epiweeks(ew0, -4) + ew0 = ew0 if first is None else first + ew1 = ew1 if last is None else last + print(f"Checking epiweeks between {int(ew0)} and {int(ew1)}...") + + # keep track of how many rows were added + rows_before = get_num_rows() + + # check Quidel for new and/or revised data + sql = """ INSERT INTO `quidel` (`location`, `epiweek`, `value`) VALUES (%s, %s, %s) ON DUPLICATE KEY UPDATE `value` = %s - ''' - - total_rows = 0 - - for location in locations: - if location not in qd_ts: - continue - ews = sorted(qd_ts[location].keys()) - num_missing = 0 - for ew in ews: - v = qd_ts[location][ew] - sql_data = (location, ew, v, v) - cur.execute(sql, sql_data) - total_rows += 1 - if v == 0: - num_missing += 1 - if num_missing > 0: - print(' [%s] missing %d/%d value(s)' % (location, num_missing, len(ews))) - - # keep track of how many rows were added - rows_after = get_num_rows() - print('Inserted %d/%d row(s)'%(rows_after - rows_before, total_rows)) - - # cleanup - cur.close() - cnx.commit() - cnx.close() + """ + + total_rows = 0 + + for location in locations: + if location not in qd_ts: + continue + ews = sorted(qd_ts[location].keys()) + num_missing = 0 + for ew in ews: + v = qd_ts[location][ew] + sql_data = (location, ew, v, v) + cur.execute(sql, sql_data) + total_rows += 1 + if v == 0: + num_missing += 1 + if num_missing > 0: + print(f" [{location}] missing {int(num_missing)}/{len(ews)} value(s)") + + # keep track of how many rows were added + rows_after = get_num_rows() + print(f"Inserted {int(rows_after - rows_before)}/{int(total_rows)} row(s)") + + # cleanup + cur.close() + cnx.commit() + cnx.close() def main(): - # args and usage - parser = argparse.ArgumentParser() - parser.add_argument('--location', action='store', type=str, default=None, help='location(s) (ex: all; any of hhs1-10)') - parser.add_argument('--first', '-f', default=None, type=int, help='first epiweek override') - parser.add_argument('--last', '-l', default=None, type=int, help='last epiweek override') - parser.add_argument('--force_update', '-u', action='store_true', help='force update db values') - parser.add_argument('--skip_email', '-s', action='store_true', help='skip email downloading step') - args = parser.parse_args() - - # sanity check - first, last, force_update, skip_email = args.first, args.last, args.force_update, args.skip_email - load_email = not skip_email - if first is not None: - flu.check_epiweek(first) - if last is not None: - flu.check_epiweek(last) - if first is not None and last is not None and first 
> last: - raise Exception('epiweeks in the wrong order') - - # decide what to update - if args.location.lower() == 'all': - locations = LOCATIONS - else: - locations = args.location.lower().split(',') - - # run the update - update(locations, first, last, force_update, load_email) - - -if __name__ == '__main__': - main() + # args and usage + parser = argparse.ArgumentParser() + # fmt: off + parser.add_argument( + "--location", + action="store", + type=str, + default=None, + help="location(s) (ex: all; any of hhs1-10)" + ) + parser.add_argument( + "--first", + "-f", + default=None, + type=int, + help="first epiweek override" + ) + parser.add_argument( + "--last", + "-l", + default=None, + type=int, + help="last epiweek override" + ) + parser.add_argument( + "--force_update", + "-u", + action="store_true", + help="force update db values" + ) + parser.add_argument( + "--skip_email", + "-s", + action="store_true", + help="skip email downloading step" + ) + # fmt: on + args = parser.parse_args() + + # sanity check + first, last, force_update, skip_email = ( + args.first, + args.last, + args.force_update, + args.skip_email, + ) + load_email = not skip_email + if first is not None: + flu.check_epiweek(first) + if last is not None: + flu.check_epiweek(last) + if first is not None and last is not None and first > last: + raise Exception("epiweeks in the wrong order") + + # decide what to update + if args.location.lower() == "all": + locations = LOCATIONS + else: + locations = args.location.lower().split(",") + + # run the update + update(locations, first, last, force_update, load_email) + + +if __name__ == "__main__": + main() diff --git a/src/acquisition/twtr/healthtweets.py b/src/acquisition/twtr/healthtweets.py index 78eb2b3ec..c1e345162 100644 --- a/src/acquisition/twtr/healthtweets.py +++ b/src/acquisition/twtr/healthtweets.py @@ -1,4 +1,4 @@ -''' +""" =============== === Purpose === =============== @@ -20,7 +20,7 @@ * Fetching daily values instead of weekly values 2015-03-?? 
* Original version -''' +""" # standard library import argparse @@ -36,132 +36,242 @@ class HealthTweets: - # mapping from state abbreviations to location codes used by healthtweets.org - STATE_CODES = {'AL': 3024, 'AK': 3025, 'AZ': 3026, 'AR': 3027, 'CA': 440, 'CO': 3029, 'CT': 3030, 'DE': 3031, 'DC': 3032, 'FL': 3033, 'GA': 3034, 'HI': 3035, 'ID': 3036, 'IL': 3037, 'IN': 3038, 'IA': 3039, 'KS': 3040, 'KY': 3041, 'LA': 2183, 'ME': 3043, 'MD': 3044, 'MA': 450, 'MI': 3046, 'MN': 3047, 'MS': 3048, 'MO': 3049, 'MT': 3050, 'NE': 3051, 'NV': 3052, 'NH': 3053, 'NJ': 478, 'NM': 2225, 'NY': 631, 'NC': 3057, 'ND': 3058, 'OH': 3059, 'OK': 3060, 'OR': 281, 'PA': 3062, 'RI': 3063, 'SC': 3064, 'SD': 3065, 'TN': 3066, 'TX': 3067, 'UT': 2272, 'VT': 3069, 'VA': 3070, 'WA': 3071, 'WV': 3072, 'WI': 3073, 'WY': 3074} - - def __init__(self, username, password, debug=False): - self.debug = debug - self.session = requests.Session() - # spoof a web browser - self.session.headers.update({ - 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36', - }) - # get the login token - response = self._go('http://www.healthtweets.org/accounts/login') - token = self._get_token(response.text) - if self.debug: - print('token=%s'%(token)) - data = { - 'csrfmiddlewaretoken': token, - 'username': username, - 'password': password, - 'next': '/', + # mapping from state abbreviations to location codes used by healthtweets.org + STATE_CODES = { + "AL": 3024, + "AK": 3025, + "AZ": 3026, + "AR": 3027, + "CA": 440, + "CO": 3029, + "CT": 3030, + "DE": 3031, + "DC": 3032, + "FL": 3033, + "GA": 3034, + "HI": 3035, + "ID": 3036, + "IL": 3037, + "IN": 3038, + "IA": 3039, + "KS": 3040, + "KY": 3041, + "LA": 2183, + "ME": 3043, + "MD": 3044, + "MA": 450, + "MI": 3046, + "MN": 3047, + "MS": 3048, + "MO": 3049, + "MT": 3050, + "NE": 3051, + "NV": 3052, + "NH": 3053, + "NJ": 478, + "NM": 2225, + "NY": 631, + "NC": 3057, + "ND": 3058, + "OH": 3059, + "OK": 3060, + "OR": 281, + "PA": 3062, + "RI": 3063, + "SC": 3064, + "SD": 3065, + "TN": 3066, + "TX": 3067, + "UT": 2272, + "VT": 3069, + "VA": 3070, + "WA": 3071, + "WV": 3072, + "WI": 3073, + "WY": 3074, } - # login to the site - response = self._go('http://www.healthtweets.org/accounts/login', data=data) - if response.status_code != 200 or 'Your username and password' in response.text: - raise Exception('login failed') - - def get_values(self, state, date1, date2): - ''' - state: two-letter state abbreviation (see STATE_CODES) - date1: the first date in the range, inclusive (format: YYYY-MM-DD) - date2: the last date in the range, inclusive (format: YYYY-MM-DD) - returns a dictionary (by date) of number of flu tweets (num) and total tweets (total) - ''' - # get raw values (number of flu tweets) and normalized values (flu tweets as a percent of total tweets) - raw_values = self._get_values(state, date1, date2, False) - normalized_values = self._get_values(state, date1, date2, True) - values = {} - # save the raw number and calculate the total - for date in raw_values.keys(): - if normalized_values[date] == 0: - continue - values[date] = { - 'num': round(raw_values[date]), - 'total': round(100 * raw_values[date] / normalized_values[date]), - } - print(date, raw_values[date], normalized_values[date]) - return values - - def _get_values(self, state, date1, date2, normalized): - if state not in HealthTweets.STATE_CODES: - raise Exception('invalid state') - state_code = HealthTweets.STATE_CODES[state] - d1, d2 = datetime.strptime(date1, 
'%Y-%m-%d'), datetime.strptime(date2, '%Y-%m-%d') - s1, s2 = d1.strftime('%m%%2F%d%%2F%Y'), d2.strftime('%m%%2F%d%%2F%Y') - count_type = 'normalized' if normalized else 'raw' - url = 'http://www.healthtweets.org/trends/plot?resolution=Day&count_type=%s&dayNum=%d&from=%s&to=%s&plot1_disease=65&location_plot1=%d'%(count_type, (d2 - d1).days, s1, s2, state_code) - response = self._go('http://www.healthtweets.org/trends/plot?resolution=Day&count_type=%s&dayNum=%d&from=%s&to=%s&plot1_disease=65&location_plot1=%d'%(count_type, (d2 - d1).days, s1, s2, state_code)) - #print(state, date1, date2, normalized) - #print(url) - #print(response.status_code) - if response.status_code != 200: - raise Exception('plot status is ' + str(response.status_code) + ' (when was data last updated?)') - lines = [line.strip() for line in response.text.split('\n')] - data_line = [line for line in lines if line[:16] == 'var chartData = '] - if len(data_line) != 1: - raise Exception('lookup failed') - values = json.loads(data_line[0][16:-1]) - return dict([(datetime.strptime(v[0], '%m/%d/%Y').strftime('%Y-%m-%d'), float(v[1])) for v in values]) - - def check_state(self, state): - ''' - Sanity checks state code mapping. - state: two-letter state abbreviation (see STATE_CODES) - returns the full state name associated with the state abbreviation - ''' - if state not in HealthTweets.STATE_CODES: - raise Exception('invalid state') - state_code = HealthTweets.STATE_CODES[state] - response = self._go('http://www.healthtweets.org/trends/plot?resolution=Day&count_type=normalized&dayNum=7&from=01%%2F01%%2F2015&to=01%%2F07%%2F2015&plot1_disease=65&location_plot1=%d'%(state_code)) - lines = [line.strip() for line in response.text.split('\n')] - data_line = [line for line in lines if line[:29] == 'var plotNames = ["Influenza ('] - if len(data_line) == 0: - raise Exception('check failed') - name = data_line[0][29:] - name = name.split('(')[0] - return name.strip() - - def _get_token(self, html): - page = PageParser.parse(html) - hidden = PageParser.filter_all(page, [('html',), ('body',), ('div',), ('div',), ('div',), ('form',), ('input',)]) - return hidden['attrs']['value'] - - def _go(self, url, method=None, referer=None, data=None): - if self.debug: - print('%s'%(url)) - if method is None: - if data is None: - method = self.session.get - else: - method = self.session.post - response = method(url, headers={'referer': referer}, data=data) - html = response.text - if self.debug: - for item in response.history: - print(' [%d to %s]'%(item.status_code, item.headers['Location'])) - print(' %d (%d bytes)'%(response.status_code, len(html))) - return response + + def __init__(self, username, password, debug=False): + self.debug = debug + self.session = requests.Session() + # spoof a web browser + self.session.headers.update( + { + "User-Agent": "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36", + } + ) + # get the login token + response = self._go("https://www.healthtweets.org/accounts/login") + token = self._get_token(response.text) + if self.debug: + print(f"token={token}") + data = { + "csrfmiddlewaretoken": token, + "username": username, + "password": password, + "next": "/", + } + # login to the site + response = self._go("https://www.healthtweets.org/accounts/login", data=data) + if response.status_code != 200 or "Your username and password" in response.text: + raise Exception("login failed") + + def get_values(self, state, date1, date2): + """ + state: two-letter state abbreviation 
(see STATE_CODES) + date1: the first date in the range, inclusive (format: YYYY-MM-DD) + date2: the last date in the range, inclusive (format: YYYY-MM-DD) + returns a dictionary (by date) of number of flu tweets (num) and total tweets (total) + """ + # get raw values (number of flu tweets) and normalized values (flu tweets as a percent of total tweets) + raw_values = self._get_values(state, date1, date2, False) + normalized_values = self._get_values(state, date1, date2, True) + values = {} + # save the raw number and calculate the total + for date in raw_values.keys(): + if normalized_values[date] == 0: + continue + values[date] = { + "num": round(raw_values[date]), + "total": round(100 * raw_values[date] / normalized_values[date]), + } + print(date, raw_values[date], normalized_values[date]) + return values + + def _get_values(self, state, date1, date2, normalized): + if state not in HealthTweets.STATE_CODES: + raise Exception("invalid state") + state_code = HealthTweets.STATE_CODES[state] + d1, d2 = datetime.strptime(date1, "%Y-%m-%d"), datetime.strptime(date2, "%Y-%m-%d") + s1, s2 = d1.strftime("%m%%2F%d%%2F%Y"), d2.strftime("%m%%2F%d%%2F%Y") + count_type = "normalized" if normalized else "raw" + response = self._go( + "https://www.healthtweets.org/trends/plot?resolution=Day" + f"&count_type={count_type}&dayNum={(d2 - d1).days}&from={s1}" + f"&to={s2}&plot1_disease=65&location_plot1={int(state_code)}" + ) + # print(state, date1, date2, normalized) + # print(url) + # print(response.status_code) + if response.status_code != 200: + raise Exception( + "plot status is " + str(response.status_code) + " (when was data last updated?)" + ) + lines = [line.strip() for line in response.text.split("\n")] + data_line = [line for line in lines if line[:16] == "var chartData = "] + if len(data_line) != 1: + raise Exception("lookup failed") + values = json.loads(data_line[0][16:-1]) + return { + datetime.strptime(v[0], "%m/%d/%Y").strftime("%Y-%m-%d"): float(v[1]) for v in values + } + + def check_state(self, state): + """ + Sanity checks state code mapping. 
+ state: two-letter state abbreviation (see STATE_CODES) + returns the full state name associated with the state abbreviation + """ + if state not in HealthTweets.STATE_CODES: + raise Exception("invalid state") + state_code = HealthTweets.STATE_CODES[state] + response = self._go( + "https://www.healthtweets.org/trends/plot?resolution=Day" + "&count_type=normalized&dayNum=7&from=01%%2F01%%2F2015" + f"&to=01%%2F07%%2F2015&plot1_disease=65&location_plot1={int(state_code)}" + ) + lines = [line.strip() for line in response.text.split("\n")] + data_line = [line for line in lines if line[:29] == 'var plotNames = ["Influenza ('] + if len(data_line) == 0: + raise Exception("check failed") + name = data_line[0][29:] + name = name.split("(")[0] + return name.strip() + + def _get_token(self, html): + page = PageParser.parse(html) + hidden = PageParser.filter_all( + page, [("html",), ("body",), ("div",), ("div",), ("div",), ("form",), ("input",)] + ) + return hidden["attrs"]["value"] + + def _go(self, url, method=None, referer=None, data=None): + if self.debug: + print(url) + if method is None: + if data is None: + method = self.session.get + else: + method = self.session.post + response = method(url, headers={"referer": referer}, data=data) + html = response.text + if self.debug: + for item in response.history: + print(f" [{int(item.status_code)} to {item.headers['Location']}]") + print(f" {int(response.status_code)} ({len(html)} bytes)") + return response def main(): - # args and usage - parser = argparse.ArgumentParser() - parser.add_argument('username', action='store', type=str, help='healthtweets.org username') - parser.add_argument('password', action='store', type=str, help='healthtweets.org password') - parser.add_argument('state', action='store', type=str, choices=list(HealthTweets.STATE_CODES.keys()), help='U.S. state (ex: TX)') - parser.add_argument('date1', action='store', type=str, help='first date, inclusive (ex: 2015-01-01)') - parser.add_argument('date2', action='store', type=str, help='last date, inclusive (ex: 2015-01-01)') - parser.add_argument('-d', '--debug', action='store_const', const=True, default=False, help='enable debug mode') - args = parser.parse_args() - - ht = HealthTweets(args.username, args.password, debug=args.debug) - values = ht.get_values(args.state, args.date1, args.date2) - print('Daily counts in %s from %s to %s:'%(ht.check_state(args.state), args.date1, args.date2)) - for date in sorted(list(values.keys())): - print('%s: num=%-4d total=%-5d (%.3f%%)'%(date, values[date]['num'], values[date]['total'], 100 * values[date]['num'] / values[date]['total'])) - - -if __name__ == '__main__': - main() + # args and usage + parser = argparse.ArgumentParser() + # fmt: off + parser.add_argument( + "username", + action="store", + type=str, + help="healthtweets.org username" + ) + parser.add_argument( + "password", + action="store", + type=str, + help="healthtweets.org password" + ) + parser.add_argument( + "state", + action="store", + type=str, + choices=list(HealthTweets.STATE_CODES.keys()), + help="U.S. 
state (ex: TX)" + ) + parser.add_argument( + "date1", + action="store", + type=str, + help="first date, inclusive (ex: 2015-01-01)" + ) + parser.add_argument( + "date2", + action="store", + type=str, + help="last date, inclusive (ex: 2015-01-01)" + ) + parser.add_argument( + "-d", + "--debug", + action="store_const", + const=True, + default=False, + help="enable debug mode" + ) + # fmt: on + args = parser.parse_args() + + ht = HealthTweets(args.username, args.password, debug=args.debug) + values = ht.get_values(args.state, args.date1, args.date2) + print(f"Daily counts in {ht.check_state(args.state)} from {args.date1} to {args.date2}:") + for date in sorted(list(values.keys())): + print( + "%s: num=%-4d total=%-5d (%.3f%%)" + % ( + date, + values[date]["num"], + values[date]["total"], + 100 * values[date]["num"] / values[date]["total"], + ) + ) + + +if __name__ == "__main__": + main() diff --git a/src/acquisition/twtr/pageparser.py b/src/acquisition/twtr/pageparser.py index 5e9aaaea1..2b2183c89 100644 --- a/src/acquisition/twtr/pageparser.py +++ b/src/acquisition/twtr/pageparser.py @@ -5,74 +5,73 @@ class PageParser(HTMLParser): - ''' - This is an HTML parser! All of the hard work is done by the superclass - (which is a Python built-in). This class puts the HTML into a hierarchy - that's (hopefully) easier to work with than raw string parsing. - ''' + """ + This is an HTML parser! All of the hard work is done by the superclass + (which is a Python built-in). This class puts the HTML into a hierarchy + that's (hopefully) easier to work with than raw string parsing. + """ - @staticmethod - def parse(html): - parser = PageParser() - parser.feed(html) - return parser.get_root_node() + @staticmethod + def parse(html): + parser = PageParser() + parser.feed(html) + return parser.get_root_node() - @staticmethod - def banlist(): - '''Commonly unclosed tags''' - return ('br', 'img', 'meta') + @staticmethod + def banlist(): + """Commonly unclosed tags""" + return ("br", "img", "meta") - @staticmethod - def new_node(type): - '''An empty node of the HTML tree''' - return {'type': type, 'attrs': {}, 'nodes': [], 'data': ''} + @staticmethod + def new_node(type): + """An empty node of the HTML tree""" + return {"type": type, "attrs": {}, "nodes": [], "data": ""} - @staticmethod - def filter_all(node, filters): - '''Applies all filters''' - for f in filters: - node = PageParser.filter(node, *f) - return node + @staticmethod + def filter_all(node, filters): + """Applies all filters""" + for f in filters: + node = PageParser.filter(node, *f) + return node - @staticmethod - def filter(node, type, index=0): - '''Finds a sub-node of the given type, specified by index''' - i = 0 - for node in node['nodes']: - if node['type'] == type: - if i == index: - return node - i += 1 - return None + @staticmethod + def filter(node, type, index=0): + """Finds a sub-node of the given type, specified by index""" + i = 0 + for node in node["nodes"]: + if node["type"] == type: + if i == index: + return node + i += 1 + return None - def __init__(self): - HTMLParser.__init__(self) - self.root = PageParser.new_node(None) - self.stack = [self.root] - self.indent = 0 + def __init__(self): + HTMLParser.__init__(self) + self.root = PageParser.new_node(None) + self.stack = [self.root] + self.indent = 0 - def get_root_node(self): - '''After parsing, returns the abstract root node (which contains the html node)''' - return self.root + def get_root_node(self): + """After parsing, returns the abstract root node (which contains the html 
node)""" + return self.root - def handle_starttag(self, tag, attrs): - '''Inherited - called when a start tag is found''' - if tag in PageParser.banlist(): - return - element = PageParser.new_node(tag) - for (k, v) in attrs: - element['attrs'][k] = v - self.stack[-1]['nodes'].append(element) - self.stack.append(element) + def handle_starttag(self, tag, attrs): + """Inherited - called when a start tag is found""" + if tag in PageParser.banlist(): + return + element = PageParser.new_node(tag) + for (k, v) in attrs: + element["attrs"][k] = v + self.stack[-1]["nodes"].append(element) + self.stack.append(element) - def handle_endtag(self, tag): - '''Inherited - called when an end tag is found''' - if tag in PageParser.banlist(): - return - self.stack.pop() + def handle_endtag(self, tag): + """Inherited - called when an end tag is found""" + if tag in PageParser.banlist(): + return + self.stack.pop() - - def handle_data(self, data): - '''Inherited - called when a data string is found''' - element = self.stack[-1] - element['data'] += data + def handle_data(self, data): + """Inherited - called when a data string is found""" + element = self.stack[-1] + element["data"] += data diff --git a/src/acquisition/twtr/twitter_update.py b/src/acquisition/twtr/twitter_update.py index 5c1f3f45b..80a023f19 100644 --- a/src/acquisition/twtr/twitter_update.py +++ b/src/acquisition/twtr/twitter_update.py @@ -1,4 +1,4 @@ -''' +""" =============== === Purpose === =============== @@ -49,7 +49,7 @@ * Small documentation update 2015-05-22 * Original version -''' +""" # third party import mysql.connector @@ -60,46 +60,55 @@ def run(): - # connect to the database - u, p = secrets.db.epi - cnx = mysql.connector.connect(user=u, password=p, database='epidata') - cur = cnx.cursor() - - def get_num_rows(): - cur.execute('SELECT count(1) `num` FROM `twitter`') - for (num,) in cur: - pass - return num - - # check from 7 days preceeding the last date with data through yesterday (healthtweets.org 404's if today's date is part of the range) - cur.execute('SELECT date_sub(max(`date`), INTERVAL 7 DAY) `date1`, date_sub(date(now()), INTERVAL 1 DAY) `date2` FROM `twitter`') - for (date1, date2) in cur: - date1, date2 = date1.strftime('%Y-%m-%d'), date2.strftime('%Y-%m-%d') - print('Checking dates between %s and %s...'%(date1, date2)) - - # keep track of how many rows were added - rows_before = get_num_rows() - - # check healthtweets.org for new and/or revised data - ht = HealthTweets(*secrets.healthtweets.login) - sql = 'INSERT INTO `twitter` (`date`, `state`, `num`, `total`) VALUES (%s, %s, %s, %s) ON DUPLICATE KEY UPDATE `num` = %s, `total` = %s' - total_rows = 0 - for state in sorted(HealthTweets.STATE_CODES.keys()): - values = ht.get_values(state, date1, date2) - for date in sorted(list(values.keys())): - sql_data = (date, state, values[date]['num'], values[date]['total'], values[date]['num'], values[date]['total']) - cur.execute(sql, sql_data) - total_rows += 1 - - # keep track of how many rows were added - rows_after = get_num_rows() - print('Inserted %d/%d row(s)'%(rows_after - rows_before, total_rows)) - - # cleanup - cur.close() - cnx.commit() - cnx.close() - - -if __name__ == '__main__': - run() + # connect to the database + u, p = secrets.db.epi + cnx = mysql.connector.connect(user=u, password=p, database="epidata") + cur = cnx.cursor() + + def get_num_rows(): + cur.execute("SELECT count(1) `num` FROM `twitter`") + for (num,) in cur: + pass + return num + + # check from 7 days preceeding the last date with data through 
yesterday (healthtweets.org 404's if today's date is part of the range) + cur.execute( + "SELECT date_sub(max(`date`), INTERVAL 7 DAY) `date1`, date_sub(date(now()), INTERVAL 1 DAY) `date2` FROM `twitter`" + ) + for (date1, date2) in cur: + date1, date2 = date1.strftime("%Y-%m-%d"), date2.strftime("%Y-%m-%d") + print(f"Checking dates between {date1} and {date2}...") + + # keep track of how many rows were added + rows_before = get_num_rows() + + # check healthtweets.org for new and/or revised data + ht = HealthTweets(*secrets.healthtweets.login) + sql = "INSERT INTO `twitter` (`date`, `state`, `num`, `total`) VALUES (%s, %s, %s, %s) ON DUPLICATE KEY UPDATE `num` = %s, `total` = %s" + total_rows = 0 + for state in sorted(HealthTweets.STATE_CODES.keys()): + values = ht.get_values(state, date1, date2) + for date in sorted(list(values.keys())): + sql_data = ( + date, + state, + values[date]["num"], + values[date]["total"], + values[date]["num"], + values[date]["total"], + ) + cur.execute(sql, sql_data) + total_rows += 1 + + # keep track of how many rows were added + rows_after = get_num_rows() + print(f"Inserted {int(rows_after - rows_before)}/{int(total_rows)} row(s)") + + # cleanup + cur.close() + cnx.commit() + cnx.close() + + +if __name__ == "__main__": + run() diff --git a/src/acquisition/wiki/wiki.py b/src/acquisition/wiki/wiki.py index 602e21102..c57582918 100644 --- a/src/acquisition/wiki/wiki.py +++ b/src/acquisition/wiki/wiki.py @@ -1,112 +1,112 @@ """ -=============== -=== Purpose === -=============== - -Wrapper for the entire wiki data collection process: - 1. Uses wiki_update.py to fetch metadata for new access logs - 2. Uses wiki_download.py to download the access logs - 3. Uses wiki_extract.py to store article access counts - +=============== +=== Purpose === +=============== + +Wrapper for the entire wiki data collection process: + 1. Uses wiki_update.py to fetch metadata for new access logs + 2. Uses wiki_download.py to download the access logs + 3. Uses wiki_extract.py to store article access counts + See also: master.php - - -======================= -=== Data Dictionary === -======================= - -`wiki_raw` is a staging table where extracted access log data is stored for -further processing. When wiki_update.py finds a new log, it saves the name and -hash to this table, with a status of 0. This table is read by master.php, which -then hands out "jobs" (independently and in parallel) to wiki_download.py. -After wiki_download.py downloads the log and extracts the counts, it submits -the data (as JSON) to master.php, which then stores the "raw" JSON counts in -this table. 
-+----------+---------------+------+-----+---------+----------------+ -| Field | Type | Null | Key | Default | Extra | -+----------+---------------+------+-----+---------+----------------+ -| id | int(11) | NO | PRI | NULL | auto_increment | -| name | varchar(64) | NO | UNI | NULL | | -| hash | char(32) | NO | | NULL | | -| status | int(11) | NO | MUL | 0 | | -| size | int(11) | YES | | NULL | | -| datetime | datetime | YES | | NULL | | -| worker | varchar(256) | YES | | NULL | | -| elapsed | float | YES | | NULL | | -| data | varchar(2048) | YES | | NULL | | -+----------+---------------+------+-----+---------+----------------+ -id: unique identifier for each record -name: name of the access log -hash: md5 hash of the file, as reported by the dumps site (all zeroes if no - hash is provided) -status: the status of the job, using the following values: - 0: queued for download - 1: download in progress - 2: queued for extraction - 3: extracted to `wiki` table - (any negative value indicates failure) -size: the size, in bytes, of the downloaded file -datetime: the timestamp of the most recent status update -worker: name (user@hostname) of the machine working on the job -elapsed: time, in seconds, taken to complete the job -data: a JSON string containing counts for selected articles in the access log - -`wiki` is the table where access counts are stored (parsed from wiki_raw). The -"raw" JSON counts are parsed by wiki_extract.py and stored directly in this -table. -+----------+-------------+------+-----+---------+----------------+ -| Field | Type | Null | Key | Default | Extra | -+----------+-------------+------+-----+---------+----------------+ -| id | int(11) | NO | PRI | NULL | auto_increment | -| datetime | datetime | NO | MUL | NULL | | -| article | varchar(64) | NO | MUL | NULL | | -| count | int(11) | NO | | NULL | | -+----------+-------------+------+-----+---------+----------------+ -id: unique identifier for each record -datetime: UTC timestamp (rounded to the nearest hour) of article access -article: name of the article -count: number of times the article was accessed in the hour - -`wiki_meta` is a metadata table for this dataset. It contains pre-calculated -date and epiweeks fields, and more importantly, the total number of English -article hits (denominator) for each `datetime` in the `wiki` table. This table -is populated in parallel with `wiki` by the wiki_extract.py script. -+----------+----------+------+-----+---------+----------------+ -| Field | Type | Null | Key | Default | Extra | -+----------+----------+------+-----+---------+----------------+ -| id | int(11) | NO | PRI | NULL | auto_increment | -| datetime | datetime | NO | UNI | NULL | | -| date | date | NO | | NULL | | -| epiweek | int(11) | NO | | NULL | | -| total | int(11) | NO | | NULL | | -+----------+----------+------+-----+---------+----------------+ -id: unique identifier for each record -datetime: UTC timestamp (rounded to the nearest hour) of article access -date: the date portion of `datetime` -epiweek: the year and week containing `datetime` -total: total number of English article hits in the hour - - -================= -=== Changelog === -================= - + + +======================= +=== Data Dictionary === +======================= + +`wiki_raw` is a staging table where extracted access log data is stored for +further processing. When wiki_update.py finds a new log, it saves the name and +hash to this table, with a status of 0. 
This table is read by master.php, which +then hands out "jobs" (independently and in parallel) to wiki_download.py. +After wiki_download.py downloads the log and extracts the counts, it submits +the data (as JSON) to master.php, which then stores the "raw" JSON counts in +this table. ++----------+---------------+------+-----+---------+----------------+ +| Field | Type | Null | Key | Default | Extra | ++----------+---------------+------+-----+---------+----------------+ +| id | int(11) | NO | PRI | NULL | auto_increment | +| name | varchar(64) | NO | UNI | NULL | | +| hash | char(32) | NO | | NULL | | +| status | int(11) | NO | MUL | 0 | | +| size | int(11) | YES | | NULL | | +| datetime | datetime | YES | | NULL | | +| worker | varchar(256) | YES | | NULL | | +| elapsed | float | YES | | NULL | | +| data | varchar(2048) | YES | | NULL | | ++----------+---------------+------+-----+---------+----------------+ +id: unique identifier for each record +name: name of the access log +hash: md5 hash of the file, as reported by the dumps site (all zeroes if no + hash is provided) +status: the status of the job, using the following values: + 0: queued for download + 1: download in progress + 2: queued for extraction + 3: extracted to `wiki` table + (any negative value indicates failure) +size: the size, in bytes, of the downloaded file +datetime: the timestamp of the most recent status update +worker: name (user@hostname) of the machine working on the job +elapsed: time, in seconds, taken to complete the job +data: a JSON string containing counts for selected articles in the access log + +`wiki` is the table where access counts are stored (parsed from wiki_raw). The +"raw" JSON counts are parsed by wiki_extract.py and stored directly in this +table. ++----------+-------------+------+-----+---------+----------------+ +| Field | Type | Null | Key | Default | Extra | ++----------+-------------+------+-----+---------+----------------+ +| id | int(11) | NO | PRI | NULL | auto_increment | +| datetime | datetime | NO | MUL | NULL | | +| article | varchar(64) | NO | MUL | NULL | | +| count | int(11) | NO | | NULL | | ++----------+-------------+------+-----+---------+----------------+ +id: unique identifier for each record +datetime: UTC timestamp (rounded to the nearest hour) of article access +article: name of the article +count: number of times the article was accessed in the hour + +`wiki_meta` is a metadata table for this dataset. It contains pre-calculated +date and epiweeks fields, and more importantly, the total number of English +article hits (denominator) for each `datetime` in the `wiki` table. This table +is populated in parallel with `wiki` by the wiki_extract.py script. 
++----------+----------+------+-----+---------+----------------+ +| Field | Type | Null | Key | Default | Extra | ++----------+----------+------+-----+---------+----------------+ +| id | int(11) | NO | PRI | NULL | auto_increment | +| datetime | datetime | NO | UNI | NULL | | +| date | date | NO | | NULL | | +| epiweek | int(11) | NO | | NULL | | +| total | int(11) | NO | | NULL | | ++----------+----------+------+-----+---------+----------------+ +id: unique identifier for each record +datetime: UTC timestamp (rounded to the nearest hour) of article access +date: the date portion of `datetime` +epiweek: the year and week containing `datetime` +total: total number of English article hits in the hour + + +================= +=== Changelog === +================= + 2017-02-24 * secrets and small improvements 2016-08-14 * Increased job limit (6 -> 12) (pageviews files are ~2x smaller) -2015-08-26 +2015-08-26 * Reduced job limit (8 -> 6) -2015-08-14 +2015-08-14 * Reduced job limit (10 -> 8) -2015-08-11 +2015-08-11 + New table `wiki_meta` -2015-05-22 +2015-05-22 * Updated status codes for `wiki_raw` table -2015-05-21 +2015-05-21 * Original version """ - + # first party from . import wiki_update from . import wiki_download @@ -115,31 +115,27 @@ def main(): - # step 1: find new access logs (aka "jobs") - print('looking for new jobs...') - try: - wiki_update.run() - except: - print('wiki_update failed') - - # step 2: run a few jobs - print('running jobs...') - try: - wiki_download.run( - secrets.wiki.hmac, - download_limit=1024 * 1024 * 1024, - job_limit=12 - ) - except: - print('wiki_download failed') - - # step 3: extract counts from the staging data - print('extracting counts...') - try: - wiki_extract.run(job_limit=100) - except: - print('wiki_extract failed') - - -if __name__ == '__main__': - main() + # step 1: find new access logs (aka "jobs") + print("looking for new jobs...") + try: + wiki_update.run() + except: + print("wiki_update failed") + + # step 2: run a few jobs + print("running jobs...") + try: + wiki_download.run(secrets.wiki.hmac, download_limit=1024 * 1024 * 1024, job_limit=12) + except: + print("wiki_download failed") + + # step 3: extract counts from the staging data + print("extracting counts...") + try: + wiki_extract.run(job_limit=100) + except: + print("wiki_extract failed") + + +if __name__ == "__main__": + main() diff --git a/src/acquisition/wiki/wiki_download.py b/src/acquisition/wiki/wiki_download.py index 1a01b7f8e..6192eab02 100644 --- a/src/acquisition/wiki/wiki_download.py +++ b/src/acquisition/wiki/wiki_download.py @@ -27,16 +27,16 @@ """ # python 2 and 3 -from __future__ import print_function import sys + if sys.version_info.major == 2: - # python 2 libraries - from urllib import urlencode - from urllib2 import urlopen + # python 2 libraries + from urllib import urlencode + from urllib2 import urlopen else: - # python 3 libraries - from urllib.parse import urlencode - from urllib.request import urlopen + # python 3 libraries + from urllib.parse import urlencode + from urllib.request import urlopen # common libraries import argparse @@ -53,234 +53,302 @@ VERSION = 10 -MASTER_URL = 'https://delphi.cmu.edu/~automation/public/wiki/master.php' +MASTER_URL = "https://delphi.cmu.edu/~automation/public/wiki/master.php" + def text(data_string): - return str(data_string.decode('utf-8')) + return str(data_string.decode("utf-8")) def data(text_string): - if sys.version_info.major == 2: - return text_string - else: - return bytes(text_string, 'utf-8') + if 
sys.version_info.major == 2: + return text_string + else: + return bytes(text_string, "utf-8") def get_hmac_sha256(key, msg): - key_bytes, msg_bytes = key.encode('utf-8'), msg.encode('utf-8') - return hmac.new(key_bytes, msg_bytes, hashlib.sha256).hexdigest() + key_bytes, msg_bytes = key.encode("utf-8"), msg.encode("utf-8") + return hmac.new(key_bytes, msg_bytes, hashlib.sha256).hexdigest() def extract_article_counts(filename, language, articles, debug_mode): - """ - Support multiple languages ('en' | 'es' | 'pt') - Running time optimized to O(M), which means only need to scan the whole file once - :param filename: - :param language: Different languages such as 'en', 'es', and 'pt' - :param articles: - :param debug_mode: - :return: - """ - counts = {} - articles_set = set(map(lambda x: x.lower(), articles)) - total = 0 - with open(filename, "r", encoding="utf8") as f: - for line in f: - content = line.strip().split() - if len(content) != 4: - print('unexpected article format: {0}'.format(line)) - continue - article_title = content[1].lower() - article_count = int(content[2]) - if content[0] == language: - total += article_count - if content[0] == language and article_title in articles_set: - if debug_mode: - print("Find article {0}: {1}".format(article_title, line)) - counts[article_title] = article_count - if debug_mode: - print("Total number of counts for language {0} is {1}".format(language, total)) - counts['total'] = total - return counts + """ + Support multiple languages ('en' | 'es' | 'pt') + Running time optimized to O(M), which means only need to scan the whole file once + :param filename: + :param language: Different languages such as 'en', 'es', and 'pt' + :param articles: + :param debug_mode: + :return: + """ + counts = {} + articles_set = set(map(lambda x: x.lower(), articles)) + total = 0 + with open(filename, encoding="utf8") as f: + for line in f: + content = line.strip().split() + if len(content) != 4: + print(f"unexpected article format: {line}") + continue + article_title = content[1].lower() + article_count = int(content[2]) + if content[0] == language: + total += article_count + if content[0] == language and article_title in articles_set: + if debug_mode: + print(f"Find article {article_title}: {line}") + counts[article_title] = article_count + if debug_mode: + print(f"Total number of counts for language {language} is {total}") + counts["total"] = total + return counts def extract_article_counts_orig(articles, debug_mode): - """ - The original method which extracts article counts by shell command grep (only support en articles). - As it is difficult to deal with other languages (utf-8 encoding), we choose to use python read files. - Another things is that it is slower to go over the whole file once and once again, the time complexity is O(NM), - where N is the number of articles and M is the lines in the file - In our new implementation extract_article_counts(), the time complexity is O(M), and it can cope with utf8 encoding - :param articles: - :param debug_mode: - :return: - """ - counts = {} - for article in articles: - if debug_mode: - print(' %s' % (article)) + """ + The original method which extracts article counts by shell command grep (only support en articles). + As it is difficult to deal with other languages (utf-8 encoding), we choose to use python read files. 
+ Another things is that it is slower to go over the whole file once and once again, the time complexity is O(NM), + where N is the number of articles and M is the lines in the file + In our new implementation extract_article_counts(), the time complexity is O(M), and it can cope with utf8 encoding + :param articles: + :param debug_mode: + :return: + """ + counts = {} + for article in articles: + if debug_mode: + print(f" {article}") + out = text( + subprocess.check_output( + f'LC_ALL=C grep -a -i "^en {article.lower()} " raw2 | cat', shell=True + ) + ).strip() + count = 0 + if len(out) > 0: + for line in out.split("\n"): + fields = line.split() + if len(fields) != 4: + print(f"unexpected article format: [{line}]") + else: + count += int(fields[2]) + # print ' %4d %s'%(count, article) + counts[article.lower()] = count + if debug_mode: + print(f" {int(count)}") + print("getting total count...") out = text( - subprocess.check_output('LC_ALL=C grep -a -i "^en %s " raw2 | cat' % (article.lower()), shell=True)).strip() - count = 0 - if len(out) > 0: - for line in out.split('\n'): - fields = line.split() - if len(fields) != 4: - print('unexpected article format: [%s]' % (line)) - else: - count += int(fields[2]) - # print ' %4d %s'%(count, article) - counts[article.lower()] = count + subprocess.check_output( + 'cat raw2 | LC_ALL=C grep -a -i "^en " | cut -d" " -f 3 | awk \'{s+=$1} END {printf "%.0f", s}\'', + shell=True, + ) + ) + total = int(out) if debug_mode: - print(' %d' % (count)) - print('getting total count...') - out = text(subprocess.check_output( - 'cat raw2 | LC_ALL=C grep -a -i "^en " | cut -d" " -f 3 | awk \'{s+=$1} END {printf "%.0f", s}\'', shell=True)) - total = int(out) - if debug_mode: - print(total) - counts['total'] = total - return counts + print(total) + counts["total"] = total + return counts def run(secret, download_limit=None, job_limit=None, sleep_time=1, job_type=0, debug_mode=False): - worker = text(subprocess.check_output("echo `whoami`@`hostname`", shell=True)).strip() - print('this is [%s]'%(worker)) - if debug_mode: - print('*** running in debug mode ***') - - total_download = 0 - passed_jobs = 0 - failed_jobs = 0 - while (download_limit is None or total_download < download_limit) and (job_limit is None or (passed_jobs + failed_jobs) < job_limit): - try: - time_start = datetime.datetime.now() - req = urlopen(MASTER_URL + '?get=x&type=%s'%(job_type)) - code = req.getcode() - if code != 200: - if code == 201: - print('no jobs available') - if download_limit is None and job_limit is None: - time.sleep(60) - continue - else: - print('nothing to do, exiting') - return - else: - raise Exception('server response code (get) was %d'%(code)) - # Make the code compatible with mac os system - if platform == "darwin": - job_content = text(req.readlines()[1]) - else: - job_content = text(req.readlines()[0]) - if job_content == 'no jobs': - print('no jobs available') - if download_limit is None and job_limit is None: - time.sleep(60) - continue - else: - print('nothing to do, exiting') - return - job = json.loads(job_content) - print('received job [%d|%s]'%(job['id'], job['name'])) - # updated parsing for pageviews - maybe use a regex in the future - #year, month = int(job['name'][11:15]), int(job['name'][15:17]) - year, month = int(job['name'][10:14]), int(job['name'][14:16]) - #print 'year=%d | month=%d'%(year, month) - url = 'https://dumps.wikimedia.org/other/pageviews/%d/%d-%02d/%s'%(year, year, month, job['name']) - print('downloading file [%s]...'%(url)) - 
subprocess.check_call('curl -s %s > raw.gz'%(url), shell=True) - print('checking file size...') - # Make the code cross-platfrom, so use python to get the size of the file - # size = int(text(subprocess.check_output('ls -l raw.gz | cut -d" " -f 5', shell=True))) - size = os.stat("raw.gz").st_size - if debug_mode: - print(size) - total_download += size - if job['hash'] != '00000000000000000000000000000000': - print('checking hash...') - out = text(subprocess.check_output('md5sum raw.gz', shell=True)) - result = out[0:32] - if result != job['hash']: - raise Exception('wrong hash [expected %s, got %s]'%(job['hash'], result)) - if debug_mode: - print(result) - print('decompressing...') - subprocess.check_call('gunzip -f raw.gz', shell=True) - #print 'converting case...' - #subprocess.check_call('cat raw | tr "[:upper:]" "[:lower:]" > raw2', shell=True) - #subprocess.check_call('rm raw', shell=True) - subprocess.check_call('mv raw raw2', shell=True) - print('extracting article counts...') - - # Use python to read the file and extract counts, if you want to use the original shell method, please use - counts = {} - for language in wiki_util.Articles.available_languages: - lang2articles = {'en': wiki_util.Articles.en_articles, 'es': wiki_util.Articles.es_articles, 'pt': wiki_util.Articles.pt_articles} - articles = lang2articles[language] - articles = sorted(articles) - if debug_mode: - print("Language is {0} and target articles are {1}".format(language, articles)) - temp_counts = extract_article_counts("raw2", language, articles, debug_mode) - counts[language] = temp_counts - - if not debug_mode: - print('deleting files...') - subprocess.check_call('rm raw2', shell=True) - print('saving results...') - time_stop = datetime.datetime.now() - result = { - 'id': job['id'], - 'size': size, - 'data': json.dumps(counts), - 'worker': worker, - 'elapsed': (time_stop - time_start).total_seconds(), - } - payload = json.dumps(result) - hmac_str = get_hmac_sha256(secret, payload) - if debug_mode: - print(' hmac: %s' % hmac_str) - post_data = urlencode({'put': payload, 'hmac': hmac_str}) - req = urlopen(MASTER_URL, data=data(post_data)) - code = req.getcode() - if code != 200: - raise Exception('server response code (put) was %d'%(code)) - print('done! 
(dl=%d)'%(total_download)) - passed_jobs += 1 - except Exception as ex: - print('***** Caught Exception: %s *****'%(str(ex))) - failed_jobs += 1 - time.sleep(30) - print('passed=%d | failed=%d | total=%d'%(passed_jobs, failed_jobs, passed_jobs + failed_jobs)) - time.sleep(sleep_time) - - if download_limit is not None and total_download >= download_limit: - print('download limit has been reached [%d >= %d]'%(total_download, download_limit)) - if job_limit is not None and (passed_jobs + failed_jobs) >= job_limit: - print('job limit has been reached [%d >= %d]'%(passed_jobs + failed_jobs, job_limit)) + worker = text(subprocess.check_output("echo `whoami`@`hostname`", shell=True)).strip() + print(f"this is [{worker}]") + if debug_mode: + print("*** running in debug mode ***") + + total_download = 0 + passed_jobs = 0 + failed_jobs = 0 + while (download_limit is None or total_download < download_limit) and ( + job_limit is None or (passed_jobs + failed_jobs) < job_limit + ): + try: + time_start = datetime.datetime.now() + req = urlopen(MASTER_URL + f"?get=x&type={job_type}") + code = req.getcode() + if code != 200: + if code == 201: + print("no jobs available") + if download_limit is None and job_limit is None: + time.sleep(60) + continue + else: + print("nothing to do, exiting") + return + else: + raise Exception(f"server response code (get) was {int(code)}") + # Make the code compatible with mac os system + if platform == "darwin": + job_content = text(req.readlines()[1]) + else: + job_content = text(req.readlines()[0]) + if job_content == "no jobs": + print("no jobs available") + if download_limit is None and job_limit is None: + time.sleep(60) + continue + else: + print("nothing to do, exiting") + return + job = json.loads(job_content) + print(f"received job [{int(job['id'])}|{job['name']}]") + # updated parsing for pageviews - maybe use a regex in the future + # year, month = int(job['name'][11:15]), int(job['name'][15:17]) + year, month = int(job["name"][10:14]), int(job["name"][14:16]) + # print 'year=%d | month=%d'%(year, month) + url = ( + "https://dumps.wikimedia.org/other/" + f"pageviews/{year}/{year}-{month:02d}/{job['name']}" + ) + print(f"downloading file [{url}]...") + subprocess.check_call(f"curl -s {url} > raw.gz", shell=True) + print("checking file size...") + # Make the code cross-platfrom, so use python to get the size of the file + # size = int(text(subprocess.check_output('ls -l raw.gz | cut -d" " -f 5', shell=True))) + size = os.stat("raw.gz").st_size + if debug_mode: + print(size) + total_download += size + if job["hash"] != "00000000000000000000000000000000": + print("checking hash...") + out = text(subprocess.check_output("md5sum raw.gz", shell=True)) + result = out[0:32] + if result != job["hash"]: + raise Exception(f"wrong hash [expected {job['hash']}, got {result}]") + if debug_mode: + print(result) + print("decompressing...") + subprocess.check_call("gunzip -f raw.gz", shell=True) + # print 'converting case...' 
+ # subprocess.check_call('cat raw | tr "[:upper:]" "[:lower:]" > raw2', shell=True) + # subprocess.check_call('rm raw', shell=True) + subprocess.check_call("mv raw raw2", shell=True) + print("extracting article counts...") + + # Use python to read the file and extract counts, if you want to use the original shell method, please use + counts = {} + for language in wiki_util.Articles.available_languages: + lang2articles = { + "en": wiki_util.Articles.en_articles, + "es": wiki_util.Articles.es_articles, + "pt": wiki_util.Articles.pt_articles, + } + articles = lang2articles[language] + articles = sorted(articles) + if debug_mode: + print(f"Language is {language} and target articles are {articles}") + temp_counts = extract_article_counts("raw2", language, articles, debug_mode) + counts[language] = temp_counts + + if not debug_mode: + print("deleting files...") + subprocess.check_call("rm raw2", shell=True) + print("saving results...") + time_stop = datetime.datetime.now() + result = { + "id": job["id"], + "size": size, + "data": json.dumps(counts), + "worker": worker, + "elapsed": (time_stop - time_start).total_seconds(), + } + payload = json.dumps(result) + hmac_str = get_hmac_sha256(secret, payload) + if debug_mode: + print(f" hmac: {hmac_str}") + post_data = urlencode({"put": payload, "hmac": hmac_str}) + req = urlopen(MASTER_URL, data=data(post_data)) + code = req.getcode() + if code != 200: + raise Exception(f"server response code (put) was {int(code)}") + print(f"done! (dl={int(total_download)})") + passed_jobs += 1 + except Exception as ex: + print(f"***** Caught Exception: {str(ex)} *****") + failed_jobs += 1 + time.sleep(30) + print( + "passed=%d | failed=%d | total=%d" + % (passed_jobs, failed_jobs, passed_jobs + failed_jobs) + ) + time.sleep(sleep_time) + + if download_limit is not None and total_download >= download_limit: + print(f"download limit has been reached [{int(total_download)} >= {int(download_limit)}]") + if job_limit is not None and (passed_jobs + failed_jobs) >= job_limit: + print(f"job limit has been reached [{int(passed_jobs + failed_jobs)} >= {int(job_limit)}]") def main(): - # version info - print('version', VERSION) - - # args and usage - parser = argparse.ArgumentParser() - parser.add_argument('secret', type=str, help='hmac secret key') - parser.add_argument('-b', '--blimit', action='store', type=int, default=None, help='download limit, in bytes') - parser.add_argument('-j', '--jlimit', action='store', type=int, default=None, help='job limit') - parser.add_argument('-s', '--sleep', action='store', type=int, default=1, help='seconds to sleep between each job') - parser.add_argument('-t', '--type', action='store', type=int, default=0, help='type of job') - parser.add_argument('-d', '--debug', action='store_const', const=True, default=False, help='enable debug mode') - args = parser.parse_args() - - # runtime options - secret, download_limit, job_limit, sleep_time, job_type, debug_mode = args.secret, args.blimit, args.jlimit, args.sleep, args.type, args.debug - - # run - run(secret, download_limit, job_limit, sleep_time, job_type, debug_mode) - - -if __name__ == '__main__': - main() + # version info + print("version", VERSION) + + # args and usage + parser = argparse.ArgumentParser() + # fmt: off + parser.add_argument( + "secret", + type=str, + help="hmac secret key" + ) + parser.add_argument( + "-b", + "--blimit", + action="store", + type=int, + default=None, + help="download limit, in bytes" + ) + parser.add_argument( + "-j", + "--jlimit", + action="store", + 
type=int, + default=None, + help="job limit" + ) + parser.add_argument( + "-s", + "--sleep", + action="store", + type=int, + default=1, + help="seconds to sleep between each job" + ) + parser.add_argument( + "-t", + "--type", + action="store", + type=int, + default=0, + help="type of job" + ) + parser.add_argument( + "-d", + "--debug", + action="store_const", + const=True, + default=False, + help="enable debug mode" + ) + # fmt: on + args = parser.parse_args() + + # runtime options + secret, download_limit, job_limit, sleep_time, job_type, debug_mode = ( + args.secret, + args.blimit, + args.jlimit, + args.sleep, + args.type, + args.debug, + ) + + # run + run(secret, download_limit, job_limit, sleep_time, job_type, debug_mode) + + +if __name__ == "__main__": + main() diff --git a/src/acquisition/wiki/wiki_extract.py b/src/acquisition/wiki/wiki_extract.py index 839d7d6dc..718a64c20 100644 --- a/src/acquisition/wiki/wiki_extract.py +++ b/src/acquisition/wiki/wiki_extract.py @@ -35,74 +35,96 @@ def floor_timestamp(timestamp): - return datetime(timestamp.year, timestamp.month, timestamp.day, timestamp.hour) + return datetime(timestamp.year, timestamp.month, timestamp.day, timestamp.hour) def ceil_timestamp(timestamp): - return floor_timestamp(timestamp) + timedelta(hours=1) + return floor_timestamp(timestamp) + timedelta(hours=1) def round_timestamp(timestamp): - before = floor_timestamp(timestamp) - after = ceil_timestamp(timestamp) - if (timestamp - before) < (after - timestamp): - return before - else: - return after + before = floor_timestamp(timestamp) + after = ceil_timestamp(timestamp) + if (timestamp - before) < (after - timestamp): + return before + else: + return after def get_timestamp(name): - # new parsing for pageviews compared to pagecounts - maybe switch to regex in the future - #return datetime(int(name[11:15]), int(name[15:17]), int(name[17:19]), int(name[20:22]), int(name[22:24]), int(name[24:26])) - return datetime(int(name[10:14]), int(name[14:16]), int(name[16:18]), int(name[19:21]), int(name[21:23]), int(name[23:25])) + # new parsing for pageviews compared to pagecounts - maybe switch to regex in the future + # return datetime(int(name[11:15]), int(name[15:17]), int(name[17:19]), int(name[20:22]), int(name[22:24]), int(name[24:26])) + return datetime( + int(name[10:14]), + int(name[14:16]), + int(name[16:18]), + int(name[19:21]), + int(name[21:23]), + int(name[23:25]), + ) def run(job_limit=100): - # connect to the database - u, p = secrets.db.epi - cnx = mysql.connector.connect(user=u, password=p, database='epidata') - cur = cnx.cursor() - - # # Some preparation for utf-8, and it is a temporary trick solution. 
The real solution should change those char set and collation encoding to utf8 permanently - # cur.execute("SET NAMES utf8;") - # cur.execute("SET CHARACTER SET utf8;") - # # I print SHOW SESSION VARIABLES LIKE 'character\_set\_%'; and SHOW SESSION VARIABLES LIKE 'collation\_%'; on my local computer - # cur.execute("SET character_set_client=utf8mb4;") - # cur.execute("SET character_set_connection=utf8mb4;") - # cur.execute("SET character_set_database=utf8;") - # cur.execute("SET character_set_results=utf8mb4;") - # cur.execute("SET character_set_server=utf8;") - # cur.execute("SET collation_connection=utf8mb4_general_ci;") - # cur.execute("SET collation_database=utf8_general_ci;") - # cur.execute("SET collation_server=utf8_general_ci;") - - # find jobs that are queued for extraction - cur.execute('SELECT `id`, `name`, `data` FROM `wiki_raw` WHERE `status` = 2 ORDER BY `name` ASC LIMIT %s', (job_limit,)) - jobs = [] - for (id, name, data_str) in cur: - jobs.append((id, name, json.loads(data_str))) - print('Processing data from %d jobs'%(len(jobs))) - - # get the counts from the json object and insert into (or update) the database - # Notice that data_collect contains data with different languages - for (id, name, data_collect) in jobs: - print('processing job [%d|%s]...'%(id, name)) - timestamp = round_timestamp(get_timestamp(name)) - for language in data_collect.keys(): - data = data_collect[language] - for article in sorted(data.keys()): - count = data[article] - cur.execute('INSERT INTO `wiki` (`datetime`, `article`, `count`, `language`) VALUES (%s, %s, %s, %s) ON DUPLICATE KEY UPDATE `count` = `count` + %s', (str(timestamp), article.encode('utf-8').decode('latin-1'), count, language, count)) - if article == 'total': - cur.execute('INSERT INTO `wiki_meta` (`datetime`, `date`, `epiweek`, `total`, `language`) VALUES (%s, date(%s), yearweek(%s, 6), %s, %s) ON DUPLICATE KEY UPDATE `total` = `total` + %s', (str(timestamp), str(timestamp), str(timestamp), count, language, count)) - # update the job - cur.execute('UPDATE `wiki_raw` SET `status` = 3 WHERE `id` = %s', (id,)) - - # cleanup - cur.close() - cnx.commit() - cnx.close() - - -if __name__ == '__main__': - run() + # connect to the database + u, p = secrets.db.epi + cnx = mysql.connector.connect(user=u, password=p, database="epidata") + cur = cnx.cursor() + + # # Some preparation for utf-8, and it is a temporary trick solution. 
The real solution should change those char set and collation encoding to utf8 permanently + # cur.execute("SET NAMES utf8;") + # cur.execute("SET CHARACTER SET utf8;") + # # I print SHOW SESSION VARIABLES LIKE 'character\_set\_%'; and SHOW SESSION VARIABLES LIKE 'collation\_%'; on my local computer + # cur.execute("SET character_set_client=utf8mb4;") + # cur.execute("SET character_set_connection=utf8mb4;") + # cur.execute("SET character_set_database=utf8;") + # cur.execute("SET character_set_results=utf8mb4;") + # cur.execute("SET character_set_server=utf8;") + # cur.execute("SET collation_connection=utf8mb4_general_ci;") + # cur.execute("SET collation_database=utf8_general_ci;") + # cur.execute("SET collation_server=utf8_general_ci;") + + # find jobs that are queued for extraction + cur.execute( + "SELECT `id`, `name`, `data` FROM `wiki_raw` WHERE `status` = 2 ORDER BY `name` ASC LIMIT %s", + (job_limit,), + ) + jobs = [] + for (id, name, data_str) in cur: + jobs.append((id, name, json.loads(data_str))) + print(f"Processing data from {len(jobs)} jobs") + + # get the counts from the json object and insert into (or update) the database + # Notice that data_collect contains data with different languages + for (id, name, data_collect) in jobs: + print(f"processing job [{int(id)}|{name}]...") + timestamp = round_timestamp(get_timestamp(name)) + for language in data_collect.keys(): + data = data_collect[language] + for article in sorted(data.keys()): + count = data[article] + cur.execute( + "INSERT INTO `wiki` (`datetime`, `article`, `count`, `language`) VALUES (%s, %s, %s, %s) ON DUPLICATE KEY UPDATE `count` = `count` + %s", + ( + str(timestamp), + article.encode("utf-8").decode("latin-1"), + count, + language, + count, + ), + ) + if article == "total": + cur.execute( + "INSERT INTO `wiki_meta` (`datetime`, `date`, `epiweek`, `total`, `language`) VALUES (%s, date(%s), yearweek(%s, 6), %s, %s) ON DUPLICATE KEY UPDATE `total` = `total` + %s", + (str(timestamp), str(timestamp), str(timestamp), count, language, count), + ) + # update the job + cur.execute("UPDATE `wiki_raw` SET `status` = 3 WHERE `id` = %s", (id,)) + + # cleanup + cur.close() + cnx.commit() + cnx.close() + + +if __name__ == "__main__": + run() diff --git a/src/acquisition/wiki/wiki_update.py b/src/acquisition/wiki/wiki_update.py index 411544810..a9f240629 100644 --- a/src/acquisition/wiki/wiki_update.py +++ b/src/acquisition/wiki/wiki_update.py @@ -32,87 +32,100 @@ def floor_timestamp(timestamp): - return datetime(timestamp.year, timestamp.month, timestamp.day, timestamp.hour) + return datetime(timestamp.year, timestamp.month, timestamp.day, timestamp.hour) def ceil_timestamp(timestamp): - return floor_timestamp(timestamp) + timedelta(hours=1) + return floor_timestamp(timestamp) + timedelta(hours=1) def round_timestamp(timestamp): - before = floor_timestamp(timestamp) - after = ceil_timestamp(timestamp) - if (timestamp - before) < (after - timestamp): - return before - else: - return after + before = floor_timestamp(timestamp) + after = ceil_timestamp(timestamp) + if (timestamp - before) < (after - timestamp): + return before + else: + return after def get_timestamp(name): - # If the program is cold start (there are no previous names in the table, and the name will be None) - if name is None: - curr = datetime.now() - return datetime(curr.year, curr.month, curr.day, curr.hour, curr.minute, curr.second) - # new parsing for pageviews compared to pagecounts - maybe switch to regex in the future - #return datetime(int(name[11:15]), 
int(name[15:17]), int(name[17:19]), int(name[20:22]), int(name[22:24]), int(name[24:26])) - return datetime(int(name[10:14]), int(name[14:16]), int(name[16:18]), int(name[19:21]), int(name[21:23]), int(name[23:25])) + # If the program is cold start (there are no previous names in the table, and the name will be None) + if name is None: + curr = datetime.now() + return datetime(curr.year, curr.month, curr.day, curr.hour, curr.minute, curr.second) + # new parsing for pageviews compared to pagecounts - maybe switch to regex in the future + # return datetime(int(name[11:15]), int(name[15:17]), int(name[17:19]), int(name[20:22]), int(name[22:24]), int(name[24:26])) + return datetime( + int(name[10:14]), + int(name[14:16]), + int(name[16:18]), + int(name[19:21]), + int(name[21:23]), + int(name[23:25]), + ) def get_manifest(year, month, optional=False): - # unlike pagecounts-raw, pageviews doesn't provide hashes - #url = 'https://dumps.wikimedia.org/other/pagecounts-raw/%d/%d-%02d/md5sums.txt'%(year, year, month) - url = 'https://dumps.wikimedia.org/other/pageviews/%d/%d-%02d/' % (year, year, month) - print('Checking manifest at %s...'%(url)) - response = requests.get(url) - if response.status_code == 200: - #manifest = [line.strip().split() for line in response.text.split('\n') if 'pagecounts' in line] - manifest = [('00000000000000000000000000000000', line[9:37]) for line in response.text.split('\n') if ' max_name: - new_logs[name] = hash - print(' New job: %s [%s]'%(name, hash)) - print('Found %d new job(s)'%(len(new_logs))) - - # store metadata for new jobs - for name in sorted(new_logs.keys()): - cur.execute('INSERT INTO `wiki_raw` (`name`, `hash`) VALUES (%s, %s)', (name, new_logs[name])) - - # cleanup - cur.close() - cnx.commit() - cnx.close() - - -if __name__ == '__main__': - run() + # connect to the database + u, p = secrets.db.epi + cnx = mysql.connector.connect(user=u, password=p, database="epidata") + cur = cnx.cursor() + + # get the most recent job in wiki_raw + # luckily, "pageviews" is lexicographically greater than "pagecounts-raw" + cur.execute("SELECT max(`name`) FROM `wiki_raw`") + for (max_name,) in cur: + pass + print(f"Last known file: {max_name}") + timestamp = get_timestamp(max_name) + + # crawl dumps.wikimedia.org to find more recent access logs + t1, t2 = floor_timestamp(timestamp), ceil_timestamp(timestamp) + manifest = get_manifest(t1.year, t1.month, optional=False) + if t2.month != t1.month: + manifest += get_manifest(t2.year, t2.month, optional=True) + + # find access logs newer than the most recent job + new_logs = {} + for (hash, name) in manifest: + if max_name is None or name > max_name: + new_logs[name] = hash + print(f" New job: {name} [{hash}]") + print(f"Found {len(new_logs)} new job(s)") + + # store metadata for new jobs + for name in sorted(new_logs.keys()): + cur.execute( + "INSERT INTO `wiki_raw` (`name`, `hash`) VALUES (%s, %s)", (name, new_logs[name]) + ) + + # cleanup + cur.close() + cnx.commit() + cnx.close() + + +if __name__ == "__main__": + run() diff --git a/src/acquisition/wiki/wiki_util.py b/src/acquisition/wiki/wiki_util.py index ed3c743bc..55bf3e2ca 100644 --- a/src/acquisition/wiki/wiki_util.py +++ b/src/acquisition/wiki/wiki_util.py @@ -1,159 +1,156 @@ - - - class Articles: # Notice that all languages must be two chars, because that `language` column in table `wiki` is CHAR(2) - available_languages = ['en', 'es', 'pt'] + available_languages = ["en", "es", "pt"] en_articles_flu = [ - 'Influenza_B_virus', - 'Influenza_A_virus', - 'Human_flu', - 
'Influenzavirus_C', - 'Oseltamivir', - 'Influenza', - 'Influenzavirus_A', - 'Influenza_A_virus_subtype_H1N1', - 'Zanamivir', - 'Influenza-like_illness', - 'Common_cold', - 'Sore_throat', - 'Flu_season', - 'Chills', - 'Fever', - 'Influenza_A_virus_subtype_H2N2', - 'Swine_influenza', - 'Shivering', - 'Canine_influenza', - 'Influenza_A_virus_subtype_H3N2', - 'Neuraminidase_inhibitor', - 'Influenza_pandemic', - 'Viral_pneumonia', - 'Influenza_prevention', - 'Influenza_A_virus_subtype_H1N2', - 'Rhinorrhea', - 'Orthomyxoviridae', - 'Nasal_congestion', - 'Gastroenteritis', - 'Rimantadine', - 'Paracetamol', - 'Amantadine', - 'Viral_neuraminidase', - 'Headache', - 'Influenza_vaccine', - 'Vomiting', - 'Cough', - 'Influenza_A_virus_subtype_H5N1', - 'Nausea', - 'Avian_influenza', - 'Influenza_A_virus_subtype_H7N9', - 'Influenza_A_virus_subtype_H10N7', - 'Influenza_A_virus_subtype_H9N2', - 'Hemagglutinin_(influenza)', - 'Influenza_A_virus_subtype_H7N7', - 'Fatigue_(medical)', - 'Myalgia', - 'Influenza_A_virus_subtype_H7N3', - 'Malaise', - 'Equine_influenza', - 'Cat_flu', - 'Influenza_A_virus_subtype_H3N8', - 'Antiviral_drugs', - 'Influenza_A_virus_subtype_H7N2', + "Influenza_B_virus", + "Influenza_A_virus", + "Human_flu", + "Influenzavirus_C", + "Oseltamivir", + "Influenza", + "Influenzavirus_A", + "Influenza_A_virus_subtype_H1N1", + "Zanamivir", + "Influenza-like_illness", + "Common_cold", + "Sore_throat", + "Flu_season", + "Chills", + "Fever", + "Influenza_A_virus_subtype_H2N2", + "Swine_influenza", + "Shivering", + "Canine_influenza", + "Influenza_A_virus_subtype_H3N2", + "Neuraminidase_inhibitor", + "Influenza_pandemic", + "Viral_pneumonia", + "Influenza_prevention", + "Influenza_A_virus_subtype_H1N2", + "Rhinorrhea", + "Orthomyxoviridae", + "Nasal_congestion", + "Gastroenteritis", + "Rimantadine", + "Paracetamol", + "Amantadine", + "Viral_neuraminidase", + "Headache", + "Influenza_vaccine", + "Vomiting", + "Cough", + "Influenza_A_virus_subtype_H5N1", + "Nausea", + "Avian_influenza", + "Influenza_A_virus_subtype_H7N9", + "Influenza_A_virus_subtype_H10N7", + "Influenza_A_virus_subtype_H9N2", + "Hemagglutinin_(influenza)", + "Influenza_A_virus_subtype_H7N7", + "Fatigue_(medical)", + "Myalgia", + "Influenza_A_virus_subtype_H7N3", + "Malaise", + "Equine_influenza", + "Cat_flu", + "Influenza_A_virus_subtype_H3N8", + "Antiviral_drugs", + "Influenza_A_virus_subtype_H7N2", ] en_articles_noro = [ - 'Norovirus', - 'Diarrhea', - 'Dehydration', - 'Gastroenteritis', - 'Vomiting', - 'Abdominal_pain', - 'Nausea', - 'Foodborne_illness', - 'Rotavirus', - 'Fecal–oral_route', - 'Intravenous_therapy', - 'Oral_rehydration_therapy', - 'Shellfish', - 'Caliciviridae', - 'Leaky_scanning', + "Norovirus", + "Diarrhea", + "Dehydration", + "Gastroenteritis", + "Vomiting", + "Abdominal_pain", + "Nausea", + "Foodborne_illness", + "Rotavirus", + "Fecal–oral_route", + "Intravenous_therapy", + "Oral_rehydration_therapy", + "Shellfish", + "Caliciviridae", + "Leaky_scanning", ] en_articles_dengue = [ - 'Dengue_fever', - 'Dengue_virus', - 'Aedes', - 'Aedes_aegypti', - 'Dengue_vaccine', - 'Mosquito', - 'Mosquito-borne_disease', - 'Blood_transfusion', - 'Paracetamol', - 'Fever', - 'Headache', - 'Rhinitis', - 'Flavivirus', - 'Exanthem', - 'Myalgia', - 'Arthralgia', - 'Thrombocytopenia', - 'Hematuria', - 'Nosebleed', - 'Petechia', - 'Nausea', - 'Vomiting', - 'Diarrhea', + "Dengue_fever", + "Dengue_virus", + "Aedes", + "Aedes_aegypti", + "Dengue_vaccine", + "Mosquito", + "Mosquito-borne_disease", + "Blood_transfusion", + "Paracetamol", + 
"Fever", + "Headache", + "Rhinitis", + "Flavivirus", + "Exanthem", + "Myalgia", + "Arthralgia", + "Thrombocytopenia", + "Hematuria", + "Nosebleed", + "Petechia", + "Nausea", + "Vomiting", + "Diarrhea", ] en_articles = list(set(en_articles_flu + en_articles_noro + en_articles_dengue)) es_articles = [ - 'Dengue', - 'Virus_dengue', - 'Aedes', - 'Aedes_aegypti', - 'Culicidae', - 'Transfusión_de_sangre', - 'Paracetamol', - 'Fiebre', - 'Cefalea', - 'Coriza', - 'Flavivirus', - 'Exantema', - 'Mosquito', - 'Mialgia', - 'Artralgia', - 'Trombocitopenia', - 'Hematuria', - 'Epistaxis', - 'Petequia', - 'Náusea', - 'Vómito', - 'Diarrea', + "Dengue", + "Virus_dengue", + "Aedes", + "Aedes_aegypti", + "Culicidae", + "Transfusión_de_sangre", + "Paracetamol", + "Fiebre", + "Cefalea", + "Coriza", + "Flavivirus", + "Exantema", + "Mosquito", + "Mialgia", + "Artralgia", + "Trombocitopenia", + "Hematuria", + "Epistaxis", + "Petequia", + "Náusea", + "Vómito", + "Diarrea", ] pt_articles = [ - 'Dengue', - 'Vírus_da_dengue', - 'Aedes', - 'Aedes_aegypti', - 'Culicidae', - 'Transfusão_de_sangue', - 'Paracetamol', - 'Febre', - 'Cefaleia', - 'Coriza', - 'Flavivírus', - 'Exantema', - 'Mialgia', - 'Artralgia', - 'Trombocitopenia', - 'Hematúria', - 'Epistaxe', - 'Petéquia', - 'Náusea', - 'Vômito', - 'Diarreia', + "Dengue", + "Vírus_da_dengue", + "Aedes", + "Aedes_aegypti", + "Culicidae", + "Transfusão_de_sangue", + "Paracetamol", + "Febre", + "Cefaleia", + "Coriza", + "Flavivírus", + "Exantema", + "Mialgia", + "Artralgia", + "Trombocitopenia", + "Hematúria", + "Epistaxe", + "Petéquia", + "Náusea", + "Vômito", + "Diarreia", ]